From 34b92fcec84b3270fd86c812f78d7ce13ebf7102 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 23 Sep 2020 17:30:41 -0600 Subject: [PATCH 01/14] 5 bash scripts from Gary to create time series Also, a single python script that submits each of the five scripts to the slurm queue on casper. Note that I've modified Gary's original scripts to take case identifier (e.g. 003 or 004) and a single year as command line arguments. The python script sets the default case to 004, but requires users to specify at least a single year. The user can also specify specific scripts to run, and the default is to run all five of them. There is also a "dry-run" option that doesn't actually call sbatch. --- data_reshaping/cice.h1_t13.sh | 106 +++++++++++++++++++++++++++++ data_reshaping/cice.h_t13.sh | 106 +++++++++++++++++++++++++++++ data_reshaping/logs/.gitignore | 2 + data_reshaping/pop.h.nday1_t13.sh | 106 +++++++++++++++++++++++++++++ data_reshaping/pop.h.nyear1_t13.sh | 106 +++++++++++++++++++++++++++++ data_reshaping/pop.h_t13.sh | 106 +++++++++++++++++++++++++++++ data_reshaping/run_all.py | 84 +++++++++++++++++++++++ 7 files changed, 616 insertions(+) create mode 100755 data_reshaping/cice.h1_t13.sh create mode 100755 data_reshaping/cice.h_t13.sh create mode 100644 data_reshaping/logs/.gitignore create mode 100755 data_reshaping/pop.h.nday1_t13.sh create mode 100755 data_reshaping/pop.h.nyear1_t13.sh create mode 100755 data_reshaping/pop.h_t13.sh create mode 100755 data_reshaping/run_all.py diff --git a/data_reshaping/cice.h1_t13.sh b/data_reshaping/cice.h1_t13.sh new file mode 100755 index 0000000..0391557 --- /dev/null +++ b/data_reshaping/cice.h1_t13.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l +# +#SBATCH -n 64 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=16 +#SBATCH -t 24:00:00 +#SBATCH -p dav +#SBATCH -J Cice.h1_t13 +#SBATCH --account=CESM0010 +#SBATCH --mem 100G +#SBATCH -e logs/Cice.h1_t13.err.%J +#SBATCH -o logs/Cice.h1_t13.out.%J +#SBATCH --mail-type=ALL +#SBATCH 
--mail-user=mlevy@ucar.edu +#SBATCH -m block +# +module purge +conda deactivate || echo "conda not loaded" +# +# PARSE COMMAND LINE ARGUMENTS +CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE +YEAR=${2} +echo "Reshaping year ${YEAR} for ${CASE}..." +# +cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin +. activate +# +module load intel/17.0.1 +module load ncarenv +module load ncarcompilers +module load impi +module load netcdf/4.6.1 +module load nco/4.7.4 +module load ncl/6.4.0 +# +HIST=cice.h1 ; export HIST +# +PATH=/glade/p/cesm/postprocessing_dav/cesm-env2/bin:/usr/local/bin:${PATH} ; export PATH +# +NCKS=`which ncks` ; export NCKS +PROCHOST=`hostname`;export PROCHOST +# +BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite +LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE +# +HTYP=`echo $HIST | cut -d'.' -f1` ; export HTYP +case "$HTYP" in + cam2 | cam ) + COMP_NAME=atm ;; + cism ) + COMP_NAME=glc ;; + clm2 ) + COMP_NAME=lnd ;; + pop ) + COMP_NAME=ocn ;; + rtm | mosart ) + COMP_NAME=rof ;; + cice | csim ) + COMP_NAME=ice ;; + * ) + echo "Unable to continue because "$HIST" not known." + exit 1 ;; +esac +# +LOCAL_HIST=${LOCALDSK}/${COMP_NAME}/hist ; export LOCAL_HIST +LOCAL_PROC=${PROCBASE}/${HIST}/proc ; export LOCAL_PROC +CACHEDIR=${LOCAL_PROC}/COMPLETED ; export CACHEDIR +# +VERBOSITY=0 ; export VERBOSITY +PREFIX="${CACHEDIR}/${CASE}.${HIST}." ; export PREFIX +NCFORMAT=netcdf4c ; export NCFORMAT ; export NCFORMAT +# +if [ ! -d $LOCAL_PROC ] ; then + mkdir -p $LOCAL_PROC +fi +if [ ! -d $CACHEDIR ] ; then + mkdir -p $CACHEDIR +fi +# +cd $LOCAL_PROC +ln -s -f $BASEDIR/run_slice2series_dav Transpose_Data +# +rm -f ${CASE}.${HIST}.*nc +if [ ! -f ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} ] ; then + ln -s -f ${LOCAL_HIST}/${CASE}.${HIST}.${YEAR}*nc . 
+ NHISTF=`/bin/ls ${CASE}.${HIST}.${YEAR}*nc | wc -l` + if [ $NHISTF -eq 365 ] ; then + OUTTIME="${YEAR}0101-${YEAR}1231" + SUFFIX=".${OUTTIME}.nc" ; export SUFFIX + echo -n "TS transpose_data start: " ; date + ./Transpose_Data + if [ $? -ne 0 ] ; then + echo "Transpose_Data failed" + exit 1 + fi + echo -n "TS transpose_data end : " ; date + touch ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} + else + echo "File count mismatch on "${CASE}"."${HIST}"."${YEAR}": "${NHISTF}" instead of 365" + fi +fi +# +echo -n "TS COMPLETE: " ; date +# +exit diff --git a/data_reshaping/cice.h_t13.sh b/data_reshaping/cice.h_t13.sh new file mode 100755 index 0000000..89fce89 --- /dev/null +++ b/data_reshaping/cice.h_t13.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l +# +#SBATCH -n 64 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=16 +#SBATCH -t 24:00:00 +#SBATCH -p dav +#SBATCH -J Cice.h_t13 +#SBATCH --account=CESM0010 +#SBATCH --mem 100G +#SBATCH -e logs/Cice.h_t13.err.%J +#SBATCH -o logs/Cice.h_t13.out.%J +#SBATCH --mail-type=ALL +#SBATCH --mail-user=mlevy@ucar.edu +#SBATCH -m block +# +module purge +conda deactivate || echo "conda not loaded" +# +# PARSE COMMAND LINE ARGUMENTS +CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE +YEAR=${2} +echo "Reshaping year ${YEAR} for ${CASE}..." +# +cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin +. activate +# +module load intel/17.0.1 +module load ncarenv +module load ncarcompilers +module load impi +module load netcdf/4.6.1 +module load nco/4.7.4 +module load ncl/6.4.0 +# +HIST=cice.h ; export HIST +# +PATH=/glade/p/cesm/postprocessing_dav/cesm-env2/bin:/usr/local/bin:${PATH} ; export PATH +# +NCKS=`which ncks` ; export NCKS +PROCHOST=`hostname`;export PROCHOST +# +BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite +LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE +# +HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP +case "$HTYP" in + cam2 | cam ) + COMP_NAME=atm ;; + cism ) + COMP_NAME=glc ;; + clm2 ) + COMP_NAME=lnd ;; + pop ) + COMP_NAME=ocn ;; + rtm | mosart ) + COMP_NAME=rof ;; + cice | csim ) + COMP_NAME=ice ;; + * ) + echo "Unable to continue because "$HIST" not known." + exit 1 ;; +esac +# +LOCAL_HIST=${LOCALDSK}/${COMP_NAME}/hist ; export LOCAL_HIST +LOCAL_PROC=${PROCBASE}/${HIST}/proc ; export LOCAL_PROC +CACHEDIR=${LOCAL_PROC}/COMPLETED ; export CACHEDIR +# +VERBOSITY=0 ; export VERBOSITY +PREFIX="${CACHEDIR}/${CASE}.${HIST}." ; export PREFIX +NCFORMAT=netcdf4c ; export NCFORMAT ; export NCFORMAT +# +if [ ! -d $LOCAL_PROC ] ; then + mkdir -p $LOCAL_PROC +fi +if [ ! -d $CACHEDIR ] ; then + mkdir -p $CACHEDIR +fi +# +cd $LOCAL_PROC +ln -s -f $BASEDIR/run_slice2series_dav Transpose_Data +# +rm -f ${CASE}.${HIST}.*nc +if [ ! -f ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} ] ; then + ln -s -f ${LOCAL_HIST}/${CASE}.${HIST}.${YEAR}*nc . + NHISTF=`/bin/ls ${CASE}.${HIST}.${YEAR}*nc | wc -l` + if [ $NHISTF -eq 12 ] ; then + OUTTIME="${YEAR}01-${YEAR}12" + SUFFIX=".${OUTTIME}.nc" ; export SUFFIX + echo -n "TS transpose_data start: " ; date + ./Transpose_Data + if [ $? 
-ne 0 ] ; then + echo "Transpose_Data failed" + exit 1 + fi + echo -n "TS transpose_data end : " ; date + touch ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} + else + echo "File count mismatch on "${CASE}"."${HIST}"."${YEAR}": "${NHISTF}" instead of 12" + fi +fi +# +echo -n "TS COMPLETE: " ; date +# +exit diff --git a/data_reshaping/logs/.gitignore b/data_reshaping/logs/.gitignore new file mode 100644 index 0000000..c337daf --- /dev/null +++ b/data_reshaping/logs/.gitignore @@ -0,0 +1,2 @@ +Cice* +Pop* diff --git a/data_reshaping/pop.h.nday1_t13.sh b/data_reshaping/pop.h.nday1_t13.sh new file mode 100755 index 0000000..ac80fd9 --- /dev/null +++ b/data_reshaping/pop.h.nday1_t13.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l +# +#SBATCH -n 64 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=16 +#SBATCH -t 24:00:00 +#SBATCH -p dav +#SBATCH -J Pop.h.nday1_t13 +#SBATCH --account=CESM0010 +#SBATCH --mem 100G +#SBATCH -e logs/Pop.h.nday1_t13.err.%J +#SBATCH -o logs/Pop.h.nday1_t13.out.%J +#SBATCH --mail-type=ALL +#SBATCH --mail-user=mlevy@ucar.edu +#SBATCH -m block +# +module purge +conda deactivate || echo "conda not loaded" +# +# PARSE COMMAND LINE ARGUMENTS +CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE +YEAR=${2} +echo "Reshaping year ${YEAR} for ${CASE}..." +# +cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin +. activate +# +module load intel/17.0.1 +module load ncarenv +module load ncarcompilers +module load impi +module load netcdf/4.6.1 +module load nco/4.7.4 +module load ncl/6.4.0 +# +HIST=pop.h.nday1 ; export HIST +# +PATH=/glade/p/cesm/postprocessing_dav/cesm-env2/bin:/usr/local/bin:${PATH} ; export PATH +# +NCKS=`which ncks` ; export NCKS +PROCHOST=`hostname`;export PROCHOST +# +BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite +LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE +# +HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP +case "$HTYP" in + cam2 | cam ) + COMP_NAME=atm ;; + cism ) + COMP_NAME=glc ;; + clm2 ) + COMP_NAME=lnd ;; + pop ) + COMP_NAME=ocn ;; + rtm | mosart ) + COMP_NAME=rof ;; + cice | csim ) + COMP_NAME=ice ;; + * ) + echo "Unable to continue because "$HIST" not known." + exit 1 ;; +esac +# +LOCAL_HIST=${LOCALDSK}/${COMP_NAME}/hist ; export LOCAL_HIST +LOCAL_PROC=${PROCBASE}/${HIST}/proc ; export LOCAL_PROC +CACHEDIR=${LOCAL_PROC}/COMPLETED ; export CACHEDIR +# +VERBOSITY=0 ; export VERBOSITY +PREFIX="${CACHEDIR}/${CASE}.${HIST}." ; export PREFIX +NCFORMAT=netcdf4c ; export NCFORMAT ; export NCFORMAT +# +if [ ! -d $LOCAL_PROC ] ; then + mkdir -p $LOCAL_PROC +fi +if [ ! -d $CACHEDIR ] ; then + mkdir -p $CACHEDIR +fi +# +cd $LOCAL_PROC +ln -s -f $BASEDIR/run_slice2series_dav Transpose_Data +# +rm -f ${CASE}.${HIST}.*nc +if [ ! -f ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} ] ; then + ln -s -f ${LOCAL_HIST}/${CASE}.${HIST}.${YEAR}*nc . + NHISTF=`/bin/ls ${CASE}.${HIST}.${YEAR}*nc | wc -l` + if [ $NHISTF -eq 12 ] ; then + OUTTIME="${YEAR}0101-${YEAR}1231" + SUFFIX=".${OUTTIME}.nc" ; export SUFFIX + echo -n "TS transpose_data start: " ; date + ./Transpose_Data + if [ $? 
-ne 0 ] ; then + echo "Transpose_Data failed" + exit 1 + fi + echo -n "TS transpose_data end : " ; date + touch ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} + else + echo "File count mismatch on "${CASE}"."${HIST}"."${YEAR}": "${NHISTF}" instead of 12" + fi +fi +# +echo -n "TS COMPLETE: " ; date +# +exit diff --git a/data_reshaping/pop.h.nyear1_t13.sh b/data_reshaping/pop.h.nyear1_t13.sh new file mode 100755 index 0000000..df0f96c --- /dev/null +++ b/data_reshaping/pop.h.nyear1_t13.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l +# +#SBATCH -n 64 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=16 +#SBATCH -t 24:00:00 +#SBATCH -p dav +#SBATCH -J Pop.h.nyear1_t13 +#SBATCH --account=CESM0010 +#SBATCH --mem 100G +#SBATCH -e logs/Pop.h.nyear1_t13.err.%J +#SBATCH -o logs/Pop.h.nyear1_t13.out.%J +#SBATCH --mail-type=ALL +#SBATCH --mail-user=mlevy@ucar.edu +#SBATCH -m block +# +module purge +conda deactivate || echo "conda not loaded" +# +# PARSE COMMAND LINE ARGUMENTS +CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE +YEAR=${2} +echo "Reshaping year ${YEAR} for ${CASE}..." +# +cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin +. activate +# +module load intel/17.0.1 +module load ncarenv +module load ncarcompilers +module load impi +module load netcdf/4.6.1 +module load nco/4.7.4 +module load ncl/6.4.0 +# +HIST=pop.h.nyear1 ; export HIST +# +PATH=/glade/p/cesm/postprocessing_dav/cesm-env2/bin:/usr/local/bin:${PATH} ; export PATH +# +NCKS=`which ncks` ; export NCKS +PROCHOST=`hostname`;export PROCHOST +# +BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite +LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE +# +HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP +case "$HTYP" in + cam2 | cam ) + COMP_NAME=atm ;; + cism ) + COMP_NAME=glc ;; + clm2 ) + COMP_NAME=lnd ;; + pop ) + COMP_NAME=ocn ;; + rtm | mosart ) + COMP_NAME=rof ;; + cice | csim ) + COMP_NAME=ice ;; + * ) + echo "Unable to continue because "$HIST" not known." + exit 1 ;; +esac +# +LOCAL_HIST=${LOCALDSK}/${COMP_NAME}/hist ; export LOCAL_HIST +LOCAL_PROC=${PROCBASE}/${HIST}/proc ; export LOCAL_PROC +CACHEDIR=${LOCAL_PROC}/COMPLETED ; export CACHEDIR +# +VERBOSITY=0 ; export VERBOSITY +PREFIX="${CACHEDIR}/${CASE}.${HIST}." ; export PREFIX +NCFORMAT=netcdf4c ; export NCFORMAT ; export NCFORMAT +# +if [ ! -d $LOCAL_PROC ] ; then + mkdir -p $LOCAL_PROC +fi +if [ ! -d $CACHEDIR ] ; then + mkdir -p $CACHEDIR +fi +# +cd $LOCAL_PROC +ln -s -f $BASEDIR/run_slice2series_dav Transpose_Data +# +rm -f ${CASE}.${HIST}.*nc +if [ ! -f ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} ] ; then + ln -s -f ${LOCAL_HIST}/${CASE}.${HIST}.${YEAR}*nc . + NHISTF=`/bin/ls ${CASE}.${HIST}.${YEAR}*nc | wc -l` + if [ $NHISTF -eq 1 ] ; then + OUTTIME="${YEAR}-${YEAR}" + SUFFIX=".${OUTTIME}.nc" ; export SUFFIX + echo -n "TS transpose_data start: " ; date + ./Transpose_Data + if [ $? 
-ne 0 ] ; then + echo "Transpose_Data failed" + exit 1 + fi + echo -n "TS transpose_data end : " ; date + touch ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} + else + echo "File count mismatch on "${CASE}"."${HIST}"."${YEAR}": "${NHISTF}" instead of 1" + fi +fi +# +echo -n "TS COMPLETE: " ; date +# +exit diff --git a/data_reshaping/pop.h_t13.sh b/data_reshaping/pop.h_t13.sh new file mode 100755 index 0000000..93c91b9 --- /dev/null +++ b/data_reshaping/pop.h_t13.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l +# +#SBATCH -n 64 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=16 +#SBATCH -t 24:00:00 +#SBATCH -p dav +#SBATCH -J Pop.h_t13 +#SBATCH --account=CESM0010 +#SBATCH --mem 100G +#SBATCH -e logs/Pop.h_t13.err.%J +#SBATCH -o logs/Pop.h_t13.out.%J +#SBATCH --mail-type=ALL +#SBATCH --mail-user=mlevy@ucar.edu +#SBATCH -m block +# +module purge +conda deactivate || echo "conda not loaded" +# +# PARSE COMMAND LINE ARGUMENTS +CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE +YEAR=${2} +echo "Reshaping year ${YEAR} for ${CASE}..." +# +cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin +. activate +# +module load intel/17.0.1 +module load ncarenv +module load ncarcompilers +module load impi +module load netcdf/4.6.1 +module load nco/4.7.4 +module load ncl/6.4.0 +# +HIST=pop.h ; export HIST +# +PATH=/glade/p/cesm/postprocessing_dav/cesm-env2/bin:/usr/local/bin:${PATH} ; export PATH +# +NCKS=`which ncks` ; export NCKS +PROCHOST=`hostname`;export PROCHOST +# +BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite +LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE +# +HTYP=`echo $HIST | cut -d'.' -f1` ; export HTYP +case "$HTYP" in + cam2 | cam ) + COMP_NAME=atm ;; + cism ) + COMP_NAME=glc ;; + clm2 ) + COMP_NAME=lnd ;; + pop ) + COMP_NAME=ocn ;; + rtm | mosart ) + COMP_NAME=rof ;; + cice | csim ) + COMP_NAME=ice ;; + * ) + echo "Unable to continue because "$HIST" not known." 
+ exit 1 ;; +esac +# +LOCAL_HIST=${LOCALDSK}/${COMP_NAME}/hist ; export LOCAL_HIST +LOCAL_PROC=${PROCBASE}/${HIST}/proc ; export LOCAL_PROC +CACHEDIR=${LOCAL_PROC}/COMPLETED ; export CACHEDIR +# +VERBOSITY=0 ; export VERBOSITY +PREFIX="${CACHEDIR}/${CASE}.${HIST}." ; export PREFIX +NCFORMAT=netcdf4c ; export NCFORMAT ; export NCFORMAT +# +if [ ! -d $LOCAL_PROC ] ; then + mkdir -p $LOCAL_PROC +fi +if [ ! -d $CACHEDIR ] ; then + mkdir -p $CACHEDIR +fi +# +cd $LOCAL_PROC +ln -s -f $BASEDIR/run_slice2series_dav Transpose_Data +# +rm -f ${CASE}.${HIST}.*nc +if [ ! -f ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} ] ; then + ln -s -f ${LOCAL_HIST}/${CASE}.${HIST}.${YEAR}*nc . + NHISTF=`/bin/ls ${CASE}.${HIST}.${YEAR}*nc | wc -l` + if [ $NHISTF -eq 12 ] ; then + OUTTIME="${YEAR}01-${YEAR}12" + SUFFIX=".${OUTTIME}.nc" ; export SUFFIX + echo -n "TS transpose_data start: " ; date + ./Transpose_Data + if [ $? -ne 0 ] ; then + echo "Transpose_Data failed" + exit 1 + fi + echo -n "TS transpose_data end : " ; date + touch ${LOCAL_PROC}/.DONE.${CASE}.${HIST}.${YEAR} + else + echo "File count mismatch on "${CASE}"."${HIST}"."${YEAR}": "${NHISTF}" instead of 12" + fi +fi +# +echo -n "TS COMPLETE: " ; date +# +exit diff --git a/data_reshaping/run_all.py b/data_reshaping/run_all.py new file mode 100755 index 0000000..d0698ad --- /dev/null +++ b/data_reshaping/run_all.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# sbatch pop.h.nday1_t13.sh 004 0001 +import os + + +def _parse_args(): + """ Parse command line arguments """ + + import argparse + + parser = argparse.ArgumentParser( + description="Submit scripts to reshape highres BGC output", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + # Required: specify year + parser.add_argument( + "-y", + "--years", + action="store", + dest="years", + type=int, + required=True, + nargs="+", + help="Year of run to convert to time series", + ) + + # Optional: which case to convert + parser.add_argument( + "-c", + "--case", + 
action="store", + dest="case", + type=str, + default="004", + help="Suffix of case to convert to time series", + ) + + # Optional: specify which scripts to run + parser.add_argument( + "-s", + "--scripts", + action="store", + dest="scripts", + type=str, + nargs="+", + default=[ + "pop.h_t13.sh", + "pop.h.nday1_t13.sh", + "cice.h_t13.sh", + "pop.h.nyear1_t13.sh", + "cice.h1_t13.sh", + ], + help="Scripts to submit to slurm", + ) + + # Optional: is this a dry-run? If so, don't submit anything + parser.add_argument( + "-d", + "--dry-run", + action="store_true", + dest="dryrun", + help="If true, do not actually submit job", + ) + + return parser.parse_args() + + +################### + +if __name__ == "__main__": + args = _parse_args() + case = args.case + + for yr in args.years: + year = f"{yr:04}" + for script in args.scripts: + print(f"Submitting {script} for year {year} of {case}...") + if not args.dryrun: + # note: the --dependency=singleton option means only one job per job name + # Some jobs had been crashing, and I think it was due to temporary + # files clobbering each other? But only having one pop.h_t13.sh job + # at a time seems to have prevented these issues. + os.system(f"sbatch --dependency=singleton {script} {case} {year}") From 054efff13215e06f321093cc392170319c20fdd3 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 23 Sep 2020 17:39:18 -0600 Subject: [PATCH 02/14] shell scripts don't need to be executable Since the bash scripts will be submitted to slurm by the python script, they do not need to be executable. 
--- data_reshaping/cice.h1_t13.sh | 0 data_reshaping/cice.h_t13.sh | 0 data_reshaping/pop.h.nday1_t13.sh | 0 data_reshaping/pop.h.nyear1_t13.sh | 0 data_reshaping/pop.h_t13.sh | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 data_reshaping/cice.h1_t13.sh mode change 100755 => 100644 data_reshaping/cice.h_t13.sh mode change 100755 => 100644 data_reshaping/pop.h.nday1_t13.sh mode change 100755 => 100644 data_reshaping/pop.h.nyear1_t13.sh mode change 100755 => 100644 data_reshaping/pop.h_t13.sh diff --git a/data_reshaping/cice.h1_t13.sh b/data_reshaping/cice.h1_t13.sh old mode 100755 new mode 100644 diff --git a/data_reshaping/cice.h_t13.sh b/data_reshaping/cice.h_t13.sh old mode 100755 new mode 100644 diff --git a/data_reshaping/pop.h.nday1_t13.sh b/data_reshaping/pop.h.nday1_t13.sh old mode 100755 new mode 100644 diff --git a/data_reshaping/pop.h.nyear1_t13.sh b/data_reshaping/pop.h.nyear1_t13.sh old mode 100755 new mode 100644 diff --git a/data_reshaping/pop.h_t13.sh b/data_reshaping/pop.h_t13.sh old mode 100755 new mode 100644 From 9157beb669eb5da5cd2a0d5e36a87730ef1ba959 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 23 Sep 2020 17:42:10 -0600 Subject: [PATCH 03/14] Remove un-necessary comment --- data_reshaping/run_all.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data_reshaping/run_all.py b/data_reshaping/run_all.py index d0698ad..5b97698 100755 --- a/data_reshaping/run_all.py +++ b/data_reshaping/run_all.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# sbatch pop.h.nday1_t13.sh 004 0001 import os From f0393f489526cd26ad9b325b8147ebd6c0268d29 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Thu, 24 Sep 2020 09:57:52 -0600 Subject: [PATCH 04/14] Small refactor to get my email out of scripts Now pass --mail-type and --mail-user through the python script (default sends email, but --no-mail turns off the messages) --- data_reshaping/cice.h1_t13.sh | 2 -- data_reshaping/cice.h_t13.sh | 2 -- data_reshaping/pop.h.nday1_t13.sh 
| 2 -- data_reshaping/pop.h.nyear1_t13.sh | 2 -- data_reshaping/pop.h_t13.sh | 2 -- data_reshaping/run_all.py | 18 +++++++++++++++++- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/data_reshaping/cice.h1_t13.sh b/data_reshaping/cice.h1_t13.sh index 0391557..7c38352 100644 --- a/data_reshaping/cice.h1_t13.sh +++ b/data_reshaping/cice.h1_t13.sh @@ -10,8 +10,6 @@ #SBATCH --mem 100G #SBATCH -e logs/Cice.h1_t13.err.%J #SBATCH -o logs/Cice.h1_t13.out.%J -#SBATCH --mail-type=ALL -#SBATCH --mail-user=mlevy@ucar.edu #SBATCH -m block # module purge diff --git a/data_reshaping/cice.h_t13.sh b/data_reshaping/cice.h_t13.sh index 89fce89..ed2ff26 100644 --- a/data_reshaping/cice.h_t13.sh +++ b/data_reshaping/cice.h_t13.sh @@ -10,8 +10,6 @@ #SBATCH --mem 100G #SBATCH -e logs/Cice.h_t13.err.%J #SBATCH -o logs/Cice.h_t13.out.%J -#SBATCH --mail-type=ALL -#SBATCH --mail-user=mlevy@ucar.edu #SBATCH -m block # module purge diff --git a/data_reshaping/pop.h.nday1_t13.sh b/data_reshaping/pop.h.nday1_t13.sh index ac80fd9..129a23f 100644 --- a/data_reshaping/pop.h.nday1_t13.sh +++ b/data_reshaping/pop.h.nday1_t13.sh @@ -10,8 +10,6 @@ #SBATCH --mem 100G #SBATCH -e logs/Pop.h.nday1_t13.err.%J #SBATCH -o logs/Pop.h.nday1_t13.out.%J -#SBATCH --mail-type=ALL -#SBATCH --mail-user=mlevy@ucar.edu #SBATCH -m block # module purge diff --git a/data_reshaping/pop.h.nyear1_t13.sh b/data_reshaping/pop.h.nyear1_t13.sh index df0f96c..0589519 100644 --- a/data_reshaping/pop.h.nyear1_t13.sh +++ b/data_reshaping/pop.h.nyear1_t13.sh @@ -10,8 +10,6 @@ #SBATCH --mem 100G #SBATCH -e logs/Pop.h.nyear1_t13.err.%J #SBATCH -o logs/Pop.h.nyear1_t13.out.%J -#SBATCH --mail-type=ALL -#SBATCH --mail-user=mlevy@ucar.edu #SBATCH -m block # module purge diff --git a/data_reshaping/pop.h_t13.sh b/data_reshaping/pop.h_t13.sh index 93c91b9..980b0f7 100644 --- a/data_reshaping/pop.h_t13.sh +++ b/data_reshaping/pop.h_t13.sh @@ -10,8 +10,6 @@ #SBATCH --mem 100G #SBATCH -e logs/Pop.h_t13.err.%J #SBATCH -o 
logs/Pop.h_t13.out.%J -#SBATCH --mail-type=ALL -#SBATCH --mail-user=mlevy@ucar.edu #SBATCH -m block # module purge diff --git a/data_reshaping/run_all.py b/data_reshaping/run_all.py index 5b97698..b3f4098 100755 --- a/data_reshaping/run_all.py +++ b/data_reshaping/run_all.py @@ -62,6 +62,14 @@ def _parse_args(): help="If true, do not actually submit job", ) + # Optional: By default, slurm will email users when jobs start and finish + parser.add_argument( + "--no-mail", + action="store_false", + dest="send_mail", + help="If true, send SLURM emails to {user}@ucar.edu", + ) + return parser.parse_args() @@ -70,14 +78,22 @@ def _parse_args(): if __name__ == "__main__": args = _parse_args() case = args.case + mail_opt = ( + f"--mail-type=ALL --mail-user={os.environ['USER']}@ucar.edu" + if args.send_mail + else "--mail-type=NONE" + ) for yr in args.years: year = f"{yr:04}" for script in args.scripts: print(f"Submitting {script} for year {year} of {case}...") + cmd = f"sbatch {mail_opt} --dependency=singleton {script} {case} {year}" if not args.dryrun: # note: the --dependency=singleton option means only one job per job name # Some jobs had been crashing, and I think it was due to temporary # files clobbering each other? But only having one pop.h_t13.sh job # at a time seems to have prevented these issues. - os.system(f"sbatch --dependency=singleton {script} {case} {year}") + os.system(cmd) + else: + print(f"Command to run: {cmd}") From 170bc3fe7310f46939e158071db17a6a8c4e752d Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 12 Oct 2020 08:14:28 -0600 Subject: [PATCH 05/14] Update comparison notebooks I've added 0007 and 0008 to glade/campaign, so compare_ts_and_hist_004 checks those years. 
Also, I cleaned up some of the output (no longer printing start / finish time) --- notebooks/compare_ts_and_hist_003.ipynb | 24 ++++++-------- notebooks/compare_ts_and_hist_004.ipynb | 42 ++++++++++++++----------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/notebooks/compare_ts_and_hist_003.ipynb b/notebooks/compare_ts_and_hist_003.ipynb index 1c8d92c..1778bbb 100644 --- a/notebooks/compare_ts_and_hist_003.ipynb +++ b/notebooks/compare_ts_and_hist_003.ipynb @@ -6,8 +6,6 @@ "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime\n", - "\n", "import yaml\n", "\n", "import utils" @@ -48,21 +46,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Starting year 0001 at 13:07:35 2020-10-09\n", + "Starting year 0001...\n", "No differences found in year 0001\n", - "Finished 1 at 13:12:38 2020-10-09\n", + "Finished 0001\n", "----\n", - "Starting year 0002 at 13:12:38 2020-10-09\n", + "Starting year 0002...\n", "No differences found in year 0002\n", - "Finished 2 at 13:17:28 2020-10-09\n", + "Finished 0002\n", "----\n", - "Starting year 0003 at 13:17:28 2020-10-09\n", + "Starting year 0003...\n", "No differences found in year 0003\n", - "Finished 3 at 13:22:19 2020-10-09\n", + "Finished 0003\n", "----\n", - "Starting year 0004 at 13:22:19 2020-10-09\n", + "Starting year 0004...\n", "No differences found in year 0004\n", - "Finished 4 at 13:27:07 2020-10-09\n", + "Finished 0004\n", "----\n" ] } @@ -75,9 +73,7 @@ "case_hist._timeseries_filenames[stream] = []\n", "\n", "for year in range(start_year, end_year + 1):\n", - " print(\n", - " f\"Starting year {year:04} at {datetime.now().strftime('%H:%M:%S %Y-%m-%d')}\"\n", - " )\n", + " print(f\"Starting year {year:04}...\")\n", " all_same = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", @@ -94,7 +90,7 @@ " all_same = False\n", " if all_same:\n", " print(f\"No differences found in year {year:04}\")\n", - " print(f\"Finished {year} at 
{datetime.now().strftime('%H:%M:%S %Y-%m-%d')}\")\n", + " print(f\"Finished {year:04}\")\n", " print(\"----\")" ] } diff --git a/notebooks/compare_ts_and_hist_004.ipynb b/notebooks/compare_ts_and_hist_004.ipynb index 4173bd3..03d90d0 100644 --- a/notebooks/compare_ts_and_hist_004.ipynb +++ b/notebooks/compare_ts_and_hist_004.ipynb @@ -6,8 +6,6 @@ "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime\n", - "\n", "import yaml\n", "\n", "import utils" @@ -23,7 +21,7 @@ "\n", "# currently have all history files available, but only 6 years of time series\n", "start_year = 1\n", - "end_year = 6" + "end_year = 8" ] }, { @@ -47,29 +45,37 @@ "name": "stdout", "output_type": "stream", "text": [ - "Starting year 0001 at 13:07:35 2020-10-09\n", + "Starting year 0001...\n", "No differences found in year 0001\n", - "Finished 1 at 13:12:45 2020-10-09\n", + "Finished 0001\n", "----\n", - "Starting year 0002 at 13:12:45 2020-10-09\n", + "Starting year 0002...\n", "No differences found in year 0002\n", - "Finished 2 at 13:17:39 2020-10-09\n", + "Finished 0002\n", "----\n", - "Starting year 0003 at 13:17:39 2020-10-09\n", + "Starting year 0003...\n", "No differences found in year 0003\n", - "Finished 3 at 13:22:31 2020-10-09\n", + "Finished 0003\n", "----\n", - "Starting year 0004 at 13:22:31 2020-10-09\n", + "Starting year 0004...\n", "No differences found in year 0004\n", - "Finished 4 at 13:27:28 2020-10-09\n", + "Finished 0004\n", "----\n", - "Starting year 0005 at 13:27:28 2020-10-09\n", + "Starting year 0005...\n", "No differences found in year 0005\n", - "Finished 5 at 13:32:23 2020-10-09\n", + "Finished 0005\n", "----\n", - "Starting year 0006 at 13:32:23 2020-10-09\n", + "Starting year 0006...\n", "No differences found in year 0006\n", - "Finished 6 at 13:37:30 2020-10-09\n", + "Finished 0006\n", + "----\n", + "Starting year 0007...\n", + "No differences found in year 0007\n", + "Finished 0007\n", + "----\n", + "Starting year 0008...\n", + "No 
differences found in year 0008\n", + "Finished 0008\n", "----\n" ] } @@ -82,9 +88,7 @@ "case_hist._timeseries_filenames[stream] = []\n", "\n", "for year in range(start_year, end_year + 1):\n", - " print(\n", - " f\"Starting year {year:04} at {datetime.now().strftime('%H:%M:%S %Y-%m-%d')}\"\n", - " )\n", + " print(f\"Starting year {year:04}...\")\n", " all_same = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", @@ -101,7 +105,7 @@ " all_same = False\n", " if all_same:\n", " print(f\"No differences found in year {year:04}\")\n", - " print(f\"Finished {year} at {datetime.now().strftime('%H:%M:%S %Y-%m-%d')}\")\n", + " print(f\"Finished {year:04}\")\n", " print(\"----\")" ] } From 0738f0a4dddcbd97a22230ccc5efc69b50841013 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Thu, 15 Oct 2020 14:20:38 -0600 Subject: [PATCH 06/14] CaseClass tracks file type opened by gen_dataset There is now a way to query whether a specific year of a variable from dataset came from time series or history files. This is probably only useful for the compare_ts_and_hist notebooks, which have been re-run. Note that for this commit I re-ran the notebooks on cheyenne, which does not have access to the time series data on campaign -- when casper is back up, I will re-run the notebooks to actually do the comparison. 
--- notebooks/compare_ts_and_hist_003.ipynb | 94 ++++++++++++------- notebooks/compare_ts_and_hist_004.ipynb | 117 ++++++++++++++++-------- notebooks/utils/CaseClass.py | 66 ++++++++++--- 3 files changed, 194 insertions(+), 83 deletions(-) diff --git a/notebooks/compare_ts_and_hist_003.ipynb b/notebooks/compare_ts_and_hist_003.ipynb index 1778bbb..5dd6a74 100644 --- a/notebooks/compare_ts_and_hist_003.ipynb +++ b/notebooks/compare_ts_and_hist_003.ipynb @@ -16,20 +16,6 @@ "execution_count": 2, "metadata": {}, "outputs": [], - "source": [ - "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n", - "\n", - "# currently have all history files and time series available\n", - "# 003 only ran for four years\n", - "start_year = 1\n", - "end_year = 4" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n", " diag_metadata_list = yaml.safe_load(fptr)\n", @@ -39,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -47,47 +33,93 @@ "output_type": "stream", "text": [ "Starting year 0001...\n", - "No differences found in year 0001\n", - "Finished 0001\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0001 are missing, skipping comparison\n", "----\n", "Starting year 0002...\n", - "No differences found in year 0002\n", - "Finished 0002\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0002 are missing, skipping comparison\n", "----\n", "Starting year 0003...\n", - "No differences found in year 0003\n", - "Finished 0003\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0003 are missing, skipping comparison\n", "----\n", "Starting year 
0004...\n", - "No differences found in year 0004\n", - "Finished 0004\n", - "----\n" + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0004 are missing, skipping comparison\n", + "----\n", + "Starting year 0005...\n", + "Year 0005 is not available, are you sure it has been run?\n" ] } ], "source": [ + "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n", + "\n", "case_hist = utils.CaseClass(casename)\n", "case_ts = utils.CaseClass(casename)\n", "\n", "stream = \"pop.h\"\n", "case_hist._timeseries_filenames[stream] = []\n", "\n", - "for year in range(start_year, end_year + 1):\n", + "for year in range(1, 62):\n", " print(f\"Starting year {year:04}...\")\n", " all_same = True\n", + " found_ts_and_hist = True\n", + " year_found = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", - " ds_hist = case_hist.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " ds_ts = case_ts.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", + "\n", + " # (1) generate datasets\n", + " try:\n", + " ds_hist = case_hist.gen_dataset(\n", + " varname, stream, start_year=year, end_year=year, quiet=True\n", + " )\n", + " ds_ts = case_ts.gen_dataset(\n", + " varname, stream, start_year=year, end_year=year, quiet=True\n", + " )\n", + " except ValueError:\n", + " year_found = False\n", + " break\n", + "\n", + " # (2) Check sources of data being read\n", + " data_source = case_hist.get_dataset_source(stream, year, varname)\n", + " if data_source != \"hist\":\n", + " print(\n", + " f\"case_hist data for {varname} comes from '{data_source}', not history files\"\n", + " )\n", + " found_ts_and_hist = False\n", + " break\n", + " data_source = case_ts.get_dataset_source(stream, year, varname)\n", + " if data_source != \"time series\":\n", + " print(\n", + " f\"case_ts data for 
{varname} comes from '{data_source}', not time series files\"\n", + " )\n", + " found_ts_and_hist = False\n", + " break\n", + "\n", + " # (3) Compare datasets\n", " da_hist = ds_hist[varname].isel(diag_metadata.get(\"isel_dict\"))\n", " da_ts = ds_ts[varname].isel(diag_metadata.get(\"isel_dict\"))\n", " if not da_hist.identical(da_ts):\n", " print(f\"{varname} is different in year {year:04}\")\n", " all_same = False\n", + "\n", + " # Error checking after running through all variables for a given year\n", + " # (1) was data for the year available?\n", + " if not year_found:\n", + " print(f\"Year {year:04} is not available, are you sure it has been run?\")\n", + " break\n", + "\n", + " # (2) was data for the year available via both time series and history files?\n", + " if not found_ts_and_hist:\n", + " print(\n", + " f\"Either time series or history files for variables in {year:04} are missing, skipping comparison\"\n", + " )\n", + " print(\"----\")\n", + " continue\n", + "\n", + " # (3) was the data in the time series files identical to that in the history files?\n", " if all_same:\n", " print(f\"No differences found in year {year:04}\")\n", " print(f\"Finished {year:04}\")\n", diff --git a/notebooks/compare_ts_and_hist_004.ipynb b/notebooks/compare_ts_and_hist_004.ipynb index 03d90d0..0a92c60 100644 --- a/notebooks/compare_ts_and_hist_004.ipynb +++ b/notebooks/compare_ts_and_hist_004.ipynb @@ -16,19 +16,6 @@ "execution_count": 2, "metadata": {}, "outputs": [], - "source": [ - "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n", - "\n", - "# currently have all history files available, but only 6 years of time series\n", - "start_year = 1\n", - "end_year = 8" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n", " diag_metadata_list = yaml.safe_load(fptr)\n", @@ -38,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, 
"metadata": {}, "outputs": [ { @@ -46,63 +33,117 @@ "output_type": "stream", "text": [ "Starting year 0001...\n", - "No differences found in year 0001\n", - "Finished 0001\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0001 are missing, skipping comparison\n", "----\n", "Starting year 0002...\n", - "No differences found in year 0002\n", - "Finished 0002\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0002 are missing, skipping comparison\n", "----\n", "Starting year 0003...\n", - "No differences found in year 0003\n", - "Finished 0003\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0003 are missing, skipping comparison\n", "----\n", "Starting year 0004...\n", - "No differences found in year 0004\n", - "Finished 0004\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0004 are missing, skipping comparison\n", "----\n", "Starting year 0005...\n", - "No differences found in year 0005\n", - "Finished 0005\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0005 are missing, skipping comparison\n", "----\n", "Starting year 0006...\n", - "No differences found in year 0006\n", - "Finished 0006\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0006 are missing, skipping comparison\n", "----\n", "Starting year 0007...\n", - "No differences found in year 0007\n", - "Finished 0007\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0007 are missing, skipping 
comparison\n", "----\n", "Starting year 0008...\n", - "No differences found in year 0008\n", - "Finished 0008\n", - "----\n" + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0008 are missing, skipping comparison\n", + "----\n", + "Starting year 0009...\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0009 are missing, skipping comparison\n", + "----\n", + "Starting year 0010...\n", + "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", + "Either time series or history files for variables in 0010 are missing, skipping comparison\n", + "----\n", + "Starting year 0011...\n", + "Year 0011 is not available, are you sure it has been run?\n" ] } ], "source": [ + "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n", + "\n", "case_hist = utils.CaseClass(casename)\n", "case_ts = utils.CaseClass(casename)\n", "\n", "stream = \"pop.h\"\n", "case_hist._timeseries_filenames[stream] = []\n", "\n", - "for year in range(start_year, end_year + 1):\n", + "for year in range(1, 62):\n", " print(f\"Starting year {year:04}...\")\n", " all_same = True\n", + " found_ts_and_hist = True\n", + " year_found = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", - " ds_hist = case_hist.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " ds_ts = case_ts.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", + "\n", + " # (1) generate datasets\n", + " try:\n", + " ds_hist = case_hist.gen_dataset(\n", + " varname, stream, start_year=year, end_year=year, quiet=True\n", + " )\n", + " ds_ts = case_ts.gen_dataset(\n", + " varname, stream, start_year=year, end_year=year, quiet=True\n", + " )\n", + " except ValueError:\n", + " year_found = False\n", + " break\n", + "\n", + " 
# (2) Check sources of data being read\n", + " data_source = case_hist.get_dataset_source(stream, year, varname)\n", + " if data_source != \"hist\":\n", + " print(\n", + " f\"case_hist data for {varname} comes from '{data_source}', not history files\"\n", + " )\n", + " found_ts_and_hist = False\n", + " break\n", + " data_source = case_ts.get_dataset_source(stream, year, varname)\n", + " if data_source != \"time series\":\n", + " print(\n", + " f\"case_ts data for {varname} comes from '{data_source}', not time series files\"\n", + " )\n", + " found_ts_and_hist = False\n", + " break\n", + "\n", + " # (3) Compare datasets\n", " da_hist = ds_hist[varname].isel(diag_metadata.get(\"isel_dict\"))\n", " da_ts = ds_ts[varname].isel(diag_metadata.get(\"isel_dict\"))\n", " if not da_hist.identical(da_ts):\n", " print(f\"{varname} is different in year {year:04}\")\n", " all_same = False\n", + "\n", + " # Error checking after running through all variables for a given year\n", + " # (1) was data for the year available?\n", + " if not year_found:\n", + " print(f\"Year {year:04} is not available, are you sure it has been run?\")\n", + " break\n", + "\n", + " # (2) was data for the year available via both time series and history files?\n", + " if not found_ts_and_hist:\n", + " print(\n", + " f\"Either time series or history files for variables in {year:04} are missing, skipping comparison\"\n", + " )\n", + " print(\"----\")\n", + " continue\n", + "\n", + " # (3) was the data in the time series files identical to that in the history files?\n", " if all_same:\n", " print(f\"No differences found in year {year:04}\")\n", " print(f\"Finished {year:04}\")\n", diff --git a/notebooks/utils/CaseClass.py b/notebooks/utils/CaseClass.py index 6c965d8..881de4c 100644 --- a/notebooks/utils/CaseClass.py +++ b/notebooks/utils/CaseClass.py @@ -35,6 +35,8 @@ def __init__(self, casenames, verbose=False): self._log_filenames = self._find_log_files() self._timeseries_filenames = 
self._find_timeseries_files() self._history_filenames = self._find_hist_files() + self._dataset_files = dict() + self._dataset_src = dict() self.log_contents = dict() @@ -208,6 +210,32 @@ def get_catalog(self): ############################################################################ + def get_dataset_source(self, stream, year, varname): + data_not_found = "no data" + + # Does _dataset_src[stream] exist? + if stream not in self._dataset_src: + print(f"No datasets have been returned from {stream}") + return data_not_found + + # Does _dataset_src[stream][year] exist? + if year not in self._dataset_src[stream]: + print( + f"No datasets covering year {year:04} have been returned from {stream}" + ) + return data_not_found + + # Does _dataset_src[stream][year][varname] exist? + if varname not in self._dataset_src[stream][year]: + print( + f"No dataset containing {varname} from year {year:04} have been returned from {stream}" + ) + return data_not_found + + return self._dataset_src[stream][year][varname] + + ############################################################################ + def gen_dataset( self, varnames, @@ -230,6 +258,10 @@ def gen_dataset( if type(varnames) != list: raise ValueError(f"{casenames} is not a string or list") + if stream not in self._dataset_files: + self._dataset_files[stream] = dict() + self._dataset_src[stream] = dict() + # Set some defaults to pass to open_mfdataset, then apply kwargs argument open_mfdataset_kwargs = dict() # data_vars="minimal", to avoid introducing time dimension to time-invariant fields @@ -258,13 +290,16 @@ def gen_dataset( for varname in varnames: timeseries_filenames = [] for year in range(start_year, end_year + 1): - timeseries_filenames.extend( - [ - filename - for filename in self._timeseries_filenames[stream] - if f".{varname}." 
in filename and f".{year:04}" in filename - ] - ) + if year not in self._dataset_files[stream]: + self._dataset_files[stream][year] = dict() + self._dataset_src[stream][year] = dict() + self._dataset_files[stream][year][varname] = [ + filename + for filename in self._timeseries_filenames[stream] + if f".{varname}." in filename and f".{year:04}" in filename + ] + self._dataset_src[stream][year][varname] = "time series" + timeseries_filenames.extend(self._dataset_files[stream][year][varname]) if timeseries_filenames: ds_timeseries_per_var.append( @@ -293,13 +328,16 @@ def gen_dataset( # Pare down history file list history_filenames = [] for year in range(start_year, end_year + 1): - history_filenames.extend( - [ - filename - for filename in self._history_filenames[stream] - if f".{year:04}" in filename - ] - ) + if year not in self._dataset_files[stream]: + self._dataset_files[stream][year] = dict() + self._dataset_src[stream][year] = dict() + self._dataset_files[stream][year][varname] = [ + filename + for filename in self._history_filenames[stream] + if f".{year:04}" in filename + ] + history_filenames.extend(self._dataset_files[stream][year][varname]) + self._dataset_src[stream][year][varname] = "hist" if history_filenames: ds_history = xr.open_mfdataset(history_filenames, **open_mfdataset_kwargs,)[ From cebd8538742978434b23c2e372b33f3dd9648474 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 16 Oct 2020 09:04:01 -0600 Subject: [PATCH 07/14] Rerun compare_ts_and_hist notebooks Casper is online so we can compare to time series again --- notebooks/compare_ts_and_hist_003.ipynb | 16 ++++++------- notebooks/compare_ts_and_hist_004.ipynb | 32 ++++++++++++------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/notebooks/compare_ts_and_hist_003.ipynb b/notebooks/compare_ts_and_hist_003.ipynb index 5dd6a74..d190697 100644 --- a/notebooks/compare_ts_and_hist_003.ipynb +++ b/notebooks/compare_ts_and_hist_003.ipynb @@ -33,20 +33,20 @@ 
"output_type": "stream", "text": [ "Starting year 0001...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0001 are missing, skipping comparison\n", + "No differences found in year 0001\n", + "Finished 0001\n", "----\n", "Starting year 0002...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0002 are missing, skipping comparison\n", + "No differences found in year 0002\n", + "Finished 0002\n", "----\n", "Starting year 0003...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0003 are missing, skipping comparison\n", + "No differences found in year 0003\n", + "Finished 0003\n", "----\n", "Starting year 0004...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0004 are missing, skipping comparison\n", + "No differences found in year 0004\n", + "Finished 0004\n", "----\n", "Starting year 0005...\n", "Year 0005 is not available, are you sure it has been run?\n" diff --git a/notebooks/compare_ts_and_hist_004.ipynb b/notebooks/compare_ts_and_hist_004.ipynb index 0a92c60..ba31a5a 100644 --- a/notebooks/compare_ts_and_hist_004.ipynb +++ b/notebooks/compare_ts_and_hist_004.ipynb @@ -33,36 +33,36 @@ "output_type": "stream", "text": [ "Starting year 0001...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0001 are missing, skipping comparison\n", + "No differences found in year 0001\n", + "Finished 0001\n", "----\n", "Starting year 0002...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0002 are missing, skipping comparison\n", + "No differences 
found in year 0002\n", + "Finished 0002\n", "----\n", "Starting year 0003...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0003 are missing, skipping comparison\n", + "No differences found in year 0003\n", + "Finished 0003\n", "----\n", "Starting year 0004...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0004 are missing, skipping comparison\n", + "No differences found in year 0004\n", + "Finished 0004\n", "----\n", "Starting year 0005...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0005 are missing, skipping comparison\n", + "No differences found in year 0005\n", + "Finished 0005\n", "----\n", "Starting year 0006...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0006 are missing, skipping comparison\n", + "No differences found in year 0006\n", + "Finished 0006\n", "----\n", "Starting year 0007...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0007 are missing, skipping comparison\n", + "No differences found in year 0007\n", + "Finished 0007\n", "----\n", "Starting year 0008...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0008 are missing, skipping comparison\n", + "No differences found in year 0008\n", + "Finished 0008\n", "----\n", "Starting year 0009...\n", "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", From 21b81df11c8bb7f89f88ab29dd78b473502a275c Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 16 Oct 2020 22:19:11 -0600 Subject: [PATCH 08/14] Modify scripts per suggestions 
from code review Pass short term archive root as an argument (default is /glade/scratch/$USER/archive) to shell scripts rather than assuming archive is in my scratch directory and pass full name of case rather than suffix. These two changes combined should make the tool general enough to apply to any CESM case (e.g. Kristen's 1-degree cocco runs). Also cleaned up the way data_reshaping/logs is ignored; may need an additional commit to create the directory from run_all.py as a result. --- .gitignore | 1 + data_reshaping/cice.h1_t13.sh | 7 ++++--- data_reshaping/cice.h_t13.sh | 7 ++++--- data_reshaping/logs/.gitignore | 2 -- data_reshaping/pop.h.nday1_t13.sh | 7 ++++--- data_reshaping/pop.h.nyear1_t13.sh | 7 ++++--- data_reshaping/pop.h_t13.sh | 7 ++++--- data_reshaping/run_all.py | 20 ++++++++++++++++++-- 8 files changed, 39 insertions(+), 19 deletions(-) delete mode 100644 data_reshaping/logs/.gitignore diff --git a/.gitignore b/.gitignore index a056e25..eb4dd2d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ **/.ipynb_checkpoints **/__pycache__ notebooks/logs +data_reshaping/logs diff --git a/data_reshaping/cice.h1_t13.sh b/data_reshaping/cice.h1_t13.sh index 7c38352..fae899c 100644 --- a/data_reshaping/cice.h1_t13.sh +++ b/data_reshaping/cice.h1_t13.sh @@ -16,8 +16,9 @@ module purge conda deactivate || echo "conda not loaded" # # PARSE COMMAND LINE ARGUMENTS -CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE -YEAR=${2} +CASE=${1} ; export CASE +ARCHIVE_ROOT=${2} +YEAR=${3} echo "Reshaping year ${YEAR} for ${CASE}..." # cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin @@ -39,7 +40,7 @@ NCKS=`which ncks` ; export NCKS PROCHOST=`hostname`;export PROCHOST # BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite -LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +LOCALDSK=${ARCHIVE_ROOT}/${CASE} ; export LOCALDSK PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE # HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP diff --git a/data_reshaping/cice.h_t13.sh b/data_reshaping/cice.h_t13.sh index ed2ff26..9cba145 100644 --- a/data_reshaping/cice.h_t13.sh +++ b/data_reshaping/cice.h_t13.sh @@ -16,8 +16,9 @@ module purge conda deactivate || echo "conda not loaded" # # PARSE COMMAND LINE ARGUMENTS -CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE -YEAR=${2} +CASE=${1} ; export CASE +ARCHIVE_ROOT=${2} +YEAR=${3} echo "Reshaping year ${YEAR} for ${CASE}..." # cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin @@ -39,7 +40,7 @@ NCKS=`which ncks` ; export NCKS PROCHOST=`hostname`;export PROCHOST # BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite -LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +LOCALDSK=${ARCHIVE_ROOT}/${CASE} ; export LOCALDSK PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE # HTYP=`echo $HIST | cut -d'.' -f1` ; export HTYP diff --git a/data_reshaping/logs/.gitignore b/data_reshaping/logs/.gitignore deleted file mode 100644 index c337daf..0000000 --- a/data_reshaping/logs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -Cice* -Pop* diff --git a/data_reshaping/pop.h.nday1_t13.sh b/data_reshaping/pop.h.nday1_t13.sh index 129a23f..fe49391 100644 --- a/data_reshaping/pop.h.nday1_t13.sh +++ b/data_reshaping/pop.h.nday1_t13.sh @@ -16,8 +16,9 @@ module purge conda deactivate || echo "conda not loaded" # # PARSE COMMAND LINE ARGUMENTS -CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE -YEAR=${2} +CASE=${1} ; export CASE +ARCHIVE_ROOT=${2} +YEAR=${3} echo "Reshaping year ${YEAR} for ${CASE}..." # cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin @@ -39,7 +40,7 @@ NCKS=`which ncks` ; export NCKS PROCHOST=`hostname`;export PROCHOST # BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite -LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +LOCALDSK=${ARCHIVE_ROOT}/${CASE} ; export LOCALDSK PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE # HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP diff --git a/data_reshaping/pop.h.nyear1_t13.sh b/data_reshaping/pop.h.nyear1_t13.sh index 0589519..0ab38d5 100644 --- a/data_reshaping/pop.h.nyear1_t13.sh +++ b/data_reshaping/pop.h.nyear1_t13.sh @@ -16,8 +16,9 @@ module purge conda deactivate || echo "conda not loaded" # # PARSE COMMAND LINE ARGUMENTS -CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE -YEAR=${2} +CASE=${1} ; export CASE +ARCHIVE_ROOT=${2} +YEAR=${3} echo "Reshaping year ${YEAR} for ${CASE}..." # cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin @@ -39,7 +40,7 @@ NCKS=`which ncks` ; export NCKS PROCHOST=`hostname`;export PROCHOST # BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite -LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +LOCALDSK=${ARCHIVE_ROOT}/${CASE} ; export LOCALDSK PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE # HTYP=`echo $HIST | cut -d'.' -f1` ; export HTYP diff --git a/data_reshaping/pop.h_t13.sh b/data_reshaping/pop.h_t13.sh index 980b0f7..4fe290c 100644 --- a/data_reshaping/pop.h_t13.sh +++ b/data_reshaping/pop.h_t13.sh @@ -16,8 +16,9 @@ module purge conda deactivate || echo "conda not loaded" # # PARSE COMMAND LINE ARGUMENTS -CASE=g.e22.G1850ECO_JRA_HR.TL319_t13.${1} ; export CASE -YEAR=${2} +CASE=${1} ; export CASE +ARCHIVE_ROOT=${2} +YEAR=${3} echo "Reshaping year ${YEAR} for ${CASE}..." # cd /glade/p/cesm/postprocessing_dav/cesm-env2/bin @@ -39,7 +40,7 @@ NCKS=`which ncks` ; export NCKS PROCHOST=`hostname`;export PROCHOST # BASEDIR=/glade/u/home/strandwg/CCP_Processing_Suite -LOCALDSK=/glade/scratch/mlevy/archive/${CASE} ; export LOCALDSK +LOCALDSK=${ARCHIVE_ROOT}/${CASE} ; export LOCALDSK PROCBASE=/glade/scratch/$USER/T13/${CASE} ; export PROCBASE # HTYP=`echo $HIST | cut -d'.' 
-f1` ; export HTYP diff --git a/data_reshaping/run_all.py b/data_reshaping/run_all.py index b3f4098..69fe801 100755 --- a/data_reshaping/run_all.py +++ b/data_reshaping/run_all.py @@ -25,16 +25,31 @@ def _parse_args(): ) # Optional: which case to convert + # if this tool is made public, drop the default and require case as well parser.add_argument( "-c", "--case", action="store", dest="case", type=str, - default="004", + default="g.e22.G1850ECO_JRA_HR.TL319_t13.004", help="Suffix of case to convert to time series", ) + # Optional: location of DOUT_S_ROOT + archive_default = os.path.join( + os.sep, "glade", "scratch", os.environ["USER"], "archive" + ) + parser.add_argument( + "-a", + "--archive-root", + action="store", + dest="archive_root", + type=str, + default=archive_default, + help="base of DOUT_S_ROOT", + ) + # Optional: specify which scripts to run parser.add_argument( "-s", @@ -78,6 +93,7 @@ def _parse_args(): if __name__ == "__main__": args = _parse_args() case = args.case + archive_root = args.archive_root mail_opt = ( f"--mail-type=ALL --mail-user={os.environ['USER']}@ucar.edu" if args.send_mail @@ -88,7 +104,7 @@ def _parse_args(): year = f"{yr:04}" for script in args.scripts: print(f"Submitting {script} for year {year} of {case}...") - cmd = f"sbatch {mail_opt} --dependency=singleton {script} {case} {year}" + cmd = f"sbatch {mail_opt} --dependency=singleton {script} {case} {archive_root} {year}" if not args.dryrun: # note: the --dependency=singleton option means only one job per job name # Some jobs had been crashing, and I think it was due to temporary From 6341c3b3ca45457c1db45443992f33054b810926 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Tue, 20 Oct 2020 13:44:21 -0600 Subject: [PATCH 09/14] Refactor compare_ts_and_hist notebooks Created utils/compare_ts_and_hist.py which will eventually be a command line tool but also provides compare_ts_and_hist() via import utils. 
--- notebooks/compare_ts_and_hist_003.ipynb | 70 +++++++++------------- notebooks/compare_ts_and_hist_004.ipynb | 78 +++++++++++-------------- notebooks/utils/CaseClass.py | 7 +-- notebooks/utils/__init__.py | 1 + notebooks/utils/compare_ts_and_hist.py | 67 +++++++++++++++++++++ 5 files changed, 133 insertions(+), 90 deletions(-) create mode 100755 notebooks/utils/compare_ts_and_hist.py diff --git a/notebooks/compare_ts_and_hist_003.ipynb b/notebooks/compare_ts_and_hist_003.ipynb index d190697..7a5001d 100644 --- a/notebooks/compare_ts_and_hist_003.ipynb +++ b/notebooks/compare_ts_and_hist_003.ipynb @@ -49,72 +49,60 @@ "Finished 0004\n", "----\n", "Starting year 0005...\n", - "Year 0005 is not available, are you sure it has been run?\n" + "Year 0005 time series is not available\n" ] } ], "source": [ "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n", - "\n", - "case_hist = utils.CaseClass(casename)\n", - "case_ts = utils.CaseClass(casename)\n", - "\n", "stream = \"pop.h\"\n", - "case_hist._timeseries_filenames[stream] = []\n", "\n", "for year in range(1, 62):\n", " print(f\"Starting year {year:04}...\")\n", + " ts_found = True\n", + " hist_found = True\n", " all_same = True\n", - " found_ts_and_hist = True\n", - " year_found = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", + " isel_kwargs = diag_metadata.get(\"isel_dict\")\n", + " comp_test = utils.compare_ts_and_hist(\n", + " casename, varname, stream, year, isel_kwargs or {}\n", + " )\n", + " # Error checking (TODO: replace string compare)\n", "\n", - " # (1) generate datasets\n", - " try:\n", - " ds_hist = case_hist.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " ds_ts = case_ts.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " except ValueError:\n", - " year_found = False\n", + " # Missing time series data\n", + " if comp_test == \"Can not generate time 
series dataset\":\n", + " print(f\"Year {year:04} time series is not available\")\n", + " ts_found = False\n", + " break\n", + " if comp_test == \"case_ts does not provide time series files\":\n", + " print(f\"case_ts data for {varname} is not from time series files\")\n", + " ts_found = False\n", " break\n", "\n", - " # (2) Check sources of data being read\n", - " data_source = case_hist.get_dataset_source(stream, year, varname)\n", - " if data_source != \"hist\":\n", - " print(\n", - " f\"case_hist data for {varname} comes from '{data_source}', not history files\"\n", - " )\n", - " found_ts_and_hist = False\n", + " # Missing history file data\n", + " if comp_test == \"History files unavailable\":\n", + " hist_found = False\n", " break\n", - " data_source = case_ts.get_dataset_source(stream, year, varname)\n", - " if data_source != \"time series\":\n", - " print(\n", - " f\"case_ts data for {varname} comes from '{data_source}', not time series files\"\n", - " )\n", - " found_ts_and_hist = False\n", + " if comp_test == \"case_hist does not provide history files\":\n", + " print(f\"case_hist data for {varname} is not from history files\")\n", + " hist_found = False\n", " break\n", "\n", - " # (3) Compare datasets\n", - " da_hist = ds_hist[varname].isel(diag_metadata.get(\"isel_dict\"))\n", - " da_ts = ds_ts[varname].isel(diag_metadata.get(\"isel_dict\"))\n", - " if not da_hist.identical(da_ts):\n", + " # Datasets differ\n", + " if comp_test == \"datasets differ\":\n", " print(f\"{varname} is different in year {year:04}\")\n", " all_same = False\n", "\n", " # Error checking after running through all variables for a given year\n", - " # (1) was data for the year available?\n", - " if not year_found:\n", - " print(f\"Year {year:04} is not available, are you sure it has been run?\")\n", + " # (1) If time series data is not available, we are done testing\n", + " if not ts_found:\n", " break\n", "\n", - " # (2) was data for the year available via both time series and 
history files?\n", - " if not found_ts_and_hist:\n", + " # (2) If history files are not available, then we have scrubbed those files\n", + " if not hist_found:\n", " print(\n", - " f\"Either time series or history files for variables in {year:04} are missing, skipping comparison\"\n", + " f\"History files for variables in {year:04} are missing, skipping comparison\"\n", " )\n", " print(\"----\")\n", " continue\n", diff --git a/notebooks/compare_ts_and_hist_004.ipynb b/notebooks/compare_ts_and_hist_004.ipynb index ba31a5a..222d290 100644 --- a/notebooks/compare_ts_and_hist_004.ipynb +++ b/notebooks/compare_ts_and_hist_004.ipynb @@ -65,80 +65,68 @@ "Finished 0008\n", "----\n", "Starting year 0009...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0009 are missing, skipping comparison\n", + "No differences found in year 0009\n", + "Finished 0009\n", "----\n", "Starting year 0010...\n", - "case_ts data for POC_FLUX_100m comes from 'hist', not time series files\n", - "Either time series or history files for variables in 0010 are missing, skipping comparison\n", + "No differences found in year 0010\n", + "Finished 0010\n", "----\n", "Starting year 0011...\n", - "Year 0011 is not available, are you sure it has been run?\n" + "case_ts data for POC_FLUX_100m is not from time series files\n" ] } ], "source": [ "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n", - "\n", - "case_hist = utils.CaseClass(casename)\n", - "case_ts = utils.CaseClass(casename)\n", - "\n", "stream = \"pop.h\"\n", - "case_hist._timeseries_filenames[stream] = []\n", "\n", "for year in range(1, 62):\n", " print(f\"Starting year {year:04}...\")\n", + " ts_found = True\n", + " hist_found = True\n", " all_same = True\n", - " found_ts_and_hist = True\n", - " year_found = True\n", " for diag_metadata in diag_metadata_list:\n", " varname = diag_metadata[\"varname\"]\n", + " isel_kwargs = 
diag_metadata.get(\"isel_dict\")\n", + " comp_test = utils.compare_ts_and_hist(\n", + " casename, varname, stream, year, isel_kwargs or {}\n", + " )\n", + " # Error checking (TODO: replace string compare)\n", "\n", - " # (1) generate datasets\n", - " try:\n", - " ds_hist = case_hist.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " ds_ts = case_ts.gen_dataset(\n", - " varname, stream, start_year=year, end_year=year, quiet=True\n", - " )\n", - " except ValueError:\n", - " year_found = False\n", + " # Missing time series data\n", + " if comp_test == \"Can not generate time series dataset\":\n", + " print(f\"Year {year:04} time series is not available\")\n", + " ts_found = False\n", + " break\n", + " if comp_test == \"case_ts does not provide time series files\":\n", + " print(f\"case_ts data for {varname} is not from time series files\")\n", + " ts_found = False\n", " break\n", "\n", - " # (2) Check sources of data being read\n", - " data_source = case_hist.get_dataset_source(stream, year, varname)\n", - " if data_source != \"hist\":\n", - " print(\n", - " f\"case_hist data for {varname} comes from '{data_source}', not history files\"\n", - " )\n", - " found_ts_and_hist = False\n", + " # Missing history file data\n", + " if comp_test == \"History files unavailable\":\n", + " hist_found = False\n", " break\n", - " data_source = case_ts.get_dataset_source(stream, year, varname)\n", - " if data_source != \"time series\":\n", - " print(\n", - " f\"case_ts data for {varname} comes from '{data_source}', not time series files\"\n", - " )\n", - " found_ts_and_hist = False\n", + " if comp_test == \"case_hist does not provide history files\":\n", + " print(f\"case_hist data for {varname} is not from history files\")\n", + " hist_found = False\n", " break\n", "\n", - " # (3) Compare datasets\n", - " da_hist = ds_hist[varname].isel(diag_metadata.get(\"isel_dict\"))\n", - " da_ts = 
ds_ts[varname].isel(diag_metadata.get(\"isel_dict\"))\n", - " if not da_hist.identical(da_ts):\n", + " # Datasets differ\n", + " if comp_test == \"datasets differ\":\n", " print(f\"{varname} is different in year {year:04}\")\n", " all_same = False\n", "\n", " # Error checking after running through all variables for a given year\n", - " # (1) was data for the year available?\n", - " if not year_found:\n", - " print(f\"Year {year:04} is not available, are you sure it has been run?\")\n", + " # (1) If time series data is not available, we are done testing\n", + " if not ts_found:\n", " break\n", "\n", - " # (2) was data for the year available via both time series and history files?\n", - " if not found_ts_and_hist:\n", + " # (2) If history files are not available, then we have scrubbed those files\n", + " if not hist_found:\n", " print(\n", - " f\"Either time series or history files for variables in {year:04} are missing, skipping comparison\"\n", + " f\"History files for variables in {year:04} are missing, skipping comparison\"\n", " )\n", " print(\"----\")\n", " continue\n", diff --git a/notebooks/utils/CaseClass.py b/notebooks/utils/CaseClass.py index 881de4c..b090c07 100644 --- a/notebooks/utils/CaseClass.py +++ b/notebooks/utils/CaseClass.py @@ -211,26 +211,25 @@ def get_catalog(self): ############################################################################ def get_dataset_source(self, stream, year, varname): - data_not_found = "no data" # Does _dataset_src[stream] exist? if stream not in self._dataset_src: print(f"No datasets have been returned from {stream}") - return data_not_found + return None # Does _dataset_src[stream][year] exist? if year not in self._dataset_src[stream]: print( f"No datasets covering year {year:04} have been returned from {stream}" ) - return data_not_found + return None # Does _dataset_src[stream][year][varname] exist? 
if varname not in self._dataset_src[stream][year]: print( f"No dataset containing {varname} from year {year:04} have been returned from {stream}" ) - return data_not_found + return None return self._dataset_src[stream][year][varname] diff --git a/notebooks/utils/__init__.py b/notebooks/utils/__init__.py index f568019..c92d39c 100644 --- a/notebooks/utils/__init__.py +++ b/notebooks/utils/__init__.py @@ -8,3 +8,4 @@ from .Plotting import summary_plot_maps from .Plotting import trend_plot from .utils import get_varnames_from_metadata_list +from .compare_ts_and_hist import compare_ts_and_hist diff --git a/notebooks/utils/compare_ts_and_hist.py b/notebooks/utils/compare_ts_and_hist.py new file mode 100755 index 0000000..958b2f8 --- /dev/null +++ b/notebooks/utils/compare_ts_and_hist.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +""" +A script to verify that converting from history files to time series worked as expected +""" + +from .CaseClass import CaseClass + + +def compare_ts_and_hist(casename, varname, stream, year, isel_kwargs): + """ + Generate two CaseClass objects from the same casename, one + from time series output and the other from history files. + + Compare the datasets generated by these two objects; they + should be identical. Possible error states: + + 1. can not find time series {stream} from {year} for {varname} + 2. can not find history files {stream} from {year} for {varname} + 3. 
datasets differ + """ + case_ts = CaseClass(casename) + case_hist = CaseClass(casename) + case_hist._timeseries_filenames[stream] = [] + + # (1) generate datasets + # (a) time series + try: + ds_ts = case_ts.gen_dataset( + varname, stream, start_year=year, end_year=year, quiet=True + ) + except ValueError: + return "Can not generate time series dataset" + + # (b) history files + try: + ds_hist = case_hist.gen_dataset( + varname, stream, start_year=year, end_year=year, quiet=True + ) + except ValueError: + return "History files unavailable" + + # (2) Check sources of data being read + # (a) case_ts should return data from time series + data_source = case_ts.get_dataset_source(stream, year, varname) + if data_source != "time series": + return "case_ts does not provide time series files" + + # (b) case_ts should return data from history files + # note: this condition should never be met, as it will be + # caught in (1b) + data_source = case_hist.get_dataset_source(stream, year, varname) + if data_source != "hist": + return "case_hist does not provide history files" + + # (3) Compare datasets + da_hist = ds_hist[varname].isel(**isel_kwargs) + da_ts = ds_ts[varname].isel(**isel_kwargs) + if not da_hist.identical(da_ts): + return "datasets differ" + + return "same" + + +######################## + +if __name__ == "__main__": + print("Feature not implemented yet") From d604c923b73d985ccdfe93350e8b91ce9235999c Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Thu, 22 Oct 2020 12:04:44 -0600 Subject: [PATCH 10/14] Several updates for comparing ts and history 1. CaseClass has two new public methods: get_timeseries_files() and get_history_files(); both return lists of files for a given year and stream. For time series, users can also specify a list of varnames to further pare down the resulting list of files. 2. gen_dataset() now relies on the two functions mentioned in (1) to determine what files to open 3. 
Massive overhaul to compare_ts_and_hist: * Use open_mfdataset and case.get_history_files() to open ds_hist for a given stream and year; then loop through variables and check that get_timeseries_files() does not return an empty list * No longer run da.identical(); for now, we are only concerned with verifying that all variables from history files made it into time series * This puts "reinstate da.identical()" on the to-do list; even with dask I was running into memory issues comparing monthly 3D fields * Refactored so there is utils/compare_ts_and_hist.py that will eventually be a command-line tool for comparing a given stream and year but is currently imported via utils. Also wrote utils.utils.timeseries_and_history_comparison() which is just a wrapper that accounts for things like missing cice.h1 time series from year 1. I think compare_ts_and_hist.py should live with CaseClass when we refactor this package, while timeseries_and_history_comparison() is specific to the high-res analysis 4. 
Add ability to get cice.h and cice.h1 streams for both history and time series so (3) compares all five streams rather than just looking at a few specific variables in pop.h --- notebooks/compare_ts_and_hist_003.ipynb | 120 +++++----------- notebooks/compare_ts_and_hist_004.ipynb | 180 ++++++++++-------------- notebooks/utils/CaseClass.py | 105 +++++++++++--- notebooks/utils/__init__.py | 1 + notebooks/utils/compare_ts_and_hist.py | 87 ++++++------ notebooks/utils/config.py | 32 +++++ notebooks/utils/utils.py | 43 ++++++ 7 files changed, 319 insertions(+), 249 deletions(-) diff --git a/notebooks/compare_ts_and_hist_003.ipynb b/notebooks/compare_ts_and_hist_003.ipynb index 7a5001d..28b24bd 100644 --- a/notebooks/compare_ts_and_hist_003.ipynb +++ b/notebooks/compare_ts_and_hist_003.ipynb @@ -6,8 +6,6 @@ "metadata": {}, "outputs": [], "source": [ - "import yaml\n", - "\n", "import utils" ] }, @@ -15,103 +13,55 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n", - " diag_metadata_list = yaml.safe_load(fptr)\n", - "\n", - "varnames = utils.get_varnames_from_metadata_list(diag_metadata_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Starting year 0001...\n", - "No differences found in year 0001\n", - "Finished 0001\n", + "Checking year 0001...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0001\n", "----\n", - "Starting year 0002...\n", - "No differences found in year 0002\n", - "Finished 0002\n", + "Checking year 0002...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... 
checking stream cice.h ...\n", + "All variables available in time series for year 0002\n", "----\n", - "Starting year 0003...\n", - "No differences found in year 0003\n", - "Finished 0003\n", + "Checking year 0003...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0003\n", "----\n", - "Starting year 0004...\n", - "No differences found in year 0004\n", - "Finished 0004\n", + "Checking year 0004...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0004\n", "----\n", - "Starting year 0005...\n", - "Year 0005 time series is not available\n" + "Checking year 0005...\n", + "... checking stream pop.h.nyear1 ...\n", + "Could not find time series for year 0005\n", + "CPU times: user 57.1 s, sys: 2.92 s, total: 1min\n", + "Wall time: 2min 5s\n" ] } ], "source": [ - "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n", - "stream = \"pop.h\"\n", - "\n", - "for year in range(1, 62):\n", - " print(f\"Starting year {year:04}...\")\n", - " ts_found = True\n", - " hist_found = True\n", - " all_same = True\n", - " for diag_metadata in diag_metadata_list:\n", - " varname = diag_metadata[\"varname\"]\n", - " isel_kwargs = diag_metadata.get(\"isel_dict\")\n", - " comp_test = utils.compare_ts_and_hist(\n", - " casename, varname, stream, year, isel_kwargs or {}\n", - " )\n", - " # Error checking (TODO: replace string compare)\n", - "\n", - " # Missing time series data\n", - " if comp_test == \"Can not generate time series dataset\":\n", - " print(f\"Year {year:04} time series is not available\")\n", - " ts_found = False\n", - " break\n", - " if comp_test == \"case_ts does not 
provide time series files\":\n", - " print(f\"case_ts data for {varname} is not from time series files\")\n", - " ts_found = False\n", - " break\n", - "\n", - " # Missing history file data\n", - " if comp_test == \"History files unavailable\":\n", - " hist_found = False\n", - " break\n", - " if comp_test == \"case_hist does not provide history files\":\n", - " print(f\"case_hist data for {varname} is not from history files\")\n", - " hist_found = False\n", - " break\n", + "%%time\n", "\n", - " # Datasets differ\n", - " if comp_test == \"datasets differ\":\n", - " print(f\"{varname} is different in year {year:04}\")\n", - " all_same = False\n", - "\n", - " # Error checking after running through all variables for a given year\n", - " # (1) If time series data is not available, we are done testing\n", - " if not ts_found:\n", - " break\n", - "\n", - " # (2) If history files are not available, then we have scrubbed those files\n", - " if not hist_found:\n", - " print(\n", - " f\"History files for variables in {year:04} are missing, skipping comparison\"\n", - " )\n", - " print(\"----\")\n", - " continue\n", - "\n", - " # (3) was the data in the time series files identical to that in the history files?\n", - " if all_same:\n", - " print(f\"No differences found in year {year:04}\")\n", - " print(f\"Finished {year:04}\")\n", - " print(\"----\")" + "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n", + "utils.timeseries_and_history_comparison(casename)" ] } ], diff --git a/notebooks/compare_ts_and_hist_004.ipynb b/notebooks/compare_ts_and_hist_004.ipynb index 222d290..2db2b84 100644 --- a/notebooks/compare_ts_and_hist_004.ipynb +++ b/notebooks/compare_ts_and_hist_004.ipynb @@ -6,8 +6,6 @@ "metadata": {}, "outputs": [], "source": [ - "import yaml\n", - "\n", "import utils" ] }, @@ -15,127 +13,103 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n", - " diag_metadata_list 
= yaml.safe_load(fptr)\n", - "\n", - "varnames = utils.get_varnames_from_metadata_list(diag_metadata_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Starting year 0001...\n", - "No differences found in year 0001\n", - "Finished 0001\n", + "Checking year 0001...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0001\n", "----\n", - "Starting year 0002...\n", - "No differences found in year 0002\n", - "Finished 0002\n", + "Checking year 0002...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0002\n", "----\n", - "Starting year 0003...\n", - "No differences found in year 0003\n", - "Finished 0003\n", + "Checking year 0003...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0003\n", "----\n", - "Starting year 0004...\n", - "No differences found in year 0004\n", - "Finished 0004\n", + "Checking year 0004...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0004\n", "----\n", - "Starting year 0005...\n", - "No differences found in year 0005\n", - "Finished 0005\n", + "Checking year 0005...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... 
checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0005\n", "----\n", - "Starting year 0006...\n", - "No differences found in year 0006\n", - "Finished 0006\n", + "Checking year 0006...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0006\n", "----\n", - "Starting year 0007...\n", - "No differences found in year 0007\n", - "Finished 0007\n", + "Checking year 0007...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0007\n", "----\n", - "Starting year 0008...\n", - "No differences found in year 0008\n", - "Finished 0008\n", + "Checking year 0008...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0008\n", "----\n", - "Starting year 0009...\n", - "No differences found in year 0009\n", - "Finished 0009\n", + "Checking year 0009...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0009\n", "----\n", - "Starting year 0010...\n", - "No differences found in year 0010\n", - "Finished 0010\n", + "Checking year 0010...\n", + "... checking stream pop.h.nyear1 ...\n", + "... checking stream pop.h.nday1 ...\n", + "... checking stream pop.h ...\n", + "... 
checking stream cice.h1 ...\n", + "... checking stream cice.h ...\n", + "All variables available in time series for year 0010\n", "----\n", - "Starting year 0011...\n", - "case_ts data for POC_FLUX_100m is not from time series files\n" + "Checking year 0011...\n", + "... checking stream pop.h.nyear1 ...\n", + "Could not find time series for year 0011\n", + "CPU times: user 2min 25s, sys: 11.6 s, total: 2min 36s\n", + "Wall time: 6min 6s\n" ] } ], "source": [ - "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n", - "stream = \"pop.h\"\n", - "\n", - "for year in range(1, 62):\n", - " print(f\"Starting year {year:04}...\")\n", - " ts_found = True\n", - " hist_found = True\n", - " all_same = True\n", - " for diag_metadata in diag_metadata_list:\n", - " varname = diag_metadata[\"varname\"]\n", - " isel_kwargs = diag_metadata.get(\"isel_dict\")\n", - " comp_test = utils.compare_ts_and_hist(\n", - " casename, varname, stream, year, isel_kwargs or {}\n", - " )\n", - " # Error checking (TODO: replace string compare)\n", - "\n", - " # Missing time series data\n", - " if comp_test == \"Can not generate time series dataset\":\n", - " print(f\"Year {year:04} time series is not available\")\n", - " ts_found = False\n", - " break\n", - " if comp_test == \"case_ts does not provide time series files\":\n", - " print(f\"case_ts data for {varname} is not from time series files\")\n", - " ts_found = False\n", - " break\n", - "\n", - " # Missing history file data\n", - " if comp_test == \"History files unavailable\":\n", - " hist_found = False\n", - " break\n", - " if comp_test == \"case_hist does not provide history files\":\n", - " print(f\"case_hist data for {varname} is not from history files\")\n", - " hist_found = False\n", - " break\n", + "%%time\n", "\n", - " # Datasets differ\n", - " if comp_test == \"datasets differ\":\n", - " print(f\"{varname} is different in year {year:04}\")\n", - " all_same = False\n", - "\n", - " # Error checking after running through all variables 
for a given year\n", - " # (1) If time series data is not available, we are done testing\n", - " if not ts_found:\n", - " break\n", - "\n", - " # (2) If history files are not available, then we have scrubbed those files\n", - " if not hist_found:\n", - " print(\n", - " f\"History files for variables in {year:04} are missing, skipping comparison\"\n", - " )\n", - " print(\"----\")\n", - " continue\n", - "\n", - " # (3) was the data in the time series files identical to that in the history files?\n", - " if all_same:\n", - " print(f\"No differences found in year {year:04}\")\n", - " print(f\"Finished {year:04}\")\n", - " print(\"----\")" + "casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n", + "utils.timeseries_and_history_comparison(casename)" ] } ], diff --git a/notebooks/utils/CaseClass.py b/notebooks/utils/CaseClass.py index b090c07..801b9bb 100644 --- a/notebooks/utils/CaseClass.py +++ b/notebooks/utils/CaseClass.py @@ -14,7 +14,9 @@ add_first_date_and_reformat, get_archive_log_dir, get_campaign_popseries_dir, + get_campaign_ciceseries_dir, get_archive_pophist_dir, + get_archive_cicehist_dir, get_rundir, ) from .utils import time_set_mid @@ -66,6 +68,48 @@ def get_co2calc_warning_cnt(self, max_it=4): ############################################################################ + def _get_single_year_timeseries_files(self, year, stream, varname=None): + var_check = True + timeseries_filenames = [] + for filename in self._timeseries_filenames[stream]: + if varname is not None: + var_check = f".{varname}." 
in filename + if var_check and f".{year:04}" in filename: + timeseries_filenames.extend([filename]) + return timeseries_filenames + + ############################################################################ + + def get_timeseries_files(self, year, stream, varnames=None): + if type(varnames) == str: + varnames = [varnames] + if not (type(varnames) == list or varnames is None): + raise ValueError( + f"varnames = {varnames} which is not None, a string, or a list" + ) + + timeseries_filenames = [] + if varnames: + for varname in varnames: + timeseries_filenames.extend( + self._get_single_year_timeseries_files(year, stream, varname) + ) + else: + timeseries_filenames = self._get_single_year_timeseries_files(year, stream) + + return timeseries_filenames + + ############################################################################ + + def get_history_files(self, year, stream): + return [ + filename + for filename in self._history_filenames[stream] + if f".{year:04}" in filename + ] + + ############################################################################ + def _find_log_files(self): """ Look in rundir and archive for cesm.log, ocn.log, and cpl.log files @@ -91,6 +135,8 @@ def _find_timeseries_files(self): subdirs["pop.h"] = "month_1" subdirs["pop.h.nday1"] = "day_1" subdirs["pop.h.nyear1"] = "year_1" + subdirs["cice.h"] = "month_1" + subdirs["cice.h1"] = "day_1" for stream in ["pop.h", "pop.h.nday1", "pop.h.nyear1"]: files[stream] = [] for casename in self._casenames: @@ -104,6 +150,19 @@ def _find_timeseries_files(self): ) ) files[stream].sort() + for stream in ["cice.h", "cice.h1"]: + files[stream] = [] + for casename in self._casenames: + files[stream].extend( + glob.glob( + os.path.join( + get_campaign_ciceseries_dir(casename), + subdirs[stream], + f"{casename}.{stream}.*.nc", + ) + ) + ) + files[stream].sort() return files ############################################################################ @@ -113,7 +172,7 @@ def _find_hist_files(self): Look in 
rundir and archive for pop history files """ files = dict() - for stream in ["pop.h", "pop.h.nday1"]: + for stream in ["pop.h", "pop.h.nday1", "pop.h.nyear1"]: files[stream] = [] for rootdir in [get_archive_pophist_dir, get_rundir]: for casename in self._casenames: @@ -125,6 +184,18 @@ def _find_hist_files(self): ) ) files[stream].sort() + for stream in ["cice.h", "cice.h1"]: + files[stream] = [] + for rootdir in [get_archive_cicehist_dir, get_rundir]: + for casename in self._casenames: + files[stream].extend( + glob.glob( + os.path.join( + rootdir(casename), f"{casename}.{stream}.0*.nc" + ) + ) + ) + files[stream].sort() return files ############################################################################ @@ -255,7 +326,7 @@ def gen_dataset( if type(varnames) == str: varnames = [varnames] if type(varnames) != list: - raise ValueError(f"{casenames} is not a string or list") + raise ValueError(f"{varnames} is not a string or list") if stream not in self._dataset_files: self._dataset_files[stream] = dict() @@ -292,20 +363,21 @@ def gen_dataset( if year not in self._dataset_files[stream]: self._dataset_files[stream][year] = dict() self._dataset_src[stream][year] = dict() - self._dataset_files[stream][year][varname] = [ - filename - for filename in self._timeseries_filenames[stream] - if f".{varname}." 
in filename and f".{year:04}" in filename - ] - self._dataset_src[stream][year][varname] = "time series" - timeseries_filenames.extend(self._dataset_files[stream][year][varname]) - + self._dataset_files[stream][year][varname] = self.get_timeseries_files( + year, stream, varnames + ) + if self._dataset_files[stream][year][varname]: + self._dataset_src[stream][year][varname] = "time series" + timeseries_filenames.extend( + self._dataset_files[stream][year][varname] + ) if timeseries_filenames: ds_timeseries_per_var.append( xr.open_mfdataset(timeseries_filenames, **open_mfdataset_kwargs,)[ [varname] + _vars_to_keep ] ) + if ds_timeseries_per_var: ds_timeseries = xr.merge(ds_timeseries_per_var) tb_name_ts = ds_timeseries["time"].attrs["bounds"] @@ -330,13 +402,12 @@ def gen_dataset( if year not in self._dataset_files[stream]: self._dataset_files[stream][year] = dict() self._dataset_src[stream][year] = dict() - self._dataset_files[stream][year][varname] = [ - filename - for filename in self._history_filenames[stream] - if f".{year:04}" in filename - ] - history_filenames.extend(self._dataset_files[stream][year][varname]) - self._dataset_src[stream][year][varname] = "hist" + self._dataset_files[stream][year][varname] = self.get_history_files( + year, stream + ) + if self._dataset_files[stream][year][varname]: + self._dataset_src[stream][year][varname] = "hist" + history_filenames.extend(self._dataset_files[stream][year][varname]) if history_filenames: ds_history = xr.open_mfdataset(history_filenames, **open_mfdataset_kwargs,)[ diff --git a/notebooks/utils/__init__.py b/notebooks/utils/__init__.py index c92d39c..1edab76 100644 --- a/notebooks/utils/__init__.py +++ b/notebooks/utils/__init__.py @@ -8,4 +8,5 @@ from .Plotting import summary_plot_maps from .Plotting import trend_plot from .utils import get_varnames_from_metadata_list +from .utils import timeseries_and_history_comparison from .compare_ts_and_hist import compare_ts_and_hist diff --git 
a/notebooks/utils/compare_ts_and_hist.py b/notebooks/utils/compare_ts_and_hist.py index 958b2f8..60edba6 100755 --- a/notebooks/utils/compare_ts_and_hist.py +++ b/notebooks/utils/compare_ts_and_hist.py @@ -3,61 +3,60 @@ A script to verify that converting from history files to time series worked as expected """ +import xarray as xr + from .CaseClass import CaseClass -def compare_ts_and_hist(casename, varname, stream, year, isel_kwargs): +def compare_ts_and_hist( + casename, stream, year, exclude_vars=["time_bound", "time_bounds"] +): """ - Generate two CaseClass objects from the same casename, one - from time series output and the other from history files. - - Compare the datasets generated by these two objects; they - should be identical. Possible error states: - - 1. can not find time series {stream} from {year} for {varname} - 2. can not find history files {stream} from {year} for {varname} - 3. datasets differ + Generate a CaseClass object from a given casename. For a given stream + and year, open the history files from the case. Then loop through the + variables (excluding time_bound in POP and time_bounds in CICE) and + verify that those fields are available in time series. 
""" - case_ts = CaseClass(casename) - case_hist = CaseClass(casename) - case_hist._timeseries_filenames[stream] = [] + # Set some defaults to pass to open_mfdataset, then apply kwargs argument + open_mfdataset_kwargs = dict() + # data_vars="minimal", to avoid introducing time dimension to time-invariant fields + open_mfdataset_kwargs["data_vars"] = "minimal" + # compat="override", to skip var consistency checks (for speed) + open_mfdataset_kwargs["compat"] = "override" + # coords="minimal", because coords cannot be default="different" if compat="override" + open_mfdataset_kwargs["coords"] = "minimal" + # parallel=True to open files in parallel + open_mfdataset_kwargs["parallel"] = True - # (1) generate datasets - # (a) time series - try: - ds_ts = case_ts.gen_dataset( - varname, stream, start_year=year, end_year=year, quiet=True - ) - except ValueError: - return "Can not generate time series dataset" + found_all = True - # (b) history files - try: - ds_hist = case_hist.gen_dataset( - varname, stream, start_year=year, end_year=year, quiet=True - ) - except ValueError: - return "History files unavailable" + case = CaseClass(casename) + # Return if no time series is available + if len(case.get_timeseries_files(year, stream)) == 0: + return "no time series" - # (2) Check sources of data being read - # (a) case_ts should return data from time series - data_source = case_ts.get_dataset_source(stream, year, varname) - if data_source != "time series": - return "case_ts does not provide time series files" + # Return if no history files are available + history_filenames = case.get_history_files(year, stream) + if len(history_filenames) == 0: + return "no history" - # (b) case_ts should return data from history files - # note: this condition should never be met, as it will be - # caught in (1b) - data_source = case_hist.get_dataset_source(stream, year, varname) - if data_source != "hist": - return "case_hist does not provide history files" + # Open history files to build 
dataset + ds_hist = xr.open_mfdataset(history_filenames, **open_mfdataset_kwargs) + vars_to_check = [ + var + for var in ds_hist.data_vars + if "time" in ds_hist[var].coords and not var in exclude_vars + ] - # (3) Compare datasets - da_hist = ds_hist[varname].isel(**isel_kwargs) - da_ts = ds_ts[varname].isel(**isel_kwargs) - if not da_hist.identical(da_ts): - return "datasets differ" + # Look for each variable in time series + for var in vars_to_check: + if len(case.get_timeseries_files(year, stream, var)) == 0: + print(f"No time series files for {var} in year {year:04}") + found_all = False + # Return "same" if all variables were found, otherwise return "datasets differ" + if not found_all: + return "datasets differ" return "same" diff --git a/notebooks/utils/config.py b/notebooks/utils/config.py index 88996d2..d56fbb3 100644 --- a/notebooks/utils/config.py +++ b/notebooks/utils/config.py @@ -37,6 +37,29 @@ def get_campaign_popseries_dir(casename): ################################################################################ +def get_campaign_ciceseries_dir(casename): + freq_name = dict() + return os.path.join( + os.sep, + "glade", + "campaign", + "cesm", + "development", + "bgcwg", + "projects", + "hi-res_JRA", + "cases", + casename, + "output", + "ice", + "proc", + "tseries", + ) + + +################################################################################ + + def get_archive_pophist_dir(casename): return os.path.join( os.sep, "glade", "scratch", user, "archive", casename, "ocn", "hist" @@ -46,6 +69,15 @@ def get_archive_pophist_dir(casename): ################################################################################ +def get_archive_cicehist_dir(casename): + return os.path.join( + os.sep, "glade", "scratch", user, "archive", casename, "ice", "hist" + ) + + +################################################################################ + + def get_archive_log_dir(casename): return os.path.join(os.sep, "glade", "scratch", user, "archive", 
casename, "logs") diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py index dddfbaf..a877436 100644 --- a/notebooks/utils/utils.py +++ b/notebooks/utils/utils.py @@ -6,6 +6,9 @@ import numpy as np import xarray as xr +from . import CaseClass +from .compare_ts_and_hist import compare_ts_and_hist + def repl_coord(coordname, ds1, ds2): """ @@ -100,3 +103,43 @@ def get_varnames_from_metadata_list(diag_metadata_list): if diag_metadata["varname"] not in varnames: varnames.append(diag_metadata["varname"]) return varnames + + +def timeseries_and_history_comparison(casename): + case = CaseClass(casename) + for year in range(1, 62): + has_ts = True + found_all = True + print(f"Checking year {year:04}...") + for stream in ["pop.h.nyear1", "pop.h.nday1", "pop.h", "cice.h1", "cice.h"]: + has_hist = True + # There is no cice.h1 time series for 0001 so skip check + if stream == "cice.h1" and year == 1: + continue + # Run test + print(f"... checking stream {stream} ...") + comp_test = compare_ts_and_hist(casename, stream, year) + # Check ends when there are no history files for comparison + if comp_test == "no time series": + has_ts = False + break + + # Skip years when there are no history files + # (Assume those years were already checked prior to deleting history files) + if comp_test == "no history": + print( + f"Skipping stream {stream} for year {year:04} because there are no history files" + ) + has_hist = False + continue + + found_all = found_all and (comp_test == "same") + + if not has_ts: + print(f"Could not find time series for year {year:04}") + break + if has_hist and found_all: + print(f"All variables available in time series for year {year:04}") + else: + print(f"Could not find time series for all variables in year {year:04}") + print("----") From e7ec903819c1d2db7702b724a5586b126f9d3b71 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Thu, 22 Oct 2020 12:39:20 -0600 Subject: [PATCH 11/14] Clean up circle dependencies And a few other bad / unnecessary 
imports --- notebooks/utils/__init__.py | 1 - notebooks/utils/compare_ts_and_hist.py | 5 ++--- notebooks/utils/utils.py | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/notebooks/utils/__init__.py b/notebooks/utils/__init__.py index 1edab76..ca47dd5 100644 --- a/notebooks/utils/__init__.py +++ b/notebooks/utils/__init__.py @@ -9,4 +9,3 @@ from .Plotting import trend_plot from .utils import get_varnames_from_metadata_list from .utils import timeseries_and_history_comparison -from .compare_ts_and_hist import compare_ts_and_hist diff --git a/notebooks/utils/compare_ts_and_hist.py b/notebooks/utils/compare_ts_and_hist.py index 60edba6..b376cc6 100755 --- a/notebooks/utils/compare_ts_and_hist.py +++ b/notebooks/utils/compare_ts_and_hist.py @@ -4,8 +4,7 @@ """ import xarray as xr - -from .CaseClass import CaseClass +from . import CaseClass def compare_ts_and_hist( @@ -30,7 +29,7 @@ def compare_ts_and_hist( found_all = True - case = CaseClass(casename) + case = CaseClass.CaseClass(casename) # Return if no time series is available if len(case.get_timeseries_files(year, stream)) == 0: return "no time series" diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py index a877436..8e65dfe 100644 --- a/notebooks/utils/utils.py +++ b/notebooks/utils/utils.py @@ -6,7 +6,6 @@ import numpy as np import xarray as xr -from . 
import CaseClass from .compare_ts_and_hist import compare_ts_and_hist @@ -106,7 +105,6 @@ def get_varnames_from_metadata_list(diag_metadata_list): def timeseries_and_history_comparison(casename): - case = CaseClass(casename) for year in range(1, 62): has_ts = True found_all = True From 12ef2c0144f35438cfc12d4aa780646d2df83c9a Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Thu, 22 Oct 2020 14:13:09 -0600 Subject: [PATCH 12/14] More reorganizing to pass tests Github Actions didn't like the "import utils" call even though it was fine in the notebooks, I think because utils.utils was trying to import compare_ts_and_hist.py; now that import is in the timeseries_and_history_comparison() function and hopefully everything will work again. --- notebooks/utils/compare_ts_and_hist.py | 2 +- notebooks/utils/utils.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/notebooks/utils/compare_ts_and_hist.py b/notebooks/utils/compare_ts_and_hist.py index b376cc6..044b375 100755 --- a/notebooks/utils/compare_ts_and_hist.py +++ b/notebooks/utils/compare_ts_and_hist.py @@ -29,7 +29,7 @@ def compare_ts_and_hist( found_all = True - case = CaseClass.CaseClass(casename) + case = CaseClass(casename) # Return if no time series is available if len(case.get_timeseries_files(year, stream)) == 0: return "no time series" diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py index 8e65dfe..ec576d1 100644 --- a/notebooks/utils/utils.py +++ b/notebooks/utils/utils.py @@ -6,8 +6,6 @@ import numpy as np import xarray as xr -from .compare_ts_and_hist import compare_ts_and_hist - def repl_coord(coordname, ds1, ds2): """ @@ -105,6 +103,10 @@ def get_varnames_from_metadata_list(diag_metadata_list): def timeseries_and_history_comparison(casename): + # import here to avoid "ValueError: attempted relative import beyond top-level package" + # when Github Action calls "import utils" (no issue in notebooks) + from .compare_ts_and_hist import compare_ts_and_hist + for year 
in range(1, 62): has_ts = True found_all = True From 28176f8f2d17ee1a3e25d8db1e9d8dd5073d39c4 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 30 Oct 2020 10:02:00 -0600 Subject: [PATCH 13/14] Update tests to handle import better Moved the import statement out of timeseries_and_history_comparison() and fixed sys.path in test_utils.py to ensure the import statement still works. --- notebooks/utils/utils.py | 6 ++---- tests/test_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py index ec576d1..8e65dfe 100644 --- a/notebooks/utils/utils.py +++ b/notebooks/utils/utils.py @@ -6,6 +6,8 @@ import numpy as np import xarray as xr +from .compare_ts_and_hist import compare_ts_and_hist + def repl_coord(coordname, ds1, ds2): """ @@ -103,10 +105,6 @@ def get_varnames_from_metadata_list(diag_metadata_list): def timeseries_and_history_comparison(casename): - # import here to avoid "ValueError: attempted relative import beyond top-level package" - # when Github Action calls "import utils" (no issue in notebooks) - from .compare_ts_and_hist import compare_ts_and_hist - for year in range(1, 62): has_ts = True found_all = True diff --git a/tests/test_utils.py b/tests/test_utils.py index ccaf396..341ae05 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,9 +7,9 @@ import numpy as np import xarray as xr -sys.path.append(os.path.abspath(os.path.join("notebooks", "utils"))) +sys.path.append(os.path.abspath(os.path.join("notebooks"))) sys.path.append(os.path.abspath("tests")) -from utils import time_year_plus_frac, time_set_mid, repl_coord, round_sig +from utils.utils import time_year_plus_frac, time_set_mid, repl_coord, round_sig from xr_ds_ex import gen_time_bounds_values, xr_ds_ex nyrs = 300 From b7ec1a07dc66b741913adff5bf9c1cb42c335609 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 30 Oct 2020 10:12:42 -0600 Subject: [PATCH 14/14] Bugfix in how CaseClass is used 
compare_ts_and_hist.py needs CaseClass.CaseClass, not just CaseClass --- notebooks/utils/compare_ts_and_hist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/utils/compare_ts_and_hist.py b/notebooks/utils/compare_ts_and_hist.py index 044b375..b376cc6 100755 --- a/notebooks/utils/compare_ts_and_hist.py +++ b/notebooks/utils/compare_ts_and_hist.py @@ -29,7 +29,7 @@ def compare_ts_and_hist( found_all = True - case = CaseClass(casename) + case = CaseClass.CaseClass(casename) # Return if no time series is available if len(case.get_timeseries_files(year, stream)) == 0: return "no time series"