Skip to content

Commit

Permalink
Several updates for comparing ts and history
Browse files Browse the repository at this point in the history
1. CaseClass has two new public methods: get_timeseries_files() and
   get_history_files(); both return lists of files for a given year and stream.
   For time series, users can also specify a list of varnames to further pare
   down the resulting list of files.
2. gen_dataset() now relies on the two functions mentioned in (1) to determine
   what files to open
3. Massive overhaul to compare_ts_and_hist:
   * Use open_mfdataset and case.get_history_files() to open ds_hist for a
     given stream and year; then loop through variables and check that
     get_timeseries_files() does not return an empty list
   * No longer run da.identical(); for now, we are only concerned with
     verifying that all variables from history files made it into time series
   * This makes "reinstate da.identical()" a to-do item; even with dask I was
     running into memory issues comparing monthly 3D fields
   * Refactored so there is utils/compare_ts_and_hist.py that will eventually
     be a command-line tool for comparing a given stream and year but is
     currently imported via utils. Also wrote
     utils.utils.timeseries_and_history_comparison() which is just a wrapper
     that accounts for things like missing cice.h1 time series from year 1. I
     think compare_ts_and_hist.py should live with CaseClass when we refactor
     this package, while timeseries_and_history_comparison() is specific to the
     high-res analysis
4. Add ability to get cice.h and cice.h1 streams for both history and time
   series so (3) compares all five streams rather than just looking at a few
   specific variables in pop.h
  • Loading branch information
mnlevy1981 committed Oct 22, 2020
1 parent 6341c3b commit d604c92
Show file tree
Hide file tree
Showing 7 changed files with 319 additions and 249 deletions.
120 changes: 35 additions & 85 deletions notebooks/compare_ts_and_hist_003.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,112 +6,62 @@
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"import utils"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n",
" diag_metadata_list = yaml.safe_load(fptr)\n",
"\n",
"varnames = utils.get_varnames_from_metadata_list(diag_metadata_list)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting year 0001...\n",
"No differences found in year 0001\n",
"Finished 0001\n",
"Checking year 0001...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0001\n",
"----\n",
"Starting year 0002...\n",
"No differences found in year 0002\n",
"Finished 0002\n",
"Checking year 0002...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0002\n",
"----\n",
"Starting year 0003...\n",
"No differences found in year 0003\n",
"Finished 0003\n",
"Checking year 0003...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0003\n",
"----\n",
"Starting year 0004...\n",
"No differences found in year 0004\n",
"Finished 0004\n",
"Checking year 0004...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0004\n",
"----\n",
"Starting year 0005...\n",
"Year 0005 time series is not available\n"
"Checking year 0005...\n",
"... checking stream pop.h.nyear1 ...\n",
"Could not find time series for year 0005\n",
"CPU times: user 57.1 s, sys: 2.92 s, total: 1min\n",
"Wall time: 2min 5s\n"
]
}
],
"source": [
"casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n",
"stream = \"pop.h\"\n",
"\n",
"for year in range(1, 62):\n",
" print(f\"Starting year {year:04}...\")\n",
" ts_found = True\n",
" hist_found = True\n",
" all_same = True\n",
" for diag_metadata in diag_metadata_list:\n",
" varname = diag_metadata[\"varname\"]\n",
" isel_kwargs = diag_metadata.get(\"isel_dict\")\n",
" comp_test = utils.compare_ts_and_hist(\n",
" casename, varname, stream, year, isel_kwargs or {}\n",
" )\n",
" # Error checking (TODO: replace string compare)\n",
"\n",
" # Missing time series data\n",
" if comp_test == \"Can not generate time series dataset\":\n",
" print(f\"Year {year:04} time series is not available\")\n",
" ts_found = False\n",
" break\n",
" if comp_test == \"case_ts does not provide time series files\":\n",
" print(f\"case_ts data for {varname} is not from time series files\")\n",
" ts_found = False\n",
" break\n",
"\n",
" # Missing history file data\n",
" if comp_test == \"History files unavailable\":\n",
" hist_found = False\n",
" break\n",
" if comp_test == \"case_hist does not provide history files\":\n",
" print(f\"case_hist data for {varname} is not from history files\")\n",
" hist_found = False\n",
" break\n",
"%%time\n",
"\n",
" # Datasets differ\n",
" if comp_test == \"datasets differ\":\n",
" print(f\"{varname} is different in year {year:04}\")\n",
" all_same = False\n",
"\n",
" # Error checking after running through all variables for a given year\n",
" # (1) If time series data is not available, we are done testing\n",
" if not ts_found:\n",
" break\n",
"\n",
" # (2) If history files are not available, then we have scrubbed those files\n",
" if not hist_found:\n",
" print(\n",
" f\"History files for variables in {year:04} are missing, skipping comparison\"\n",
" )\n",
" print(\"----\")\n",
" continue\n",
"\n",
" # (3) was the data in the time series files identical to that in the history files?\n",
" if all_same:\n",
" print(f\"No differences found in year {year:04}\")\n",
" print(f\"Finished {year:04}\")\n",
" print(\"----\")"
"casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.003\"\n",
"utils.timeseries_and_history_comparison(casename)"
]
}
],
Expand Down
180 changes: 77 additions & 103 deletions notebooks/compare_ts_and_hist_004.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,136 +6,110 @@
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"import utils"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open(\"diag_metadata.yaml\", mode=\"r\") as fptr:\n",
" diag_metadata_list = yaml.safe_load(fptr)\n",
"\n",
"varnames = utils.get_varnames_from_metadata_list(diag_metadata_list)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting year 0001...\n",
"No differences found in year 0001\n",
"Finished 0001\n",
"Checking year 0001...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0001\n",
"----\n",
"Starting year 0002...\n",
"No differences found in year 0002\n",
"Finished 0002\n",
"Checking year 0002...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0002\n",
"----\n",
"Starting year 0003...\n",
"No differences found in year 0003\n",
"Finished 0003\n",
"Checking year 0003...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0003\n",
"----\n",
"Starting year 0004...\n",
"No differences found in year 0004\n",
"Finished 0004\n",
"Checking year 0004...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0004\n",
"----\n",
"Starting year 0005...\n",
"No differences found in year 0005\n",
"Finished 0005\n",
"Checking year 0005...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0005\n",
"----\n",
"Starting year 0006...\n",
"No differences found in year 0006\n",
"Finished 0006\n",
"Checking year 0006...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0006\n",
"----\n",
"Starting year 0007...\n",
"No differences found in year 0007\n",
"Finished 0007\n",
"Checking year 0007...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0007\n",
"----\n",
"Starting year 0008...\n",
"No differences found in year 0008\n",
"Finished 0008\n",
"Checking year 0008...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0008\n",
"----\n",
"Starting year 0009...\n",
"No differences found in year 0009\n",
"Finished 0009\n",
"Checking year 0009...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0009\n",
"----\n",
"Starting year 0010...\n",
"No differences found in year 0010\n",
"Finished 0010\n",
"Checking year 0010...\n",
"... checking stream pop.h.nyear1 ...\n",
"... checking stream pop.h.nday1 ...\n",
"... checking stream pop.h ...\n",
"... checking stream cice.h1 ...\n",
"... checking stream cice.h ...\n",
"All variables available in time series for year 0010\n",
"----\n",
"Starting year 0011...\n",
"case_ts data for POC_FLUX_100m is not from time series files\n"
"Checking year 0011...\n",
"... checking stream pop.h.nyear1 ...\n",
"Could not find time series for year 0011\n",
"CPU times: user 2min 25s, sys: 11.6 s, total: 2min 36s\n",
"Wall time: 6min 6s\n"
]
}
],
"source": [
"casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n",
"stream = \"pop.h\"\n",
"\n",
"for year in range(1, 62):\n",
" print(f\"Starting year {year:04}...\")\n",
" ts_found = True\n",
" hist_found = True\n",
" all_same = True\n",
" for diag_metadata in diag_metadata_list:\n",
" varname = diag_metadata[\"varname\"]\n",
" isel_kwargs = diag_metadata.get(\"isel_dict\")\n",
" comp_test = utils.compare_ts_and_hist(\n",
" casename, varname, stream, year, isel_kwargs or {}\n",
" )\n",
" # Error checking (TODO: replace string compare)\n",
"\n",
" # Missing time series data\n",
" if comp_test == \"Can not generate time series dataset\":\n",
" print(f\"Year {year:04} time series is not available\")\n",
" ts_found = False\n",
" break\n",
" if comp_test == \"case_ts does not provide time series files\":\n",
" print(f\"case_ts data for {varname} is not from time series files\")\n",
" ts_found = False\n",
" break\n",
"\n",
" # Missing history file data\n",
" if comp_test == \"History files unavailable\":\n",
" hist_found = False\n",
" break\n",
" if comp_test == \"case_hist does not provide history files\":\n",
" print(f\"case_hist data for {varname} is not from history files\")\n",
" hist_found = False\n",
" break\n",
"%%time\n",
"\n",
" # Datasets differ\n",
" if comp_test == \"datasets differ\":\n",
" print(f\"{varname} is different in year {year:04}\")\n",
" all_same = False\n",
"\n",
" # Error checking after running through all variables for a given year\n",
" # (1) If time series data is not available, we are done testing\n",
" if not ts_found:\n",
" break\n",
"\n",
" # (2) If history files are not available, then we have scrubbed those files\n",
" if not hist_found:\n",
" print(\n",
" f\"History files for variables in {year:04} are missing, skipping comparison\"\n",
" )\n",
" print(\"----\")\n",
" continue\n",
"\n",
" # (3) was the data in the time series files identical to that in the history files?\n",
" if all_same:\n",
" print(f\"No differences found in year {year:04}\")\n",
" print(f\"Finished {year:04}\")\n",
" print(\"----\")"
"casename = \"g.e22.G1850ECO_JRA_HR.TL319_t13.004\"\n",
"utils.timeseries_and_history_comparison(casename)"
]
}
],
Expand Down
Loading

0 comments on commit d604c92

Please sign in to comment.