Skip to content

Commit

Permalink
Add end-to-end workflow config for fv3atm, fix bugs (#196)
Browse files Browse the repository at this point in the history
* Add workflow yamls for fv3atm end-to-end

* Fix function name in create_training_data pipeline

* Change to partial end-to-end workflow config

* Add end-to-end workflow updates

* Change prognostic run Docker image to use v0.3.1 model

* Change Q1/Q2 variable names in prognostic model

* Upload rundirs.yml to remote and open that for combined notebook

* Add fv3atm run to rundirs

* Use new RF training config yaml

* Use v0.3.1 fv3gfs-python for prognostic run

* Change version to 0.1.1, upload rundirs.yml before submit

* Update end-to-end workflow to use new syntax

* Remove dead code

* Remove hard-code of prog run diags location

* Update workflow config

* Use agg backend so windows don't block test script

* Add backoff dependency

* Point fv3config to its latest master commit

* Revert "Point fv3config to its latest master commit"

This reverts commit a0002df.

* Update prognostic run image in workflow config

* Change combined report location to experiments-2020-03

* Add missing -p to argo submit

* Add fv3atm baseline run to rundirs.yml

* Update common report rundirs
  • Loading branch information
Oliver Watt-Meyer authored Mar 25, 2020
1 parent db72139 commit 850c0e0
Show file tree
Hide file tree
Showing 16 changed files with 144 additions and 20 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#################################################################################
# GLOBALS #
#################################################################################
VERSION = v0.1.0
VERSION = v0.1.1
ENVIRONMENT_SCRIPTS = .environment-scripts
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
Expand Down
2 changes: 1 addition & 1 deletion docker/prognostic_run/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1
FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.3.1


COPY docker/prognostic_run/requirements.txt /tmp/requirements.txt
Expand Down
3 changes: 2 additions & 1 deletion docker/prognostic_run/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ scikit-learn==0.22.1
dask
joblib
zarr
scikit-image
scikit-image
backoff
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
"map_plot_3col": 120,
}

matplotlib.use("Agg")


def make_all_plots(ds_pred, ds_target, ds_hires, grid, output_dir):
""" Makes figures for predictions on test data
Expand Down
2 changes: 1 addition & 1 deletion fv3net/runtime/sklearn_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def update(model, state, dt):
tend = predict(model, state)

updated = state.assign(
sphum=state["sphum"] + tend.Q2 * dt, T=state.T + tend.Q1 * dt
sphum=state["sphum"] + tend.dQ2 * dt, T=state.T + tend.dQ1 * dt
)

return state_io.rename_to_orig(updated), state_io.rename_to_orig(tend)
73 changes: 73 additions & 0 deletions workflows/end_to_end/workflow-config-deep-off-fv3atm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
storage_proto: gs
storage_root: vcm-ml-data/experiments-2020-03/end_to_end
experiment:
name: deep-conv-off-fv3atm-a45d1781
unique_id: False
steps_to_run:
- one_step_run
- create_training_data
- train_sklearn_model
- test_sklearn_model
- prognostic_run
- baseline_run

steps_config:
one_step_run:
command: python workflows/one_step_jobs/orchestrate_submit_jobs.py
args:
restart_data:
location: gs://vcm-ml-data/2020-01-16-X-SHiELD-2019-12-02-pressure-coarsened-rundirs/restarts/C48
experiment_yaml: ./workflows/one_step_jobs/deep-conv-off-fv3atm.yml
docker_image: us.gcr.io/vcm-ml/fv3gfs-python:v0.3.1
--config-version: v0.3

create_training_data:
command: python -m fv3net.pipelines.create_training_data
args:
one_step_data:
from: one_step_run
diagnostics_data:
location: gs://vcm-ml-data/orchestration-testing/shield-coarsened-diags-2019-12-04
--timesteps-per-output-file: 1
--train-fraction: 0.5

train_sklearn_model:
command: python -m fv3net.regression.sklearn.train
args:
training_data:
from: create_training_data
train-config-file: ./workflows/sklearn_regression/maxdepth_13_rf_config.yml

test_sklearn_model:
command: python -m fv3net.diagnostics.sklearn_model_performance
args:
trained_model:
from: train_sklearn_model
testing_data:
from: create_training_data
diagnostics_data:
location: gs://vcm-ml-data/orchestration-testing/shield-coarsened-diags-2019-12-04
--num_test_zarrs: 36
--downsample-time-factor: 40

prognostic_run:
command: python workflows/prognostic_c48_run/orchestrate_submit_job.py
args:
restart_file_dir:
from: one_step_run
ic_timestep: "20160803.061500"
docker_image: us.gcr.io/vcm-ml/prognostic_run:v0.1.1-oliwmtest
--prog_config_yml: workflows/prognostic_c48_run/prognostic_config.yml
--model_url:
from: train_sklearn_model

baseline_run:
command: python workflows/prognostic_c48_run/orchestrate_submit_job.py
args:
restart_file_dir:
from: one_step_run
ic_timestep: "20160803.061500"
docker_image: us.gcr.io/vcm-ml/prognostic_run:v0.1.1-oliwmtest
--prog_config_yml: workflows/prognostic_c48_run/prognostic_config.yml


22 changes: 22 additions & 0 deletions workflows/one_step_jobs/deep-conv-off-fv3atm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
kubernetes:
docker_image: us.gcr.io/vcm-ml/fv3gfs-python:v0.3.1
fv3config:
diag_table: workflows/one_step_jobs/diag_table_one_step
namelist:
atmos_model_nml:
fhout: 0.01666
coupler_nml:
days: 0
minutes: 15
seconds: 0
dt_atmos: 60 # seconds
dt_ocean: 60 # seconds
restart_secs: 60
fv_core_nml:
external_eta: true
npz: 79
k_split: 1
n_split: 1
gfs_physics_nml:
do_deep: false
fhzero: 0.01666
2 changes: 1 addition & 1 deletion workflows/prognostic_c48_run/prognostic_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ kubernetes:
diag_table: workflows/prognostic_c48_run/diag_table_prognostic
namelist:
coupler_nml:
days: 5 # total length
days: 10 # total length
hours: 0
minutes: 0
seconds: 0
Expand Down
4 changes: 2 additions & 2 deletions workflows/prognostic_c48_run/sklearn_runfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@

def compute_diagnostics(state, diags):
return dict(
net_precip=(diags["Q2"] * state[DELP] / gravity)
net_precip=(diags["dQ2"] * state[DELP] / gravity)
.sum("z")
.assign_attrs(units="kg/m^2/s"),
PW=(state[SPHUM] * state[DELP] / gravity).sum("z").assign_attrs(units="mm"),
net_heating=(diags["Q1"] * state[DELP] / gravity * cp)
net_heating=(diags["dQ1"] * state[DELP] / gravity * cp)
.sum("z")
.assign_attrs(units="W/m^2"),
)
Expand Down
4 changes: 2 additions & 2 deletions workflows/prognostic_run_diags/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ from this directory. This job can be monitored by running
Simply add a new item to rundirs.yml and resubmit the job. All the steps will be
re-run, which is redundant, but the process isn't that slow.

[1]: http://storage.googleapis.com/vcm-ml-public/testing-2020-02/prognostic_run_diags/combined.html
[1]: http://storage.googleapis.com/vcm-ml-public/experiments-2020-03/prognostic_run_diags/combined.html

## Updating the code

After updating the script `fv3net/pipelines/save_prognostic_run_diags.py` you will need to rebuild the docker image, and update the corresponding references in the `argo.yaml`. Specifically, `make push_image` will output a SHA256 digest for the pushed-docker image. Copy-paste this into the "image" fields in `argo.yaml`.
After updating the script `fv3net/pipelines/save_prognostic_run_diags.py` you will need to rebuild the docker image, and update the corresponding references in the `argo.yaml`. Specifically, `make push_image` will output a SHA256 digest for the pushed-docker image. Copy-paste this into the "image" fields in `argo.yaml`.
6 changes: 3 additions & 3 deletions workflows/prognostic_run_diags/argo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- name: run
value: "{{item.url}}"
- name: output
value: "gs://vcm-ml-data/testing-2020-02/{{item.name}}.nc"
value: "{{workflow.parameters.output_url}}/{{item.name}}.nc"
- - name: generate-notebook
template: notebook
- name: notebook
Expand All @@ -32,7 +32,7 @@ spec:
value: "climate-sim-pool"
effect: "NoSchedule"
container:
image: us.gcr.io/vcm-ml/fv3net:v0.1.0
image: us.gcr.io/vcm-ml/fv3net:v0.1.1
command: ['bash', 'upload_report.sh']
workingDir: /home/jovyan/fv3net/workflows/prognostic_run_diags
env:
Expand Down Expand Up @@ -62,7 +62,7 @@ spec:
value: "climate-sim-pool"
effect: "NoSchedule"
container:
image: us.gcr.io/vcm-ml/fv3net:v0.1.0
image: us.gcr.io/vcm-ml/fv3net:v0.1.1
command:
- 'python'
- '-m'
Expand Down
5 changes: 3 additions & 2 deletions workflows/prognostic_run_diags/combined.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@
"metadata": {},
"outputs": [],
"source": [
"with open(\"rundirs.yml\") as f:\n",
"BUCKET = 'gs://vcm-ml-data/experiments-2020-03/prognostic_run_diags/' \n",
"rundirs_yml_url = os.path.join(BUCKET, 'rundirs.yml')\n",
"with fsspec.open(rundirs_yml_url) as f:\n",
" rundirs = yaml.safe_load(f)\n",
"\n",
"BUCKET = 'gs://vcm-ml-data/testing-2020-02/' \n",
"metrics = {}\n",
"\n",
"for rundir in rundirs:\n",
Expand Down
12 changes: 11 additions & 1 deletion workflows/prognostic_run_diags/run_all.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
#!/bin/bash

if [ "$#" -lt 1 ]; then
output_url=gs://vcm-ml-data/experiments-2020-03/prognostic_run_diags
echo "WARNING: no output_url specified for prognostic run diags."
echo "Using default output_url $output_url"
else
output_url=$1
echo "Saving prognostic run diagnostics to $output_url"
fi

gsutil cp rundirs.yml $output_url/rundirs.yml
runs=$(yq . rundirs.yml)
argo submit argo.yaml -p runs="$runs"
argo submit argo.yaml -p runs="$runs" -p output_url="$output_url"
8 changes: 4 additions & 4 deletions workflows/prognostic_run_diags/rundirs.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
- url: gs://vcm-ml-data/end-to-end-experiments/2020-02-26-physics-off/annak-prognostic-physics-off-1773255e/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160801.001500_docker_image_prognostic-run-orchestration
name: physics-off-rf
- url: gs://vcm-ml-data/end-to-end-experiments/deep-conv-off/annak-prognostic-deep-conv-off-ic20160803-199b834a/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160803.061500_docker_image_prognostic-run-orchestration
name: deep-off-rf
- url: gs://vcm-ml-data/end-to-end-experiments/2020-02-28-deep-and-mp-off/annak-prognostic-deep-and-mp-off/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160801.001500_docker_image_prognostic-run-orchestration:fv3py_v2.3-mp-off-switch
Expand All @@ -10,5 +8,7 @@
name: deep-off-baseline
- url: gs://vcm-ml-data/experiments-2020-03/all-physics-off-4d4bb0a8/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160803.061500_docker_image_prognostic-run-orchestration
name: physics-off-1-rf
- name: deep-conv-off-265ebf37
url: gs://vcm-ml-data/experiments-2020-03/deep-conv-off-265ebf37/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160803.061500_docker_image_prognostic-run-orchestration
- name: deep-off-fv3atm-rf
url: gs://vcm-ml-data/experiments-2020-03/end_to_end/deep-conv-off-fv3atm-a45d1781/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160803.061500_docker_image_prognostic-run-orchestration:fv3py_v0.3.1
- name: deep-off-fv3atm-baseline
url: gs://vcm-ml-data/experiments-2020-03/end_to_end/deep-conv-off-fv3atm-a45d1781/baseline_run_ic_timestep_20160803.061500_docker_image_prognostic_run:v0.1.1-oliwmtest_prog_config_yml_prognostic_config.yml
2 changes: 1 addition & 1 deletion workflows/prognostic_run_diags/upload_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ set -x

gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS
jupyter nbconvert --execute combined.ipynb
gsutil cp combined.html gs://vcm-ml-public/testing-2020-02/prognostic_run_diags/combined.html
gsutil cp combined.html gs://vcm-ml-public/experiments-2020-03/prognostic_run_diags/combined.html
15 changes: 15 additions & 0 deletions workflows/sklearn_regression/maxdepth_13_rf_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
model_type: sklearn_random_forest
hyperparameters:
max_depth: 13
n_estimators: 1
num_batches: 5
mask_to_surface_type: none
files_per_batch: 13
input_variables:
- T
- sphum
output_variables:
- dQ1
- dQ2
random_seed: 99

0 comments on commit 850c0e0

Please sign in to comment.