From 8ca881977a27bcfed7c0ce9e87d9502ed964dc97 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 19:18:40 +0000 Subject: [PATCH 01/81] Refactor online_modules to fv3net --- .../runtime}/config.py | 2 +- .../runtime}/sklearn_interface.py | 24 +++++++------------ .../runtime}/state_io.py | 0 .../prognostic_c48_run/sklearn_runfile.py | 15 ++++++------ 4 files changed, 16 insertions(+), 25 deletions(-) rename {workflows/prognostic_c48_run/online_modules => fv3net/runtime}/config.py (93%) rename {workflows/prognostic_c48_run/online_modules => fv3net/runtime}/sklearn_interface.py (70%) rename {workflows/prognostic_c48_run/online_modules => fv3net/runtime}/state_io.py (100%) diff --git a/workflows/prognostic_c48_run/online_modules/config.py b/fv3net/runtime/config.py similarity index 93% rename from workflows/prognostic_c48_run/online_modules/config.py rename to fv3net/runtime/config.py index 13eaed4d45..39fe1794be 100644 --- a/workflows/prognostic_c48_run/online_modules/config.py +++ b/fv3net/runtime/config.py @@ -10,7 +10,7 @@ class dotdict(dict): __delattr__ = dict.__delitem__ -def get_config(): +def get_runfile_config(): with open("fv3config.yml") as f: config = yaml.safe_load(f) return dotdict(config["scikit_learn"]) diff --git a/workflows/prognostic_c48_run/online_modules/sklearn_interface.py b/fv3net/runtime/sklearn_interface.py similarity index 70% rename from workflows/prognostic_c48_run/online_modules/sklearn_interface.py rename to fv3net/runtime/sklearn_interface.py index 5a5027f7b7..e6e730b33e 100644 --- a/workflows/prognostic_c48_run/online_modules/sklearn_interface.py +++ b/fv3net/runtime/sklearn_interface.py @@ -3,10 +3,16 @@ from sklearn.externals import joblib from sklearn.utils import parallel_backend -import state_io +from . 
import state_io +__all__ = [ + "open_model", + "predict", + "update" +] -def open_sklearn_model(url): + +def open_model(url): # Load the model with fsspec.open(url, "rb") as f: return joblib.load(f) @@ -30,17 +36,3 @@ def update(model, state, dt): ) return state_io.rename_to_orig(updated), state_io.rename_to_orig(tend) - - -if __name__ == "__main__": - import sys - - state_path = sys.argv[1] - model = open_sklearn_model(sys.argv[2]) - - with open(state_path, "rb") as f: - data = state_io.load(f) - - tile = data[0] - preds = update(model, tile, dt=1) - print(preds) diff --git a/workflows/prognostic_c48_run/online_modules/state_io.py b/fv3net/runtime/state_io.py similarity index 100% rename from workflows/prognostic_c48_run/online_modules/state_io.py rename to fv3net/runtime/state_io.py diff --git a/workflows/prognostic_c48_run/sklearn_runfile.py b/workflows/prognostic_c48_run/sklearn_runfile.py index 77a3c183c1..c6c3bfbea9 100644 --- a/workflows/prognostic_c48_run/sklearn_runfile.py +++ b/workflows/prognostic_c48_run/sklearn_runfile.py @@ -3,9 +3,8 @@ import zarr import fv3gfs -import sklearn_interface -import state_io from fv3gfs._wrapper import get_time +from fv3net import runtime from mpi4py import MPI import config @@ -32,8 +31,8 @@ def compute_diagnostics(state, diags): ) -args = config.get_config() -NML = config.get_namelist() +args = runtime.get_runfile_config() +NML = runtime.get_namelist() TIMESTEP = NML["coupler_nml"]["dt_atmos"] times = [] @@ -55,7 +54,7 @@ def compute_diagnostics(state, diags): if rank == 0: logger.info("Downloading Sklearn Model") - MODEL = sklearn_interface.open_sklearn_model(args.model) + MODEL = runtime.sklearn.open_model(args.model) logger.info("Model downloaded") else: MODEL = None @@ -81,7 +80,7 @@ def compute_diagnostics(state, diags): if rank == 0: logger.debug("Computing RF updated variables") - preds, diags = sklearn_interface.update(MODEL, state, dt=TIMESTEP) + preds, diags = runtime.sklearn.update(MODEL, state, dt=TIMESTEP) if rank == 0: logger.debug("Setting Fortran State") fv3gfs.set_state(preds) @@ -91,8 +90,8 @@ def compute_diagnostics(state, diags): diagnostics = compute_diagnostics(state, diags) if i == 0: - writers = state_io.init_writers(GROUP, comm, diagnostics) - state_io.append_to_writers(writers, diagnostics) + writers = runtime.init_writers(GROUP, comm, diagnostics) + runtime.append_to_writers(writers, diagnostics) times.append(get_time()) From 5909c73b3877908e263b2a5083e172cf5fde4c14 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 21:05:23 +0000 Subject: [PATCH 02/81] black --- fv3net/runtime/sklearn_interface.py | 6 +----- workflows/prognostic_c48_run/Dockerfile | 12 +++++------ workflows/prognostic_c48_run/Makefile | 20 +++++++++---------- workflows/prognostic_c48_run/fv3config.yml | 3 +-- workflows/prognostic_c48_run/requirements.txt | 5 +++++ .../prognostic_c48_run/sklearn_runfile.py | 3 +-- 6 files changed, 22 insertions(+), 27 deletions(-) create mode 100644 workflows/prognostic_c48_run/requirements.txt diff --git a/fv3net/runtime/sklearn_interface.py b/fv3net/runtime/sklearn_interface.py index e6e730b33e..a77106625e 100644 --- a/fv3net/runtime/sklearn_interface.py +++ b/fv3net/runtime/sklearn_interface.py @@ -5,11 +5,7 @@ from . 
import state_io -__all__ = [ - "open_model", - "predict", - "update" -] +__all__ = ["open_model", "predict", "update"] def open_model(url): diff --git a/workflows/prognostic_c48_run/Dockerfile b/workflows/prognostic_c48_run/Dockerfile index 27d8929fb2..8de1e64a66 100644 --- a/workflows/prognostic_c48_run/Dockerfile +++ b/workflows/prognostic_c48_run/Dockerfile @@ -1,10 +1,8 @@ FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1 -ADD fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl -ADD vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl -RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && \ - pip3 install /wheels/vcm-0.1.0-py3-none-any.whl && \ - pip3 install scikit-learn==0.22.1 joblib zarr -COPY online_modules/ /online_modules -ENV PYTHONPATH=/online_modules:${PYTHON_PATH} +COPY requirements.txt /tmp/requirements.txt +RUN pip3 install -r /tmp/requirements.txt +COPY fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl +COPY vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl +RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && pip3 install /wheels/vcm-0.1.0-py3-none-any.whl diff --git a/workflows/prognostic_c48_run/Makefile b/workflows/prognostic_c48_run/Makefile index 82d95a4f7e..30c55c2f0e 100644 --- a/workflows/prognostic_c48_run/Makefile +++ b/workflows/prognostic_c48_run/Makefile @@ -7,24 +7,17 @@ RUN_ARGS = --rm $(KEY_ARGS) $(LOCAL_DIR_ARGS) $(IMAGE) RUN_INTERACTIVE = docker run -ti $(RUN_ARGS) RUN ?= docker run $(RUN_ARGS) SKLEARN_MODEL = gs://vcm-ml-data/test-annak/ml-pipeline-output/2020-01-17_rf_40d_run.pkl -FV3CONFIG = fv3config.yml -FV3NET_VERSION ?=2020-01-23-prognostic-rf +FV3CONFIG = gs://vcm-ml-data/end-to-end-experiments/2020-02-26-physics-off/annak-prognostic-physics-off-1773255e/prognostic_run_prognostic_yaml_adjust_prognostic_config.yml_ic_timestep_20160801.001500_docker_image_prognostic-run-orchestration/job_config/fv3config.yml all: sklearn_run -fv3net-0.1.0-py3-none-any.whl: - pip wheel --no-deps git+ssh://git@github.com/VulcanClimateModeling/fv3net.git@$(FV3NET_VERSION) - -build: fv3net-0.1.0-py3-none-any.whl - docker build . -t $(IMAGE) - -fv3net-local: +fv3net: pip wheel --no-deps ../../. -vcm-local: +vcm: pip wheel --no-deps ../../external/vcm -build_local: fv3net-local vcm-local +build: fv3net vcm docker build . -t $(IMAGE) dev: @@ -36,9 +29,14 @@ test_run_sklearn: state.pkl state.pkl: fv3run --dockerimage test-image --runfile save_state_runfile.py $(FV3CONFIG) save_state/ cp save_state/state.pkl . 
+ +sklearn_run_local: #rundir + fv3run --dockerimage $(IMAGE) --runfile sklearn_runfile.py $(FV3CONFIG) rundir sklearn_run: #rundir fv3run --dockerimage us.gcr.io/vcm-ml/prognostic-run-orchestration --runfile sklearn_runfile.py $(FV3CONFIG) ../../scratch/rundir clean: rm -rf net_precip net_heating/ PW + +.PHONY: fv3net vcm build dev sklearn_run \ No newline at end of file diff --git a/workflows/prognostic_c48_run/fv3config.yml b/workflows/prognostic_c48_run/fv3config.yml index e7bf1b4b5d..673325ca16 100644 --- a/workflows/prognostic_c48_run/fv3config.yml +++ b/workflows/prognostic_c48_run/fv3config.yml @@ -1,5 +1,4 @@ -scikit_learn: - model: gs://vcm-ml-data/test-annak/ml-pipeline-output/2020-01-17_rf_40d_run.pkl +scikit_learn: model:gs://vcm-ml-data/end-to-end-experiments/2020-02-26-physics-off/annak-prognostic-physics-off/train_sklearn_model_train-config-file_example_base_rf_training_config.yml_delete-local-results-after-upload_False/sklearn_model.pkl zarr_output: diags.zarr data_table: default diag_table: gs://vcm-ml-data/2020-01-15-noahb-exploration/2hr_strong_dampingone_step_config/C48/20160805.000000/diag_table diff --git a/workflows/prognostic_c48_run/requirements.txt b/workflows/prognostic_c48_run/requirements.txt new file mode 100644 index 0000000000..9944d45ec4 --- /dev/null +++ b/workflows/prognostic_c48_run/requirements.txt @@ -0,0 +1,5 @@ +scikit-learn==0.22.1 +dask +joblib +zarr +scikit-image \ No newline at end of file diff --git a/workflows/prognostic_c48_run/sklearn_runfile.py b/workflows/prognostic_c48_run/sklearn_runfile.py index c6c3bfbea9..8536ec178d 100644 --- a/workflows/prognostic_c48_run/sklearn_runfile.py +++ b/workflows/prognostic_c48_run/sklearn_runfile.py @@ -6,14 +6,13 @@ from fv3gfs._wrapper import get_time from fv3net import runtime from mpi4py import MPI -import config logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) SPHUM = "specific_humidity" DELP = "pressure_thickness_of_atmospheric_layer" -VARIABLES = list(state_io.CF_TO_RESTART_MAP) + [DELP] +VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP] cp = 1004 gravity = 9.81 From cee28fd12bb9660a7a797393f06d9c12acfd2382 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 21:21:27 +0000 Subject: [PATCH 03/81] Create us.gcr.io/vcm-ml/prognostic_run:v0.1.0 Builds a docker image with fv3net, vcm, and fv3gfs-python installed. 
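A minimal usage sketch (not part of the original patch) of the fv3net.runtime helpers this image bundles, assuming fv3net is installed and an fv3config.yml with a scikit_learn section sits in the working directory, as in sklearn_runfile.py; the commented-out update call is illustrative only.

# Sketch only: exercising the consolidated fv3net.runtime API inside the image.
# Assumes an fv3config.yml with a "scikit_learn" section in the working directory.
from fv3net import runtime

sklearn_config = runtime.get_runfile_config()   # dotdict of the scikit_learn section
namelist = runtime.get_namelist()
timestep = namelist["coupler_nml"]["dt_atmos"]

model = runtime.sklearn.open_model(sklearn_config.model)  # joblib model opened via fsspec
# within a runfile's time loop one would then call, for example:
# updated_state, diagnostics = runtime.sklearn.update(model, state, dt=timestep)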
--- .../prognostic_c48_run => docker}/Dockerfile | 0 docker/Dockerfile.kubernetes | 30 ------------------- docker/Makefile | 17 +++++++++++ docker/download_inputdata.sh | 22 -------------- docker/install_gcloud.sh | 18 ----------- .../requirements.txt | 0 fv3net/runtime/__init__.py | 3 ++ .../end_to_end/example-workflow-config.yaml | 2 +- workflows/prognostic_c48_run/Makefile | 13 ++------ 9 files changed, 23 insertions(+), 82 deletions(-) rename {workflows/prognostic_c48_run => docker}/Dockerfile (100%) delete mode 100644 docker/Dockerfile.kubernetes create mode 100644 docker/Makefile delete mode 100755 docker/download_inputdata.sh delete mode 100644 docker/install_gcloud.sh rename {workflows/prognostic_c48_run => docker}/requirements.txt (100%) create mode 100644 fv3net/runtime/__init__.py diff --git a/workflows/prognostic_c48_run/Dockerfile b/docker/Dockerfile similarity index 100% rename from workflows/prognostic_c48_run/Dockerfile rename to docker/Dockerfile diff --git a/docker/Dockerfile.kubernetes b/docker/Dockerfile.kubernetes deleted file mode 100644 index 681af739b2..0000000000 --- a/docker/Dockerfile.kubernetes +++ /dev/null @@ -1,30 +0,0 @@ -FROM google/cloud-sdk:latest - -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 -ENV PATH /opt/conda/bin:$PATH - -RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ - libglib2.0-0 libxext6 libsm6 libxrender1 \ - git mercurial subversion - -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc - -RUN apt-get install -y curl grep sed dpkg && \ - TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ - curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ - dpkg -i tini.deb && \ - rm tini.deb && \ - apt-get clean - -COPY . /fv3net - -RUN conda env update --file /fv3net/environment.yml --name base && conda clean -afy -RUN python /fv3net/setup.py develop - -ENTRYPOINT [ "/usr/bin/tini", "--" ] -CMD [ "/bin/bash" ] \ No newline at end of file diff --git a/docker/Makefile b/docker/Makefile new file mode 100644 index 0000000000..3a21535e9c --- /dev/null +++ b/docker/Makefile @@ -0,0 +1,17 @@ +IMAGE = us.gcr.io/vcm-ml/prognostic_run:v0.1.0 + +all: build + +fv3net: + pip wheel --no-deps ../ + +vcm: + pip wheel --no-deps ../external/vcm + +build: fv3net vcm + docker build . 
-t $(IMAGE) + +push: build + docker push $(IMAGE) + +.PHONY: fv3net vcm build dev sklearn_run diff --git a/docker/download_inputdata.sh b/docker/download_inputdata.sh deleted file mode 100755 index b202a94398..0000000000 --- a/docker/download_inputdata.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -set -e -set -x - -dir=2019-09-27-FV3GFS-docker-input-c48-LH-nml -filename=fv3gfs-data-docker_2019-09-27.tar.gz -url=http://storage.googleapis.com/vcm-ml-public/$dir/$filename -datadir_local=inputdata - -mkdir -p $datadir_local - -# download data -[[ -f $filename ]] || wget $url - -# unzip/tar input data -tar xzf $filename -C $datadir_local - -rm $filename -mkdir -p /inputdata/ -mv inputdata/fv3gfs-data-docker/fix.v201702 /inputdata/fix.v201702 - diff --git a/docker/install_gcloud.sh b/docker/install_gcloud.sh deleted file mode 100644 index 36a2644792..0000000000 --- a/docker/install_gcloud.sh +++ /dev/null @@ -1,18 +0,0 @@ -# Add the Cloud SDK distribution URI as a package source -echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list - -# Import the Google Cloud Platform public key -curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - - -# Update the package list and install the Cloud SDK -apt-get update && apt-get install -y python-dev google-cloud-sdk - -# Install fast crc mod for multithreaded downloads -# the snakemake workflow will fail on gsutil downloads without this -curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py -python2.7 get-pip.py -PIP="python2.7 -m pip" -echo "PIP Version" -$PIP -V -$PIP uninstall crcmod -$PIP install --no-cache-dir -U crcmod diff --git a/workflows/prognostic_c48_run/requirements.txt b/docker/requirements.txt similarity index 100% rename from workflows/prognostic_c48_run/requirements.txt rename to docker/requirements.txt diff --git a/fv3net/runtime/__init__.py b/fv3net/runtime/__init__.py new file mode 100644 index 0000000000..80c9e7e8f6 --- /dev/null +++ b/fv3net/runtime/__init__.py @@ -0,0 +1,3 @@ +from . import sklearn_interface as sklearn +from .state_io import init_writers, append_to_writers, CF_TO_RESTART_MAP +from .config import get_runfile_config, get_namelist \ No newline at end of file diff --git a/workflows/end_to_end/example-workflow-config.yaml b/workflows/end_to_end/example-workflow-config.yaml index 9e1355013f..1e899e5f26 100644 --- a/workflows/end_to_end/example-workflow-config.yaml +++ b/workflows/end_to_end/example-workflow-config.yaml @@ -84,4 +84,4 @@ experiment: extra_args: prognostic_yaml_adjust: workflows/prognostic_c48_run/prognostic_config.yml ic_timestep: "20160801.001500" - docker_image: us.gcr.io/vcm-ml/prognostic-run-orchestration + docker_image: us.gcr.io/vcm-ml/prognostic_run:v0.1.0 diff --git a/workflows/prognostic_c48_run/Makefile b/workflows/prognostic_c48_run/Makefile index 30c55c2f0e..3b405c7c0f 100644 --- a/workflows/prognostic_c48_run/Makefile +++ b/workflows/prognostic_c48_run/Makefile @@ -1,4 +1,4 @@ -IMAGE=test-image +IMAGE = us.gcr.io/vcm-ml/prognostic_run:v0.1.0 KEY_ARGS= -v $(GOOGLE_APPLICATION_CREDENTIALS):/key.json \ -e GOOGLE_APPLICATION_CREDENTIALS=/key.json LOCAL_DIR_ARGS = -w /code -v $(shell pwd):/code @@ -11,15 +11,6 @@ FV3CONFIG = gs://vcm-ml-data/end-to-end-experiments/2020-02-26-physics-off/annak all: sklearn_run -fv3net: - pip wheel --no-deps ../../. 
- -vcm: - pip wheel --no-deps ../../external/vcm - -build: fv3net vcm - docker build . -t $(IMAGE) - dev: $(RUN_INTERACTIVE) bash @@ -39,4 +30,4 @@ sklearn_run: #rundir clean: rm -rf net_precip net_heating/ PW -.PHONY: fv3net vcm build dev sklearn_run \ No newline at end of file +.PHONY: fv3net vcm build dev sklearn_run From 305629fc4b175dcc208f197639e7be1ea3cbdaa0 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 21:35:51 +0000 Subject: [PATCH 04/81] Refactor us.gcr.io/vcm-ml/fv3net image build code --- Makefile | 9 ++++----- README.md | 6 ++++++ Dockerfile => docker/fv3net/Dockerfile | 0 docker/{ => prognostic_run}/Dockerfile | 0 docker/{ => prognostic_run}/Makefile | 2 +- docker/{ => prognostic_run}/requirements.txt | 0 6 files changed, 11 insertions(+), 6 deletions(-) rename Dockerfile => docker/fv3net/Dockerfile (100%) rename docker/{ => prognostic_run}/Dockerfile (100%) rename docker/{ => prognostic_run}/Makefile (83%) rename docker/{ => prognostic_run}/requirements.txt (100%) diff --git a/Makefile b/Makefile index 756c1cd9a4..26d34f2d23 100644 --- a/Makefile +++ b/Makefile @@ -22,8 +22,10 @@ endif ################################################################################# # COMMANDS # ################################################################################# -build_image: - docker build . -t $(IMAGE) -t $(GCR_IMAGE) + +build_images: + make -C docker/fv3net build + make -C docker/prognostic_run build enter: build_image docker run -it -v $(shell pwd):/code \ @@ -33,9 +35,6 @@ enter: build_image # -e GOOGLE_APPLICATION_CREDENTIALS=/google_creds.json \ # -v $(HOME)/.config/gcloud/application_default_credentials.json:/google_creds.json \ -push_image: build_image - docker push $(GCR_IMAGE) - ## Make Dataset .PHONY: data update_submodules create_environment overwrite_baseline_images diff --git a/README.md b/README.md index 947c393b02..0020b473a2 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,12 @@ If you get an error `Could not create workflow; user does not have write access trying to submit the dataflow job, do `gcloud auth application-default login` first and then retry. +## Building the fv3net docker images + +The pipelines use a pair of common images: +1. us.gcr.io/vcm-ml/prognostic_run:v0.1.0 +1. us.gcr.io/vcm-ml/fv3net + ## Deploying on k8s with fv3net Docker images with the python-wrapped model and fv3run are available from the diff --git a/Dockerfile b/docker/fv3net/Dockerfile similarity index 100% rename from Dockerfile rename to docker/fv3net/Dockerfile diff --git a/docker/Dockerfile b/docker/prognostic_run/Dockerfile similarity index 100% rename from docker/Dockerfile rename to docker/prognostic_run/Dockerfile diff --git a/docker/Makefile b/docker/prognostic_run/Makefile similarity index 83% rename from docker/Makefile rename to docker/prognostic_run/Makefile index 3a21535e9c..bef31690e4 100644 --- a/docker/Makefile +++ b/docker/prognostic_run/Makefile @@ -9,7 +9,7 @@ vcm: pip wheel --no-deps ../external/vcm build: fv3net vcm - docker build . -t $(IMAGE) + docker build -f Dockerfile ../../ -t $(IMAGE) push: build docker push $(IMAGE) diff --git a/docker/requirements.txt b/docker/prognostic_run/requirements.txt similarity index 100% rename from docker/requirements.txt rename to docker/prognostic_run/requirements.txt From ff9f938d0c7618329ec45adb5b00a0b2d75ac4f5 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 17 Mar 2020 22:13:32 +0000 Subject: [PATCH 05/81] Add build_images makefile target --- Makefile | 19 ++++++++++++++++--- docker/fv3net/Dockerfile | 10 +++------- docker/prognostic_run/Dockerfile | 2 +- docker/prognostic_run/Makefile | 17 ----------------- 4 files changed, 20 insertions(+), 28 deletions(-) delete mode 100644 docker/prognostic_run/Makefile diff --git a/Makefile b/Makefile index 26d34f2d23..716ce26156 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ ################################################################################# # GLOBALS # ################################################################################# +VERSION = 0.1.0 ENVIRONMENT_SCRIPTS = .environment-scripts PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') @@ -22,10 +23,22 @@ endif ################################################################################# # COMMANDS # ################################################################################# +.PHONY: wheels build_images push_image +wheels: + pip wheel --no-deps . + pip wheel --no-deps external/vcm -build_images: - make -C docker/fv3net build - make -C docker/prognostic_run build +# pattern rule for building docker images +build_image_%: + docker build -f docker/$*/Dockerfile . -t us.gcr.io/vcm-ml/$*:$(VERSION) + +build_image_prognostic_run: wheels + +build_images: build_image_fv3net build_image_prognostic_run + +push_image: + docker push us.gcr.io/vcm-ml/fv3net:$(VERSION) + docker push us.gcr.io/vcm-ml/prognostic_run:$(VERSION) enter: build_image docker run -it -v $(shell pwd):/code \ diff --git a/docker/fv3net/Dockerfile b/docker/fv3net/Dockerfile index 8a68ea973d..d12deb541c 100644 --- a/docker/fv3net/Dockerfile +++ b/docker/fv3net/Dockerfile @@ -9,7 +9,6 @@ ENV PROJECT_NAME=fv3net USER root RUN apt-get update && apt-get install -y gfortran ADD environment.yml $FV3NET/ -ADD Makefile $FV3NET/ ADD .environment-scripts $ENVIRONMENT_SCRIPTS RUN fix-permissions $FV3NET WORKDIR $FV3NET @@ -20,21 +19,18 @@ ENV PATH=/opt/conda/envs/fv3net/bin:$PATH RUN bash $ENVIRONMENT_SCRIPTS/build_environment.sh $PROJECT_NAME RUN jupyter labextension install @pyviz/jupyterlab_pyviz -# Add rest of fv3net directory USER root -ADD . $FV3NET # install gcloud sdk RUN cd / && \ curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-284.0.0-linux-x86_64.tar.gz |\ tar xz ENV PATH=/google-cloud-sdk/bin:${PATH} -#RUN /google-cloud-sdk/bin/gcloud init + +# Add rest of fv3net directory +ADD . 
$FV3NET RUN fix-permissions $FV3NET USER $NB_UID -# RUN gcloud init - # setup the local python packages - RUN bash $ENVIRONMENT_SCRIPTS/install_local_packages.sh $PROJECT_NAME diff --git a/docker/prognostic_run/Dockerfile b/docker/prognostic_run/Dockerfile index 8de1e64a66..8fb34cc92c 100644 --- a/docker/prognostic_run/Dockerfile +++ b/docker/prognostic_run/Dockerfile @@ -1,7 +1,7 @@ FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1 -COPY requirements.txt /tmp/requirements.txt +COPY docker/prognostic_run/requirements.txt /tmp/requirements.txt RUN pip3 install -r /tmp/requirements.txt COPY fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl COPY vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl diff --git a/docker/prognostic_run/Makefile b/docker/prognostic_run/Makefile deleted file mode 100644 index bef31690e4..0000000000 --- a/docker/prognostic_run/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -IMAGE = us.gcr.io/vcm-ml/prognostic_run:v0.1.0 - -all: build - -fv3net: - pip wheel --no-deps ../ - -vcm: - pip wheel --no-deps ../external/vcm - -build: fv3net vcm - docker build -f Dockerfile ../../ -t $(IMAGE) - -push: build - docker push $(IMAGE) - -.PHONY: fv3net vcm build dev sklearn_run From b31a0991c4b9f587a538aa0162375f38591ce015 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 22:15:15 +0000 Subject: [PATCH 06/81] Add __version__ to fv3net init --- fv3net/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fv3net/__init__.py b/fv3net/__init__.py index 186a2b559f..9e3c8a4193 100644 --- a/fv3net/__init__.py +++ b/fv3net/__init__.py @@ -2,3 +2,5 @@ TOP_LEVEL_DIR = pathlib.Path(__file__).parent.parent.absolute() COARSENED_DIAGS_ZARR_NAME = "gfsphysics_15min_coarse.zarr" + +__version__ = '0.1.0' From 8f9cb3c87ffe598941d8dc82f3b6ebeb4fd7ab4c Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 22:23:38 +0000 Subject: [PATCH 07/81] update prognostic_run_diags configuration --- workflows/prognostic_run_diags/argo.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/prognostic_run_diags/argo.yaml b/workflows/prognostic_run_diags/argo.yaml index bc7f51501f..a082022471 100644 --- a/workflows/prognostic_run_diags/argo.yaml +++ b/workflows/prognostic_run_diags/argo.yaml @@ -32,7 +32,7 @@ spec: value: "climate-sim-pool" effect: "NoSchedule" container: - image: us.gcr.io/vcm-ml/fv3net@sha256:3b884201e8c61e9db248dec0f863dbd4dba4a9b18bc8a9794823e8ea4b7d2e0b + image: us.gcr.io/vcm-ml/fv3net:0.1.0 command: ['bash', 'upload_report.sh'] workingDir: /home/jovyan/fv3net/workflows/prognostic_run_diags env: @@ -62,7 +62,7 @@ spec: value: "climate-sim-pool" effect: "NoSchedule" container: - image: us.gcr.io/vcm-ml/fv3net@sha256:3b884201e8c61e9db248dec0f863dbd4dba4a9b18bc8a9794823e8ea4b7d2e0b + image: us.gcr.io/vcm-ml/fv3net:0.1.0 command: - 'python' - '-m' From 31930d9484b95e9df4d4b93eee9286f4e8491cbf Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 17 Mar 2020 22:24:18 +0000 Subject: [PATCH 08/81] black --- fv3net/__init__.py | 2 +- fv3net/runtime/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fv3net/__init__.py b/fv3net/__init__.py index 9e3c8a4193..d2f8e47558 100644 --- a/fv3net/__init__.py +++ b/fv3net/__init__.py @@ -3,4 +3,4 @@ TOP_LEVEL_DIR = pathlib.Path(__file__).parent.parent.absolute() COARSENED_DIAGS_ZARR_NAME = "gfsphysics_15min_coarse.zarr" -__version__ = '0.1.0' +__version__ = "0.1.0" diff --git a/fv3net/runtime/__init__.py b/fv3net/runtime/__init__.py index 80c9e7e8f6..5cb26a73c1 100644 --- a/fv3net/runtime/__init__.py +++ b/fv3net/runtime/__init__.py @@ -1,3 +1,3 @@ from . import sklearn_interface as sklearn from .state_io import init_writers, append_to_writers, CF_TO_RESTART_MAP -from .config import get_runfile_config, get_namelist \ No newline at end of file +from .config import get_runfile_config, get_namelist From f99476680039bd29cd1dd1327fe922572706adc6 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 22:28:03 +0000 Subject: [PATCH 09/81] update readme --- README.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0020b473a2..8b86d23fc1 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,17 @@ The main data processing pipelines for this project currently utilize Google Clo Dataflow and Kubernetes with Docker images. Run scripts to deploy these workflows along with information can be found under the `workflows` directory. +## Building the fv3net docker images + +The workflows use a pair of common images: + +|Image| Description| +| `us.gcr.io/vcm-ml/prognostic_run` | fv3gfs-python with minimal fv3net and vcm installed | +| `us.gcr.io/vcm-ml/fv3net` | fv3net image with all dependencies including plotting | + +These images can be built and pushed to GCR using `make build_images` and +`make push_images`, respectively. + ## Dataflow Dataflow jobs run in a "serverless" style where data is piped between workers who @@ -117,11 +128,6 @@ If you get an error `Could not create workflow; user does not have write access trying to submit the dataflow job, do `gcloud auth application-default login` first and then retry. -## Building the fv3net docker images - -The pipelines use a pair of common images: -1. us.gcr.io/vcm-ml/prognostic_run:v0.1.0 -1. us.gcr.io/vcm-ml/fv3net ## Deploying on k8s with fv3net From 871be05d813e780f25b2374a6b68af1543786e5f Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 22:42:01 +0000 Subject: [PATCH 10/81] Fix table in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8b86d23fc1..00601d57ae 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ along with information can be found under the `workflows` directory. The workflows use a pair of common images: |Image| Description| +|-----|------------| | `us.gcr.io/vcm-ml/prognostic_run` | fv3gfs-python with minimal fv3net and vcm installed | | `us.gcr.io/vcm-ml/fv3net` | fv3net image with all dependencies including plotting | From db16a398ce2bae5353ab451fb8e7406e27269945 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 17 Mar 2020 22:46:00 +0000 Subject: [PATCH 11/81] fix yaml bug --- workflows/prognostic_run_diags/argo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/prognostic_run_diags/argo.yaml b/workflows/prognostic_run_diags/argo.yaml index a082022471..58713b68c9 100644 --- a/workflows/prognostic_run_diags/argo.yaml +++ b/workflows/prognostic_run_diags/argo.yaml @@ -32,7 +32,7 @@ spec: value: "climate-sim-pool" effect: "NoSchedule" container: - image: us.gcr.io/vcm-ml/fv3net:0.1.0 + image: us.gcr.io/vcm-ml/fv3net:0.1.0 command: ['bash', 'upload_report.sh'] workingDir: /home/jovyan/fv3net/workflows/prognostic_run_diags env: From 47d69e3d2037494b90815b1506f04db6042f4573 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 17 Mar 2020 22:58:05 +0000 Subject: [PATCH 12/81] pin pandas version to 1.0.1 --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 8f2194b35f..593d9ef017 100644 --- a/environment.yml +++ b/environment.yml @@ -18,6 +18,7 @@ dependencies: - h5netcdf - h5py>=2.10 - hypothesis + - pandas=1.0.1 - intake - intake-xarray - metpy From cfe452331253baff3bddb11339e18d244d33c7b6 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 18 Mar 2020 00:25:10 +0000 Subject: [PATCH 13/81] save and process outputs at different stages --- docker/prognostic_run/Dockerfile | 3 +- fv3net/runtime/__init__.py | 2 +- fv3net/runtime/config.py | 6 +++ workflows/one_step_jobs/_post_process.py | 20 +++++++++ workflows/one_step_jobs/_run.sh | 2 + workflows/one_step_jobs/runfile.py | 53 ++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 workflows/one_step_jobs/_post_process.py create mode 100644 workflows/one_step_jobs/_run.sh create mode 100644 workflows/one_step_jobs/runfile.py diff --git a/docker/prognostic_run/Dockerfile b/docker/prognostic_run/Dockerfile index 8fb34cc92c..296f5a339e 100644 --- a/docker/prognostic_run/Dockerfile +++ b/docker/prognostic_run/Dockerfile @@ -1,8 +1,9 @@ -FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1 +FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.3.1 COPY docker/prognostic_run/requirements.txt /tmp/requirements.txt RUN pip3 install -r /tmp/requirements.txt +RUN pip3 install wheel COPY fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl COPY vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && pip3 install /wheels/vcm-0.1.0-py3-none-any.whl diff --git a/fv3net/runtime/__init__.py b/fv3net/runtime/__init__.py index 5cb26a73c1..4ce5adce29 100644 --- a/fv3net/runtime/__init__.py +++ b/fv3net/runtime/__init__.py @@ -1,3 +1,3 @@ from . 
import sklearn_interface as sklearn from .state_io import init_writers, append_to_writers, CF_TO_RESTART_MAP -from .config import get_runfile_config, get_namelist +from .config import get_runfile_config, get_namelist, get_config diff --git a/fv3net/runtime/config.py b/fv3net/runtime/config.py index 39fe1794be..2d0f02a09f 100644 --- a/fv3net/runtime/config.py +++ b/fv3net/runtime/config.py @@ -10,6 +10,12 @@ class dotdict(dict): __delattr__ = dict.__delitem__ +def get_config(): + with open("fv3config.yml") as f: + config = yaml.safe_load(f) + return config + + def get_runfile_config(): with open("fv3config.yml") as f: config = yaml.safe_load(f) diff --git a/workflows/one_step_jobs/_post_process.py b/workflows/one_step_jobs/_post_process.py new file mode 100644 index 0000000000..62edda4fe3 --- /dev/null +++ b/workflows/one_step_jobs/_post_process.py @@ -0,0 +1,20 @@ +import xarray as xr +import numpy as np + +CHUNK = {'time': 1, 'tile': -1} + +begin = xr.open_zarr("output_dir/begin_physics.zarr") +before = xr.open_zarr("output_dir/before_physics.zarr") +after = xr.open_zarr("output_dir/after_physics.zarr") + +# make the time dims consistent +time = begin.time +before = before.drop('time') +after = after.drop('time') +begin = begin.drop('time') + +# concat data +dt = np.timedelta64(15, 'm') +time = np.arange(len(time)) * dt +ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) +ds.chunk(CHUNK).to_zarr("post_processed.zarr", mode='w') diff --git a/workflows/one_step_jobs/_run.sh b/workflows/one_step_jobs/_run.sh new file mode 100644 index 0000000000..a8a7f55ca9 --- /dev/null +++ b/workflows/one_step_jobs/_run.sh @@ -0,0 +1,2 @@ +fv3config=gs://vcm-ml-data/orchestration-testing/test-andrep/one_step_run_experiment_yaml_all-physics-off.yml_docker_image_fv3gfs-python:v0.3.1_config-version_v0.3/one_step_config/20160801.001500/fv3config.yml +fv3run --dockerimage us.gcr.io/vcm-ml/prognostic_run:0.1.0 $fv3config output_dir --runfile runfile.py diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py new file mode 100644 index 0000000000..40c76deeee --- /dev/null +++ b/workflows/one_step_jobs/runfile.py @@ -0,0 +1,53 @@ +import os +from fv3net import runtime +import yaml + +if __name__ == "__main__": + import fv3gfs + from mpi4py import MPI + +RUN_DIR = os.path.dirname(os.path.realpath(__file__)) + +DELP = "pressure_thickness_of_atmospheric_layer" +TIME = 'time' +VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP, TIME] + +rank = MPI.COMM_WORLD.Get_rank() +current_dir = os.getcwd() +config = runtime.get_config() +MPI.COMM_WORLD.barrier() # wait for master rank to write run directory + +partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) + +before_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "before_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, +) + +after_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "after_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, +) + +begin_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "begin_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, +) + +fv3gfs.initialize() +state = fv3gfs.get_state(names=VARIABLES) +for i in range(fv3gfs.get_step_count()): + begin_monitor.store(state) + fv3gfs.step_dynamics() + state = fv3gfs.get_state(names=VARIABLES) + before_monitor.store(state) + fv3gfs.step_physics() + state = fv3gfs.get_state(names=VARIABLES) + 
after_monitor.store(state) +fv3gfs.cleanup() From 1b120ad70dbaa093e8d41d1bc8fbde98f415bb53 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 18 Mar 2020 20:05:23 +0000 Subject: [PATCH 14/81] post process runfile --- fv3net/pipelines/kube_jobs/one_step.py | 3 +++ workflows/one_step_jobs/runfile.py | 34 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 3c6484f8a0..6814a2ddeb 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -30,6 +30,9 @@ logger = logging.getLogger(__name__) +def create_zarr_store() + + def timesteps_to_process( input_url: str, output_url: str, diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 40c76deeee..c1118b45fd 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,6 +1,35 @@ import os from fv3net import runtime import yaml +import xarray as xr +import numpy as np +import logging + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__file__) + + +def post_process(): + logger.info("Post processing model outputs") + CHUNK = {'time': 1, 'tile': -1} + + begin = xr.open_zarr("begin_physics.zarr") + before = xr.open_zarr("before_physics.zarr") + after = xr.open_zarr("after_physics.zarr") + + # make the time dims consistent + time = begin.time + before = before.drop('time') + after = after.drop('time') + begin = begin.drop('time') + + # concat data + dt = np.timedelta64(15, 'm') + time = np.arange(len(time)) * dt + ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) + ds.chunk(CHUNK).to_zarr("post_processed.zarr", mode='w') + if __name__ == "__main__": import fv3gfs @@ -42,7 +71,9 @@ fv3gfs.initialize() state = fv3gfs.get_state(names=VARIABLES) +if rank == 0: logger.info("Beginning steps") for i in range(fv3gfs.get_step_count()): + if rank == 0: logger.info(f"step {i}") begin_monitor.store(state) fv3gfs.step_dynamics() state = fv3gfs.get_state(names=VARIABLES) @@ -50,4 +81,7 @@ fv3gfs.step_physics() state = fv3gfs.get_state(names=VARIABLES) after_monitor.store(state) + +if rank == 0: + post_process() fv3gfs.cleanup() From 61c20366dd534998c4831754b20221505f984471 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 18 Mar 2020 20:18:56 +0000 Subject: [PATCH 15/81] Initialize the zarr store --- fv3net/pipelines/kube_jobs/one_step.py | 31 +++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 6814a2ddeb..d1a295068a 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -1,6 +1,8 @@ import logging import os +import zarr import fsspec +import numpy as np import uuid import yaml import re @@ -30,8 +32,35 @@ logger = logging.getLogger(__name__) -def create_zarr_store() +def _compute_chunks(shape, chunks): + return tuple( + size if chunk == -1 else chunk + for size, chunk in zip(shape, chunks) + ) + + +def _get_schema(shape=(3, 1, 6, 79, 48, 48)): + variables = ["air_temperature" , "specific_humidity", "pressure_thickness_of_atmospheric_layer"] + dims_scalar = ['initial_time', 'step', 'lead_time', 'tile', 'z', 'y', 'x'] + chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) + DTYPE = np.float32 + scalar_schema = {"dims": dims_scalar, "chunks": chunks_scalar, "dtype": DTYPE, "shape": shape} + return {key: scalar_schema for key in variables} + + +def _init_group_with_schema(group, schemas, timesteps): + for name, schema in schemas.items(): + shape = (len(timesteps),) + schema['shape'] + chunks = (1,) + schema['chunks'] + group.empty(name, shape=shape, chunks=chunks, dtype=schema['dtype']) + +def create_zarr_store(timesteps, output_url): + schemas = _get_schema() + mapper = fsspec.get_mapper(output_url) + group = zarr.open_group(mapper, mode='w') + _init_group_with_schema(group, schemas, timesteps) + def timesteps_to_process( input_url: str, From 46b4f0f35d12c13e5cfdc26e506751094b7bdd15 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 18 Mar 2020 20:24:19 +0000 Subject: [PATCH 16/81] write code to insert output in zarr store --- fv3net/pipelines/kube_jobs/one_step.py | 12 ++++++++++-- workflows/one_step_jobs/runfile.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index d1a295068a..2ee616a74c 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -337,12 +337,20 @@ def submit_jobs( local_vertical_grid_file=None, ) -> None: """Submit one-step job for all timesteps in timestep_list""" - for timestep in timestep_list: + + zarr_url = os.path.join(output_url, "big.zarr") + create_zarr_store(timestep_list, zarr_url) + + for k, timestep in enumerate(timestep_list): curr_input_url = os.path.join(input_url, timestep) - curr_output_url = os.path.join(output_url, timestep) + + # do not upload to rundirectory to cloud + curr_output_url = os.path.join("/tmp", timestep) curr_config_url = os.path.join(config_url, timestep) + one_step_config['fv3config']['one_step'] = {'index': k, 'url': zarr_url} + model_config, kube_config = prepare_and_upload_config( workflow_name, curr_input_url, diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index c1118b45fd..e81d3c9dc3 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,5 +1,7 @@ import os from fv3net import runtime +import fsspec +import zarr import yaml import xarray as xr import numpy as np @@ -12,8 +14,6 @@ def post_process(): logger.info("Post processing model outputs") - CHUNK = {'time': 1, 'tile': -1} - begin = xr.open_zarr("begin_physics.zarr") before = xr.open_zarr("before_physics.zarr") after = xr.open_zarr("after_physics.zarr") @@ -28,8 +28,16 @@ def post_process(): dt = np.timedelta64(15, 'm') time = np.arange(len(time)) * dt ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) - ds.chunk(CHUNK).to_zarr("post_processed.zarr", mode='w') + # put in storage + # this object must be initialized + index = config['one_step']['index'] + store_url = config['one_step']['url'] + mapper = fsspec.get_mapper(store_url) + group = zarr.open_group(mapper, mode='a') + for variable in group: + dims = group[variable].attrs['_ARRAY_DIMENSIONS'] + group[variable][index] = np.asarray(ds[variable].transpose(*dims)) if __name__ == "__main__": import fv3gfs From f6827c060076660970abb400e31a60e8854ee424 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 18 Mar 2020 22:25:19 +0000 Subject: [PATCH 17/81] get data for one time-step saved in the cloud --- Makefile | 1 + docker/prognostic_run/Dockerfile | 3 ++- external/fv3config | 2 +- fv3net/pipelines/kube_jobs/one_step.py | 14 ++++++++------ workflows/one_step_jobs/_run_steps.sh | 14 ++++++++++++++ workflows/one_step_jobs/deep-conv-off.yml | 1 + workflows/one_step_jobs/runfile.py | 4 +++- workflows/one_step_jobs/zarr_stat.py | 9 +++++++++ 8 files changed, 39 insertions(+), 9 deletions(-) create mode 100644 workflows/one_step_jobs/_run_steps.sh create mode 100644 workflows/one_step_jobs/zarr_stat.py diff --git a/Makefile b/Makefile index 716ce26156..27a45ce1fe 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ endif wheels: pip wheel --no-deps . 
pip wheel --no-deps external/vcm + pip wheel --no-deps external/fv3config # pattern rule for building docker images build_image_%: diff --git a/docker/prognostic_run/Dockerfile b/docker/prognostic_run/Dockerfile index 296f5a339e..06ba9b87ab 100644 --- a/docker/prognostic_run/Dockerfile +++ b/docker/prognostic_run/Dockerfile @@ -6,4 +6,5 @@ RUN pip3 install -r /tmp/requirements.txt RUN pip3 install wheel COPY fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl COPY vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl -RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && pip3 install /wheels/vcm-0.1.0-py3-none-any.whl +COPY external/fv3config /opt/fv3config +RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && pip3 install /wheels/vcm-0.1.0-py3-none-any.whl /opt/fv3config diff --git a/external/fv3config b/external/fv3config index e6dc95cef8..fd04e9427e 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit e6dc95cef8c755e608dc432c97238907d2cdf558 +Subproject commit fd04e9427efe7ec16aec6bff5e867795c2c3b97b diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 2ee616a74c..e96ccbb240 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -39,9 +39,9 @@ def _compute_chunks(shape, chunks): ) -def _get_schema(shape=(3, 1, 6, 79, 48, 48)): +def _get_schema(shape=(3, 15, 6, 79, 48, 48)): variables = ["air_temperature" , "specific_humidity", "pressure_thickness_of_atmospheric_layer"] - dims_scalar = ['initial_time', 'step', 'lead_time', 'tile', 'z', 'y', 'x'] + dims_scalar = ['step', 'lead_time', 'tile', 'z', 'y', 'x'] chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) DTYPE = np.float32 scalar_schema = {"dims": dims_scalar, "chunks": chunks_scalar, "dtype": DTYPE, "shape": shape} @@ -52,7 +52,10 @@ def _init_group_with_schema(group, schemas, timesteps): for name, schema in schemas.items(): shape = (len(timesteps),) + schema['shape'] chunks = (1,) + schema['chunks'] - group.empty(name, shape=shape, chunks=chunks, dtype=schema['dtype']) + array = group.empty(name, shape=shape, chunks=chunks, dtype=schema['dtype']) + array.attrs.update({ + '_ARRAY_DIMENSIONS': ['initial_time'] + schema['dims'] + }) def create_zarr_store(timesteps, output_url): @@ -350,6 +353,7 @@ def submit_jobs( curr_config_url = os.path.join(config_url, timestep) one_step_config['fv3config']['one_step'] = {'index': k, 'url': zarr_url} + print(curr_output_url, curr_config_url) model_config, kube_config = prepare_and_upload_config( workflow_name, @@ -361,12 +365,10 @@ def submit_jobs( local_vertical_grid_file=local_vertical_grid_file, ) - jobname = model_config["experiment_name"] - kube_config["jobname"] = jobname fv3config.run_kubernetes( os.path.join(curr_config_url, "fv3config.yml"), curr_output_url, job_labels=job_labels, **kube_config, ) - logger.info(f"Submitted job {jobname}") + logger.info(f"Submitted job for timestep {timestep}") diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh new file mode 100644 index 0000000000..76eee735c8 --- /dev/null +++ b/workflows/one_step_jobs/_run_steps.sh @@ -0,0 +1,14 @@ +workdir=$(pwd) + src=gs://vcm-ml-data/orchestration-testing/test-andrep/coarsen_restarts_source-resolution_384_target-resolution_48/ + output=gs://vcm-ml-data/testing-noah/one-step + image=us.gcr.io/vcm-ml/prognostic_run:0.1.0 + yaml=$PWD/deep-conv-off.yml + + gsutil -m rm -r $output > /dev/null + ( + 
cd ../../ + python $workdir/orchestrate_submit_jobs.py \ + $src $output $yaml $image -o --n-steps 1 \ + --config-version v0.3 + ) + diff --git a/workflows/one_step_jobs/deep-conv-off.yml b/workflows/one_step_jobs/deep-conv-off.yml index 78e2196f37..0c29d6ade8 100644 --- a/workflows/one_step_jobs/deep-conv-off.yml +++ b/workflows/one_step_jobs/deep-conv-off.yml @@ -1,5 +1,6 @@ kubernetes: docker_image: us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1 + runfile: workflows/one_step_jobs/runfile.py fv3config: diag_table: workflows/one_step_jobs/diag_table_one_step namelist: diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index e81d3c9dc3..8280668ad9 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -28,6 +28,7 @@ def post_process(): dt = np.timedelta64(15, 'm') time = np.arange(len(time)) * dt ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) + ds = ds.rename({'time': 'lead_time'}) # put in storage # this object must be initialized @@ -36,7 +37,8 @@ def post_process(): mapper = fsspec.get_mapper(store_url) group = zarr.open_group(mapper, mode='a') for variable in group: - dims = group[variable].attrs['_ARRAY_DIMENSIONS'] + logger.info(f"Writing {variable} to {group}") + dims = group[variable].attrs['_ARRAY_DIMENSIONS'][1:] group[variable][index] = np.asarray(ds[variable].transpose(*dims)) if __name__ == "__main__": diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py new file mode 100644 index 0000000000..31f7565978 --- /dev/null +++ b/workflows/one_step_jobs/zarr_stat.py @@ -0,0 +1,9 @@ +import fsspec +import zarr + +url = "gs://vcm-ml-data/testing-noah/one-step/big.zarr/" +m = fsspec.get_mapper(url) +g = zarr.open_group(m, mode='r') + +print(g['air_temperature'][:].std()) + From 7eb6314049d69de9352fb53eb9a216e8441b9abf Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 18 Mar 2020 22:31:27 +0000 Subject: [PATCH 18/81] test for full set of steps --- workflows/one_step_jobs/_run_steps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 76eee735c8..49310166b4 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -8,7 +8,7 @@ workdir=$(pwd) ( cd ../../ python $workdir/orchestrate_submit_jobs.py \ - $src $output $yaml $image -o --n-steps 1 \ + $src $output $yaml $image -o \ --config-version v0.3 ) From 298182431e626292727b0471f6d005c521427874 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 18:30:14 +0000 Subject: [PATCH 19/81] update fv3config submodule --- external/fv3config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/fv3config b/external/fv3config index fd04e9427e..f502aa8bab 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit fd04e9427efe7ec16aec6bff5e867795c2c3b97b +Subproject commit f502aa8babed969aa6c0e9f9f0a48aaba7a281d1 From 3f6eddaad756283ddc5b5ff34029f47f1cd43115 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 18:34:03 +0000 Subject: [PATCH 20/81] update fv3config to master --- external/fv3config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/fv3config b/external/fv3config index f502aa8bab..b598405973 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit f502aa8babed969aa6c0e9f9f0a48aaba7a281d1 +Subproject commit b5984059734d3c1e9d4f60d554fa8a159b7ae85c From a735a2e5934075f55d76ae1d2e264297ab2a6bc6 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 18:46:49 +0000 Subject: [PATCH 21/81] Refactor config generation It looks cleaner to use a closure for this --- fv3net/pipelines/kube_jobs/one_step.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index e96ccbb240..9e0e4d68d9 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -269,7 +269,7 @@ def _upload_config_files( config_url: str, local_vertical_grid_file=None, upload_config_filename="fv3config.yml", -) -> Tuple[dict]: +) -> Tuple[str, dict]: """ Upload any files to remote paths necessary for fv3config and the fv3gfs one-step runs. @@ -294,7 +294,7 @@ def _upload_config_files( with fsspec.open(config_path, "w") as config_file: config_file.write(yaml.dump(model_config)) - return model_config, kubernetes_config + return config_path, kubernetes_config def prepare_and_upload_config( @@ -344,18 +344,13 @@ def submit_jobs( zarr_url = os.path.join(output_url, "big.zarr") create_zarr_store(timestep_list, zarr_url) - for k, timestep in enumerate(timestep_list): - + def config_factory(index): + timestep = timestep_list[index] curr_input_url = os.path.join(input_url, timestep) - - # do not upload to rundirectory to cloud - curr_output_url = os.path.join("/tmp", timestep) curr_config_url = os.path.join(config_url, timestep) - one_step_config['fv3config']['one_step'] = {'index': k, 'url': zarr_url} - print(curr_output_url, curr_config_url) - - model_config, kube_config = prepare_and_upload_config( + one_step_config['fv3config']['one_step'] = {'index': index, 'url': zarr_url} + return prepare_and_upload_config( workflow_name, curr_input_url, curr_config_url, @@ -365,10 +360,11 @@ def submit_jobs( local_vertical_grid_file=local_vertical_grid_file, ) + for k, timestep in enumerate(timestep_list): + logger.info(f"Submitting job for timestep {timestep}") + model_config_url, kube_config = config_factory(k) fv3config.run_kubernetes( - os.path.join(curr_config_url, "fv3config.yml"), - curr_output_url, - job_labels=job_labels, + model_config_url, + "/tmp/null", **kube_config, ) - logger.info(f"Submitted job for timestep {timestep}") From 8a7a49cd6cb4ad083b01fe10de8da4e16435e842 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 19:06:07 +0000 Subject: [PATCH 22/81] print more info in zarr_stat --- workflows/one_step_jobs/zarr_stat.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index 31f7565978..f235d4c92e 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -1,9 +1,24 @@ import fsspec +import xarray as xr import zarr url = "gs://vcm-ml-data/testing-noah/one-step/big.zarr/" m = fsspec.get_mapper(url) -g = zarr.open_group(m, mode='r') +ds = xr.open_zarr(m) + +print("output structure:") +print() +for root, dirname, filename in fsspec.filesystem('gs').walk("gs://vcm-ml-data/testing-noah/one-step"): + if not 'big.zarr' in root: + for name in filename: + print(f"{root}/{name}") + for name in dirname: + print(f"{root}/{name}/") + +print() +print("big.zarr info:") +print() +print(ds.info()) +print(ds.air_temperature.std().compute()) -print(g['air_temperature'][:].std()) From ff62ff3c981ef701017eb40e422bed19bec67b6b Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 19:07:04 +0000 Subject: [PATCH 23/81] remove some files --- workflows/one_step_jobs/_post_process.py | 20 -------------------- workflows/one_step_jobs/_run.sh | 2 -- 2 files changed, 22 deletions(-) delete mode 100644 workflows/one_step_jobs/_post_process.py delete mode 100644 workflows/one_step_jobs/_run.sh diff --git a/workflows/one_step_jobs/_post_process.py b/workflows/one_step_jobs/_post_process.py deleted file mode 100644 index 62edda4fe3..0000000000 --- a/workflows/one_step_jobs/_post_process.py +++ /dev/null @@ -1,20 +0,0 @@ -import xarray as xr -import numpy as np - -CHUNK = {'time': 1, 'tile': -1} - -begin = xr.open_zarr("output_dir/begin_physics.zarr") -before = xr.open_zarr("output_dir/before_physics.zarr") -after = xr.open_zarr("output_dir/after_physics.zarr") - -# make the time dims consistent -time = begin.time -before = before.drop('time') -after = after.drop('time') -begin = begin.drop('time') - -# concat data -dt = np.timedelta64(15, 'm') -time = np.arange(len(time)) * dt -ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) -ds.chunk(CHUNK).to_zarr("post_processed.zarr", mode='w') diff --git a/workflows/one_step_jobs/_run.sh b/workflows/one_step_jobs/_run.sh deleted file mode 100644 index a8a7f55ca9..0000000000 --- a/workflows/one_step_jobs/_run.sh +++ /dev/null @@ -1,2 +0,0 @@ -fv3config=gs://vcm-ml-data/orchestration-testing/test-andrep/one_step_run_experiment_yaml_all-physics-off.yml_docker_image_fv3gfs-python:v0.3.1_config-version_v0.3/one_step_config/20160801.001500/fv3config.yml -fv3run --dockerimage us.gcr.io/vcm-ml/prognostic_run:0.1.0 $fv3config output_dir --runfile runfile.py From a152ed798fa6bc3e30e7571dd7df312b3451f7c1 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 19:26:31 +0000 Subject: [PATCH 24/81] Separate kube_config and fv3config handling The former is constant for all runs, while the latter changes for each time step. It is cleaner to separate them, and remove some of the "and" functions. 
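The sketch below (not part of the patch) restates this split in self-contained form: the run_kubernetes keyword arguments are computed once because they are identical for every job, while a closure captures what varies per timestep. The helper bodies, bucket path, and timestep list are placeholders, not the real fv3net functions.

# Self-contained sketch of the kube-kwargs / per-timestep-config separation;
# get_shared_kube_kwargs and make_config_factory are stand-ins for the real helpers.
def get_shared_kube_kwargs(user_kubernetes_config):
    # constant for every job (docker image, runfile, resources, ...)
    return dict(user_kubernetes_config)

def make_config_factory(timesteps, config_url):
    def config_factory(index):
        # everything that varies per timestep stays inside the closure
        timestep = timesteps[index]
        return f"{config_url}/{timestep}/fv3config.yml"
    return config_factory

timesteps = ["20160801.001500", "20160801.003000"]  # placeholder timesteps
kube_kwargs = get_shared_kube_kwargs(
    {"docker_image": "us.gcr.io/vcm-ml/prognostic_run:0.1.0"}
)
config_factory = make_config_factory(timesteps, "gs://example-bucket/one-step/config")
for index, timestep in enumerate(timesteps):
    model_config_url = config_factory(index)
    # in the real workflow: fv3config.run_kubernetes(model_config_url, "/tmp/null", **kube_kwargs)
    print(timestep, model_config_url, kube_kwargs["docker_image"])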
--- fv3net/pipelines/kube_jobs/one_step.py | 80 ++++++++++---------------- 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 9e0e4d68d9..fb4ea9040e 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -8,7 +8,7 @@ import re from copy import deepcopy from multiprocessing import Pool -from typing import List, Tuple +from typing import List, Tuple, Dict import fv3config from . import utils @@ -232,20 +232,16 @@ def _update_config( workflow_name: str, base_config_version: str, user_model_config: dict, - user_kubernetes_config: dict, input_url: str, config_url: str, timestep: str, -) -> Tuple[dict]: +) -> Dict: """ Update kubernetes and fv3 configurations with user inputs to prepare for fv3gfs one-step runs. """ base_model_config = utils.get_base_fv3config(base_config_version) model_config = utils.update_nested_dict(base_model_config, user_model_config) - kubernetes_config = utils.update_nested_dict( - deepcopy(KUBERNETES_CONFIG_DEFAULT), user_kubernetes_config - ) model_config = fv3config.enable_restart(model_config) model_config["experiment_name"] = _get_experiment_name(workflow_name, timestep) @@ -260,27 +256,19 @@ def _update_config( } ) - return model_config, kubernetes_config + return model_config def _upload_config_files( model_config: dict, - kubernetes_config: dict, config_url: str, - local_vertical_grid_file=None, + local_vertical_grid_file, upload_config_filename="fv3config.yml", -) -> Tuple[str, dict]: +) -> str: """ Upload any files to remote paths necessary for fv3config and the fv3gfs one-step runs. """ - - if "runfile" in kubernetes_config: - runfile_path = kubernetes_config["runfile"] - kubernetes_config["runfile"] = utils.transfer_local_to_remote( - runfile_path, config_url - ) - model_config["diag_table"] = utils.transfer_local_to_remote( model_config["diag_table"], config_url ) @@ -294,38 +282,22 @@ def _upload_config_files( with fsspec.open(config_path, "w") as config_file: config_file.write(yaml.dump(model_config)) - return config_path, kubernetes_config + return config_path -def prepare_and_upload_config( - workflow_name: str, - input_url: str, - config_url: str, - timestep: str, - one_step_config: dict, - base_config_version: str, - **kwargs, -) -> Tuple[dict]: - """Update model and kubernetes configurations for this particular - timestep and upload necessary files to GCS""" - - user_model_config = one_step_config["fv3config"] - user_kubernetes_config = one_step_config["kubernetes"] - - model_config, kube_config = _update_config( - workflow_name, - base_config_version, - user_model_config, - user_kubernetes_config, - input_url, - config_url, - timestep, - ) - model_config, kube_config = _upload_config_files( - model_config, kube_config, config_url, **kwargs +def get_run_kubernetes_kwargs(user_kubernetes_config, config_url): + + kubernetes_config = utils.update_nested_dict( + deepcopy(KUBERNETES_CONFIG_DEFAULT), user_kubernetes_config ) - return model_config, kube_config + if "runfile" in kubernetes_config: + runfile_path = kubernetes_config["runfile"] + kubernetes_config["runfile"] = utils.transfer_local_to_remote( + runfile_path, config_url + ) + + return kubernetes_config def submit_jobs( @@ -350,21 +322,27 @@ def config_factory(index): curr_config_url = os.path.join(config_url, timestep) one_step_config['fv3config']['one_step'] = {'index': index, 'url': zarr_url} - return prepare_and_upload_config( + + model_config = 
_update_config( workflow_name, + base_config_version, + one_step_config['fv3config'], curr_input_url, curr_config_url, timestep, - one_step_config, - base_config_version, - local_vertical_grid_file=local_vertical_grid_file, ) + return _upload_config_files( + model_config, curr_config_url, local_vertical_grid_file) + + # kube kwargs are shared by all jobs + kube_kwargs = get_run_kubernetes_kwargs(one_step_config['kubernetes'], config_url) for k, timestep in enumerate(timestep_list): logger.info(f"Submitting job for timestep {timestep}") - model_config_url, kube_config = config_factory(k) + model_config_url = config_factory(k) fv3config.run_kubernetes( model_config_url, "/tmp/null", - **kube_config, + job_labels=job_labels, + **kube_kwargs ) From e397c13a5c7aeb86a735097967d61c34f9445591 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 19:34:45 +0000 Subject: [PATCH 25/81] black and add workflow to job lables --- fv3net/pipelines/kube_jobs/one_step.py | 48 ++++++++++--------- .../one_step_jobs/orchestrate_submit_jobs.py | 5 +- workflows/one_step_jobs/runfile.py | 31 +++++++----- workflows/one_step_jobs/zarr_stat.py | 8 ++-- 4 files changed, 51 insertions(+), 41 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index fb4ea9040e..7addccdbae 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -33,37 +33,41 @@ def _compute_chunks(shape, chunks): - return tuple( - size if chunk == -1 else chunk - for size, chunk in zip(shape, chunks) - ) + return tuple(size if chunk == -1 else chunk for size, chunk in zip(shape, chunks)) def _get_schema(shape=(3, 15, 6, 79, 48, 48)): - variables = ["air_temperature" , "specific_humidity", "pressure_thickness_of_atmospheric_layer"] - dims_scalar = ['step', 'lead_time', 'tile', 'z', 'y', 'x'] + variables = [ + "air_temperature", + "specific_humidity", + "pressure_thickness_of_atmospheric_layer", + ] + dims_scalar = ["step", "lead_time", "tile", "z", "y", "x"] chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) DTYPE = np.float32 - scalar_schema = {"dims": dims_scalar, "chunks": chunks_scalar, "dtype": DTYPE, "shape": shape} + scalar_schema = { + "dims": dims_scalar, + "chunks": chunks_scalar, + "dtype": DTYPE, + "shape": shape, + } return {key: scalar_schema for key in variables} def _init_group_with_schema(group, schemas, timesteps): for name, schema in schemas.items(): - shape = (len(timesteps),) + schema['shape'] - chunks = (1,) + schema['chunks'] - array = group.empty(name, shape=shape, chunks=chunks, dtype=schema['dtype']) - array.attrs.update({ - '_ARRAY_DIMENSIONS': ['initial_time'] + schema['dims'] - }) + shape = (len(timesteps),) + schema["shape"] + chunks = (1,) + schema["chunks"] + array = group.empty(name, shape=shape, chunks=chunks, dtype=schema["dtype"]) + array.attrs.update({"_ARRAY_DIMENSIONS": ["initial_time"] + schema["dims"]}) def create_zarr_store(timesteps, output_url): schemas = _get_schema() mapper = fsspec.get_mapper(output_url) - group = zarr.open_group(mapper, mode='w') + group = zarr.open_group(mapper, mode="w") _init_group_with_schema(group, schemas, timesteps) - + def timesteps_to_process( input_url: str, @@ -321,28 +325,26 @@ def config_factory(index): curr_input_url = os.path.join(input_url, timestep) curr_config_url = os.path.join(config_url, timestep) - one_step_config['fv3config']['one_step'] = {'index': index, 'url': zarr_url} + one_step_config["fv3config"]["one_step"] = {"index": index, 
"url": zarr_url} model_config = _update_config( workflow_name, base_config_version, - one_step_config['fv3config'], + one_step_config["fv3config"], curr_input_url, curr_config_url, timestep, ) return _upload_config_files( - model_config, curr_config_url, local_vertical_grid_file) + model_config, curr_config_url, local_vertical_grid_file + ) # kube kwargs are shared by all jobs - kube_kwargs = get_run_kubernetes_kwargs(one_step_config['kubernetes'], config_url) + kube_kwargs = get_run_kubernetes_kwargs(one_step_config["kubernetes"], config_url) for k, timestep in enumerate(timestep_list): logger.info(f"Submitting job for timestep {timestep}") model_config_url = config_factory(k) fv3config.run_kubernetes( - model_config_url, - "/tmp/null", - job_labels=job_labels, - **kube_kwargs + model_config_url, "/tmp/null", job_labels=job_labels, **kube_kwargs ) diff --git a/workflows/one_step_jobs/orchestrate_submit_jobs.py b/workflows/one_step_jobs/orchestrate_submit_jobs.py index a4fc482b9f..367298755e 100644 --- a/workflows/one_step_jobs/orchestrate_submit_jobs.py +++ b/workflows/one_step_jobs/orchestrate_submit_jobs.py @@ -83,7 +83,10 @@ def _create_arg_parser(): one_step_config = yaml.load(file, Loader=yaml.FullLoader) workflow_name = Path(args.one_step_yaml).with_suffix("").name short_id = get_alphanumeric_unique_tag(8) - job_label = {"orchestrator-jobs": f"{workflow_name}-{short_id}"} + job_label = { + "orchestrator-jobs": f"{workflow_name}-{short_id}", + "workflow": "one_step_jobs", + } if not args.config_url: config_url = os.path.join(args.output_url, CONFIG_DIRECTORY_NAME) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 8280668ad9..8a424aa40a 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -20,27 +20,30 @@ def post_process(): # make the time dims consistent time = begin.time - before = before.drop('time') - after = after.drop('time') - begin = begin.drop('time') + before = before.drop("time") + after = after.drop("time") + begin = begin.drop("time") # concat data - dt = np.timedelta64(15, 'm') + dt = np.timedelta64(15, "m") time = np.arange(len(time)) * dt - ds = xr.concat([begin, before, after], dim='step').assign_coords(step=['begin', 'after_dynamics', 'after_physics'], time=time) - ds = ds.rename({'time': 'lead_time'}) + ds = xr.concat([begin, before, after], dim="step").assign_coords( + step=["begin", "after_dynamics", "after_physics"], time=time + ) + ds = ds.rename({"time": "lead_time"}) # put in storage # this object must be initialized - index = config['one_step']['index'] - store_url = config['one_step']['url'] + index = config["one_step"]["index"] + store_url = config["one_step"]["url"] mapper = fsspec.get_mapper(store_url) - group = zarr.open_group(mapper, mode='a') + group = zarr.open_group(mapper, mode="a") for variable in group: logger.info(f"Writing {variable} to {group}") - dims = group[variable].attrs['_ARRAY_DIMENSIONS'][1:] + dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] group[variable][index] = np.asarray(ds[variable].transpose(*dims)) + if __name__ == "__main__": import fv3gfs from mpi4py import MPI @@ -48,7 +51,7 @@ def post_process(): RUN_DIR = os.path.dirname(os.path.realpath(__file__)) DELP = "pressure_thickness_of_atmospheric_layer" -TIME = 'time' +TIME = "time" VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP, TIME] rank = MPI.COMM_WORLD.Get_rank() @@ -81,9 +84,11 @@ def post_process(): fv3gfs.initialize() state = fv3gfs.get_state(names=VARIABLES) -if rank == 0: 
logger.info("Beginning steps") +if rank == 0: + logger.info("Beginning steps") for i in range(fv3gfs.get_step_count()): - if rank == 0: logger.info(f"step {i}") + if rank == 0: + logger.info(f"step {i}") begin_monitor.store(state) fv3gfs.step_dynamics() state = fv3gfs.get_state(names=VARIABLES) diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index f235d4c92e..ac315597be 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -8,8 +8,10 @@ print("output structure:") print() -for root, dirname, filename in fsspec.filesystem('gs').walk("gs://vcm-ml-data/testing-noah/one-step"): - if not 'big.zarr' in root: +for root, dirname, filename in fsspec.filesystem("gs").walk( + "gs://vcm-ml-data/testing-noah/one-step" +): + if not "big.zarr" in root: for name in filename: print(f"{root}/{name}") for name in dirname: @@ -20,5 +22,3 @@ print() print(ds.info()) print(ds.air_temperature.std().compute()) - - From 71f73002a273071b53e04451576c673149539391 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 19:40:24 +0000 Subject: [PATCH 26/81] linter --- fv3net/pipelines/kube_jobs/one_step.py | 2 +- workflows/one_step_jobs/runfile.py | 1 - workflows/one_step_jobs/zarr_stat.py | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 7addccdbae..3762503f97 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -8,7 +8,7 @@ import re from copy import deepcopy from multiprocessing import Pool -from typing import List, Tuple, Dict +from typing import List, Dict import fv3config from . import utils diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 8a424aa40a..04d0d8cc9e 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -2,7 +2,6 @@ from fv3net import runtime import fsspec import zarr -import yaml import xarray as xr import numpy as np import logging diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index ac315597be..d36b896963 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -1,6 +1,5 @@ import fsspec import xarray as xr -import zarr url = "gs://vcm-ml-data/testing-noah/one-step/big.zarr/" m = fsspec.get_mapper(url) @@ -11,7 +10,7 @@ for root, dirname, filename in fsspec.filesystem("gs").walk( "gs://vcm-ml-data/testing-noah/one-step" ): - if not "big.zarr" in root: + if "big.zarr" not in root: for name in filename: print(f"{root}/{name}") for name in dirname: From 74d5c9b5d45a6be309d9f05dc3d1782af6378720 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 21:01:48 +0000 Subject: [PATCH 27/81] save coordinate information and attrs to the big zarr --- fv3net/pipelines/kube_jobs/one_step.py | 2 +- workflows/one_step_jobs/runfile.py | 125 +++++++++++++------------ workflows/one_step_jobs/zarr_stat.py | 1 + 3 files changed, 69 insertions(+), 59 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 3762503f97..983f10821f 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -42,7 +42,7 @@ def _get_schema(shape=(3, 15, 6, 79, 48, 48)): "specific_humidity", "pressure_thickness_of_atmospheric_layer", ] - dims_scalar = ["step", "lead_time", "tile", "z", "y", "x"] + dims_scalar = ["step", "forecast_time", "tile", "z", "y", "x"] chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) DTYPE = np.float32 scalar_schema = { diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 04d0d8cc9e..113ccbd177 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -11,11 +11,11 @@ logger = logging.getLogger(__file__) -def post_process(): +def post_process(out_dir, store_url, index): logger.info("Post processing model outputs") - begin = xr.open_zarr("begin_physics.zarr") - before = xr.open_zarr("before_physics.zarr") - after = xr.open_zarr("after_physics.zarr") + begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") + before = xr.open_zarr(f"{out_dir}/before_physics.zarr") + after = xr.open_zarr(f"{out_dir}/after_physics.zarr") # make the time dims consistent time = begin.time @@ -29,73 +29,82 @@ def post_process(): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "lead_time"}) + ds = ds.rename({"time": "forecast_time"}) # put in storage # this object must be initialized - index = config["one_step"]["index"] - store_url = config["one_step"]["url"] mapper = fsspec.get_mapper(store_url) group = zarr.open_group(mapper, mode="a") - for variable in group: + for variable in ds: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] + group[variable].attrs.update(ds[variable].attrs) group[variable][index] = np.asarray(ds[variable].transpose(*dims)) + for coord in ds.coords: + if coord not in group: + logger.info(f"writing {coord} to group") + group[coord] = np.asarray(ds[coord]) + group[coord].attrs.update({ + '_ARRAY_DIMENSIONS': ds[coord].dims + }) + group[coord].attrs.update(ds[coord].attrs) + + if __name__ == "__main__": import fv3gfs from mpi4py import MPI -RUN_DIR = os.path.dirname(os.path.realpath(__file__)) - -DELP = "pressure_thickness_of_atmospheric_layer" -TIME = "time" -VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP, TIME] - -rank = MPI.COMM_WORLD.Get_rank() -current_dir = os.getcwd() -config = runtime.get_config() -MPI.COMM_WORLD.barrier() # wait for master rank to write run directory - -partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) - -before_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "before_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, -) - -after_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "after_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, -) - -begin_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "begin_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, -) - 
-fv3gfs.initialize() -state = fv3gfs.get_state(names=VARIABLES) -if rank == 0: - logger.info("Beginning steps") -for i in range(fv3gfs.get_step_count()): - if rank == 0: - logger.info(f"step {i}") - begin_monitor.store(state) - fv3gfs.step_dynamics() - state = fv3gfs.get_state(names=VARIABLES) - before_monitor.store(state) - fv3gfs.step_physics() + RUN_DIR = os.path.dirname(os.path.realpath(__file__)) + + DELP = "pressure_thickness_of_atmospheric_layer" + TIME = "time" + VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP, TIME] + + rank = MPI.COMM_WORLD.Get_rank() + current_dir = os.getcwd() + config = runtime.get_config() + MPI.COMM_WORLD.barrier() # wait for master rank to write run directory + + partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) + + before_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "before_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, + ) + + after_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "after_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, + ) + + begin_monitor = fv3gfs.ZarrMonitor( + os.path.join(RUN_DIR, "begin_physics.zarr"), + partitioner, + mode="w", + mpi_comm=MPI.COMM_WORLD, + ) + + fv3gfs.initialize() state = fv3gfs.get_state(names=VARIABLES) - after_monitor.store(state) + if rank == 0: + logger.info("Beginning steps") + for i in range(fv3gfs.get_step_count()): + if rank == 0: + logger.info(f"step {i}") + begin_monitor.store(state) + fv3gfs.step_dynamics() + state = fv3gfs.get_state(names=VARIABLES) + before_monitor.store(state) + fv3gfs.step_physics() + state = fv3gfs.get_state(names=VARIABLES) + after_monitor.store(state) -if rank == 0: - post_process() -fv3gfs.cleanup() + if rank == 0: + post_process(RUN_DIR, config['one_step']['url'], config['one_step']['index']) + fv3gfs.cleanup() diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index d36b896963..214ecdd6e5 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -18,6 +18,7 @@ print() print("big.zarr info:") +print(ds) print() print(ds.info()) print(ds.air_temperature.std().compute()) From 27cb13df4ef15d1c6c7e7f6c38c215350ab503b5 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 21:08:25 +0000 Subject: [PATCH 28/81] add coordinate info in runfile --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 113ccbd177..e5fabab650 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -38,9 +38,9 @@ def post_process(out_dir, store_url, index): for variable in ds: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] - group[variable].attrs.update(ds[variable].attrs) group[variable][index] = np.asarray(ds[variable].transpose(*dims)) + # TODO maybe move this code to coordinating file for coord in ds.coords: if coord not in group: logger.info(f"writing {coord} to group") From b8a9c0421ffd2c6165ed28df3252c3da8aef31e2 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 22:15:35 +0000 Subject: [PATCH 29/81] debug --- fv3net/pipelines/kube_jobs/one_step.py | 76 +++++++++--------------- workflows/one_step_jobs/runfile.py | 80 ++++++++++++++++++++------ workflows/one_step_jobs/zarr_stat.py | 10 ++-- 3 files changed, 97 insertions(+), 69 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 983f10821f..1b74d37d70 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -2,6 +2,7 @@ import os import zarr import fsspec +from toolz import assoc import numpy as np import uuid import yaml @@ -32,43 +33,6 @@ logger = logging.getLogger(__name__) -def _compute_chunks(shape, chunks): - return tuple(size if chunk == -1 else chunk for size, chunk in zip(shape, chunks)) - - -def _get_schema(shape=(3, 15, 6, 79, 48, 48)): - variables = [ - "air_temperature", - "specific_humidity", - "pressure_thickness_of_atmospheric_layer", - ] - dims_scalar = ["step", "forecast_time", "tile", "z", "y", "x"] - chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) - DTYPE = np.float32 - scalar_schema = { - "dims": dims_scalar, - "chunks": chunks_scalar, - "dtype": DTYPE, - "shape": shape, - } - return {key: scalar_schema for key in variables} - - -def _init_group_with_schema(group, schemas, timesteps): - for name, schema in schemas.items(): - shape = (len(timesteps),) + schema["shape"] - chunks = (1,) + schema["chunks"] - array = group.empty(name, shape=shape, chunks=chunks, dtype=schema["dtype"]) - array.attrs.update({"_ARRAY_DIMENSIONS": ["initial_time"] + schema["dims"]}) - - -def create_zarr_store(timesteps, output_url): - schemas = _get_schema() - mapper = fsspec.get_mapper(output_url) - group = zarr.open_group(mapper, mode="w") - _init_group_with_schema(group, schemas, timesteps) - - def timesteps_to_process( input_url: str, output_url: str, @@ -318,19 +282,22 @@ def submit_jobs( """Submit one-step job for all timesteps in timestep_list""" zarr_url = os.path.join(output_url, "big.zarr") - create_zarr_store(timestep_list, zarr_url) + # kube kwargs are shared by all jobs + kube_kwargs = get_run_kubernetes_kwargs(one_step_config["kubernetes"], config_url) - def config_factory(index): - timestep = timestep_list[index] + def config_factory(**kwargs): + timestep = timestep_list[kwargs['index']] curr_input_url = os.path.join(input_url, timestep) curr_config_url = os.path.join(config_url, timestep) - one_step_config["fv3config"]["one_step"] = {"index": index, "url": zarr_url} + config = deepcopy(one_step_config) + kwargs['url'] = zarr_url + config["fv3config"]['one_step'] = kwargs model_config = _update_config( workflow_name, base_config_version, - one_step_config["fv3config"], + config["fv3config"], curr_input_url, curr_config_url, timestep, @@ -339,12 +306,25 @@ def config_factory(index): model_config, curr_config_url, local_vertical_grid_file ) - # kube kwargs are shared by all jobs - kube_kwargs = get_run_kubernetes_kwargs(one_step_config["kubernetes"], config_url) + def run_job(wait=False, **kwargs): + """Run a run_kubernetes job - for k, timestep in enumerate(timestep_list): - logger.info(f"Submitting job for timestep {timestep}") - model_config_url = config_factory(k) + kwargs are passed workflows/one_step_jobs/runfile.py:post_process + + """ + uid = str(uuid.uuid4()) + labels = assoc(job_labels, 'jobid', uid) + model_config_url = config_factory(**kwargs) fv3config.run_kubernetes( - model_config_url, "/tmp/null", job_labels=job_labels, 
**kube_kwargs + model_config_url, "/tmp/null", job_labels=labels, **kube_kwargs ) + if wait: + utils.wait_for_complete(job_labels, sleep_interval=10) + + for k, timestep in enumerate(timestep_list): + if k == 0: + logger.info("Running the first time step to initialize the zarr store") + run_job(index=k, init=True, wait=True, timesteps=timestep_list) + else: + logger.info(f"Submitting job for timestep {timestep}") + run_job(index=k, init=False) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index e5fabab650..19c57e8b9e 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -11,7 +11,63 @@ logger = logging.getLogger(__file__) -def post_process(out_dir, store_url, index): +def _compute_chunks(shape, chunks): + return tuple(size if chunk == -1 else chunk for size, chunk in zip(shape, chunks)) + + +def _get_schema(shape=(3, 15, 6, 79, 48, 48)): + variables = [ + "air_temperature", + "specific_humidity", + "pressure_thickness_of_atmospheric_layer", + ] + dims_scalar = ["step", "forecast_time", "tile", "z", "y", "x"] + chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) + DTYPE = np.float32 + scalar_schema = { + "dims": dims_scalar, + "chunks": chunks_scalar, + "dtype": DTYPE, + "shape": shape, + } + return {key: scalar_schema for key in variables} + + +def _init_group_with_schema(group, schemas, timesteps): + for name, schema in schemas.items(): + shape = (len(timesteps),) + schema["shape"] + chunks = (1,) + schema["chunks"] + array = group.empty(name, shape=shape, chunks=chunks, dtype=schema["dtype"]) + array.attrs.update({"_ARRAY_DIMENSIONS": ["initial_time"] + schema["dims"]}) + + +def init_data_var(group, array): + shape = (1,) + array.data.shape + chunks = (1,) + tuple(size[0] for size in array.data.chunks) + out_array = group.empty(name=array.name, shape=shape, chunks=chunks, dtype=array.dtype) + out_array.attrs.update(array.attrs) + out_array.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] + list(array.dims) + + +def init_coord(group, coord): + out_array = group.array(name=coord.name, data=np.asarray(coord)) + out_array.attrs.update(coord.attrs) + out_array.attrs['_ARRAY_DIMENSIONS'] = list(coord.dims) + + +def create_zarr_store(timesteps, group, template): + logger.info("Creating group") + ds = template + for name in ds: + init_data_var(group, ds[name]) + + for name in ds.coords: + init_coord(group, ds[name]) + + +def post_process(out_dir, url, index, init=False, timesteps=()): + + store_url = url logger.info("Post processing model outputs") begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") before = xr.open_zarr(f"{out_dir}/before_physics.zarr") @@ -29,28 +85,20 @@ def post_process(out_dir, store_url, index): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}) + ds = ds.rename({"time": "forecast_time"}).chunk({'forecast_time': 1, 'tile': 6}) - # put in storage - # this object must be initialized mapper = fsspec.get_mapper(store_url) group = zarr.open_group(mapper, mode="a") + + if init: + group = zarr.open_group(mapper, mode="w") + create_zarr_store(timesteps, group, ds) + for variable in ds: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] group[variable][index] = np.asarray(ds[variable].transpose(*dims)) - # TODO maybe move this code to coordinating file - for coord in ds.coords: - if coord not in group: - logger.info(f"writing 
{coord} to group") - group[coord] = np.asarray(ds[coord]) - group[coord].attrs.update({ - '_ARRAY_DIMENSIONS': ds[coord].dims - }) - group[coord].attrs.update(ds[coord].attrs) - - if __name__ == "__main__": import fv3gfs @@ -106,5 +154,5 @@ def post_process(out_dir, store_url, index): after_monitor.store(state) if rank == 0: - post_process(RUN_DIR, config['one_step']['url'], config['one_step']['index']) + post_process(RUN_DIR, **config['one_step']) fv3gfs.cleanup() diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index 214ecdd6e5..d02b521028 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -1,10 +1,5 @@ import fsspec import xarray as xr - -url = "gs://vcm-ml-data/testing-noah/one-step/big.zarr/" -m = fsspec.get_mapper(url) -ds = xr.open_zarr(m) - print("output structure:") print() for root, dirname, filename in fsspec.filesystem("gs").walk( @@ -16,6 +11,11 @@ for name in dirname: print(f"{root}/{name}/") + +url = "gs://vcm-ml-data/testing-noah/one-step/big.zarr/" +m = fsspec.get_mapper(url) +ds = xr.open_zarr(m) + print() print("big.zarr info:") print(ds) From 0cf8cff89738bce119caa07fcd675f02fb0fbc71 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 22:17:04 +0000 Subject: [PATCH 30/81] move writing into runfile --- workflows/one_step_jobs/runfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 19c57e8b9e..80ec10ca52 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -58,6 +58,7 @@ def init_coord(group, coord): def create_zarr_store(timesteps, group, template): logger.info("Creating group") ds = template + group.attrs.update(ds.attrs) for name in ds: init_data_var(group, ds[name]) From 8f443f7a36715e956c2d22cb3f8f57d614c79a2e Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 22:21:50 +0000 Subject: [PATCH 31/81] black --- fv3net/pipelines/kube_jobs/one_step.py | 10 ++++------ workflows/one_step_jobs/runfile.py | 12 +++++++----- workflows/one_step_jobs/zarr_stat.py | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index 1b74d37d70..ab923cd787 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -1,9 +1,7 @@ import logging import os -import zarr import fsspec from toolz import assoc -import numpy as np import uuid import yaml import re @@ -286,13 +284,13 @@ def submit_jobs( kube_kwargs = get_run_kubernetes_kwargs(one_step_config["kubernetes"], config_url) def config_factory(**kwargs): - timestep = timestep_list[kwargs['index']] + timestep = timestep_list[kwargs["index"]] curr_input_url = os.path.join(input_url, timestep) curr_config_url = os.path.join(config_url, timestep) config = deepcopy(one_step_config) - kwargs['url'] = zarr_url - config["fv3config"]['one_step'] = kwargs + kwargs["url"] = zarr_url + config["fv3config"]["one_step"] = kwargs model_config = _update_config( workflow_name, @@ -313,7 +311,7 @@ def run_job(wait=False, **kwargs): """ uid = str(uuid.uuid4()) - labels = assoc(job_labels, 'jobid', uid) + labels = assoc(job_labels, "jobid", uid) model_config_url = config_factory(**kwargs) fv3config.run_kubernetes( model_config_url, "/tmp/null", job_labels=labels, **kube_kwargs diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 80ec10ca52..796adecea0 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -44,15 +44,17 @@ def _init_group_with_schema(group, schemas, timesteps): def init_data_var(group, array): shape = (1,) + array.data.shape chunks = (1,) + tuple(size[0] for size in array.data.chunks) - out_array = group.empty(name=array.name, shape=shape, chunks=chunks, dtype=array.dtype) + out_array = group.empty( + name=array.name, shape=shape, chunks=chunks, dtype=array.dtype + ) out_array.attrs.update(array.attrs) - out_array.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] + list(array.dims) + out_array.attrs["_ARRAY_DIMENSIONS"] = ["initial_time"] + list(array.dims) def init_coord(group, coord): out_array = group.array(name=coord.name, data=np.asarray(coord)) out_array.attrs.update(coord.attrs) - out_array.attrs['_ARRAY_DIMENSIONS'] = list(coord.dims) + out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) def create_zarr_store(timesteps, group, template): @@ -86,7 +88,7 @@ def post_process(out_dir, url, index, init=False, timesteps=()): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}).chunk({'forecast_time': 1, 'tile': 6}) + ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) mapper = fsspec.get_mapper(store_url) group = zarr.open_group(mapper, mode="a") @@ -155,5 +157,5 @@ def post_process(out_dir, url, index, init=False, timesteps=()): after_monitor.store(state) if rank == 0: - post_process(RUN_DIR, **config['one_step']) + post_process(RUN_DIR, **config["one_step"]) fv3gfs.cleanup() diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index d02b521028..a9d0ccaf1f 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -1,5 +1,6 @@ import fsspec import xarray as xr + 
print("output structure:") print() for root, dirname, filename in fsspec.filesystem("gs").walk( From fdac3bc284e13634fe8fd37373eea9cc1c06a8a0 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 22:30:48 +0000 Subject: [PATCH 32/81] log zarr creation stuff --- workflows/one_step_jobs/runfile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 796adecea0..2fca5e028e 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -42,6 +42,7 @@ def _init_group_with_schema(group, schemas, timesteps): def init_data_var(group, array): + logger.info(f"Initializing coordinate: {array.name}") shape = (1,) + array.data.shape chunks = (1,) + tuple(size[0] for size in array.data.chunks) out_array = group.empty( @@ -52,6 +53,7 @@ def init_data_var(group, array): def init_coord(group, coord): + logger.info(f"Initializing coordinate: {coord.name}") out_array = group.array(name=coord.name, data=np.asarray(coord)) out_array.attrs.update(coord.attrs) out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) From c6ad40bbf65848f9c42722a173e2cdac4f5dbff0 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 19 Mar 2020 22:53:17 +0000 Subject: [PATCH 33/81] initialize array with timesteps --- workflows/one_step_jobs/runfile.py | 39 +++++------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 2fca5e028e..b9b9cce55b 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -11,39 +11,9 @@ logger = logging.getLogger(__file__) -def _compute_chunks(shape, chunks): - return tuple(size if chunk == -1 else chunk for size, chunk in zip(shape, chunks)) - - -def _get_schema(shape=(3, 15, 6, 79, 48, 48)): - variables = [ - "air_temperature", - "specific_humidity", - "pressure_thickness_of_atmospheric_layer", - ] - dims_scalar = ["step", "forecast_time", "tile", "z", "y", "x"] - chunks_scalar = _compute_chunks(shape, [-1, 1, -1, -1, -1, -1]) - DTYPE = np.float32 - scalar_schema = { - "dims": dims_scalar, - "chunks": chunks_scalar, - "dtype": DTYPE, - "shape": shape, - } - return {key: scalar_schema for key in variables} - - -def _init_group_with_schema(group, schemas, timesteps): - for name, schema in schemas.items(): - shape = (len(timesteps),) + schema["shape"] - chunks = (1,) + schema["chunks"] - array = group.empty(name, shape=shape, chunks=chunks, dtype=schema["dtype"]) - array.attrs.update({"_ARRAY_DIMENSIONS": ["initial_time"] + schema["dims"]}) - - -def init_data_var(group, array): +def init_data_var(group, array, nt): logger.info(f"Initializing coordinate: {array.name}") - shape = (1,) + array.data.shape + shape = (nt,) + array.data.shape chunks = (1,) + tuple(size[0] for size in array.data.chunks) out_array = group.empty( name=array.name, shape=shape, chunks=chunks, dtype=array.dtype @@ -63,11 +33,14 @@ def create_zarr_store(timesteps, group, template): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) + nt = len(timesteps) for name in ds: - init_data_var(group, ds[name]) + init_data_var(group, ds[name], nt) for name in ds.coords: init_coord(group, ds[name]) + dim = group.array('initial_time', data=timesteps) + dim.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] def post_process(out_dir, url, index, init=False, timesteps=()): From b7de07fd2e22d2811a7df87a30317c9b0178e8b4 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 23:20:45 +0000 Subject: [PATCH 34/81] save many more outputs --- workflows/one_step_jobs/runfile.py | 59 ++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index b9b9cce55b..e2f59b7597 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -10,6 +10,62 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) +DELP = "pressure_thickness_of_atmospheric_layer" +TIME = "time" + +VARIABLES = [ + "x_wind", + "y_wind", + "air_temperature", + "pressure_thickness_of_atmospheric_layer", + "vertical_wind", + "vertical_thickness_of_atmospheric_layer", + "surface_geopotential", + "eastward_wind_at_surface", + "mean_cos_zenith_angle", + "sensible_heat_flux", + "latent_heat_flux", + "convective_cloud_fraction", + "convective_cloud_top_pressure", + "convective_cloud_bottom_pressure", + "land_sea_mask", + "surface_temperature", + "water_equivalent_of_accumulated_snow_depth", + "deep_soil_temperature", + "surface_roughness", + "mean_visible_albedo_with_strong_cosz_dependency", + "mean_visible_albedo_with_weak_cosz_dependency", + "mean_near_infrared_albedo_with_strong_cosz_dependency", + "mean_near_infrared_albedo_with_weak_cosz_dependency", + "fractional_coverage_with_strong_cosz_dependency", + "fractional_coverage_with_weak_cosz_dependency", + "vegetation_fraction", + "canopy_water", + "fm_at_10m", + "air_temperature_at_2m", + "specific_humidity_at_2m", + "vegetation_type", + "soil_type", + "friction_velocity", + "fm_parameter", + "fh_parameter", + "sea_ice_thickness", + "ice_fraction_over_open_water", + "surface_temperature_over_ice_fraction", + "total_precipitation", + "snow_rain_flag", + "snow_depth_water_equivalent", + "minimum_fractional_coverage_of_green_vegetation", + "maximum_fractional_coverage_of_green_vegetation", + "surface_slope_type", + "maximum_snow_albedo_in_fraction", + "snow_cover_in_fraction", + "soil_temperature", + "total_soil_moisture", + "liquid_soil_moisture" +] + +VARIABLES = VARIABLES + [TIME] def init_data_var(group, array, nt): logger.info(f"Initializing coordinate: {array.name}") @@ -84,9 +140,6 @@ def post_process(out_dir, url, index, init=False, timesteps=()): RUN_DIR = os.path.dirname(os.path.realpath(__file__)) - DELP = "pressure_thickness_of_atmospheric_layer" - TIME = "time" - VARIABLES = list(runtime.CF_TO_RESTART_MAP) + [DELP, TIME] rank = MPI.COMM_WORLD.Get_rank() current_dir = os.getcwd() From 32ff298ef81d0697b59455ab5d59cdcd18d1cb8c Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 19 Mar 2020 23:29:50 +0000 Subject: [PATCH 35/81] parallelize opening writing across variables --- workflows/one_step_jobs/runfile.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index e2f59b7597..043715a88f 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -8,12 +8,11 @@ logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__file__) DELP = "pressure_thickness_of_atmospheric_layer" TIME = "time" -VARIABLES = [ +VARIABLES = ( "x_wind", "y_wind", "air_temperature", @@ -63,10 +62,11 @@ "soil_temperature", "total_soil_moisture", "liquid_soil_moisture" -] + ) VARIABLES = VARIABLES + [TIME] + def init_data_var(group, array, nt): logger.info(f"Initializing coordinate: {array.name}") shape = (nt,) + array.data.shape @@ -99,10 +99,10 @@ def create_zarr_store(timesteps, group, template): dim.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] -def post_process(out_dir, url, index, init=False, timesteps=()): - +def post_process(out_dir, url, index, init=False, timesteps=(), variables=VARIABLES): store_url = url logger.info("Post processing model outputs") + logger.info(f"Variables to process: {variables}") begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") before = xr.open_zarr(f"{out_dir}/before_physics.zarr") after = xr.open_zarr(f"{out_dir}/after_physics.zarr") @@ -119,13 +119,14 @@ def post_process(out_dir, url, index, init=False, timesteps=()): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) + ds = ds[variables] ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) mapper = fsspec.get_mapper(store_url) group = zarr.open_group(mapper, mode="a") if init: - group = zarr.open_group(mapper, mode="w") + group = zarr.open_group(mapper, mode="a") create_zarr_store(timesteps, group, ds) for variable in ds: @@ -140,8 +141,9 @@ def post_process(out_dir, url, index, init=False, timesteps=()): RUN_DIR = os.path.dirname(os.path.realpath(__file__)) - rank = MPI.COMM_WORLD.Get_rank() + size = MPI.COMM_WORLD.Get_size() + logger = logging.getLogger(__file__ + f"({rank}/{size})") current_dir = os.getcwd() config = runtime.get_config() MPI.COMM_WORLD.barrier() # wait for master rank to write run directory @@ -184,6 +186,7 @@ def post_process(out_dir, url, index, init=False, timesteps=()): state = fv3gfs.get_state(names=VARIABLES) after_monitor.store(state) - if rank == 0: - post_process(RUN_DIR, **config["one_step"]) + MPI.COMM_WORLD.barrier() + # parallelize across variables + post_process(RUN_DIR, variables=VARIABLES[rank::size], **config["one_step"]) fv3gfs.cleanup() From c8bff731ad2657cf9270d36ccd53822065d77190 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Fri, 20 Mar 2020 00:02:44 +0000 Subject: [PATCH 36/81] write data with only the master thread --- workflows/one_step_jobs/_run_steps.sh | 1 + workflows/one_step_jobs/runfile.py | 40 +++++++++++++++++---------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 49310166b4..e5620b5cde 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -9,6 +9,7 @@ workdir=$(pwd) cd ../../ python $workdir/orchestrate_submit_jobs.py \ $src $output $yaml $image -o \ + --n-steps 1 \ --config-version v0.3 ) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 043715a88f..145321c7a7 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -64,11 +64,10 @@ "liquid_soil_moisture" ) -VARIABLES = VARIABLES + [TIME] def init_data_var(group, array, nt): - logger.info(f"Initializing coordinate: {array.name}") + logger.info(f"Initializing variable: {array.name}") shape = (nt,) + array.data.shape chunks = (1,) + tuple(size[0] for size in array.data.chunks) out_array = group.empty( @@ -99,10 +98,9 @@ def create_zarr_store(timesteps, group, template): dim.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] -def post_process(out_dir, url, index, init=False, timesteps=(), variables=VARIABLES): +def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): store_url = url logger.info("Post processing model outputs") - logger.info(f"Variables to process: {variables}") begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") before = xr.open_zarr(f"{out_dir}/before_physics.zarr") after = xr.open_zarr(f"{out_dir}/after_physics.zarr") @@ -119,17 +117,28 @@ def post_process(out_dir, url, index, init=False, timesteps=(), variables=VARIAB ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds[variables] ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) - mapper = fsspec.get_mapper(store_url) - group = zarr.open_group(mapper, mode="a") + if comm is not None: + rank = comm.Get_rank() + else: + rank = 0 - if init: - group = zarr.open_group(mapper, mode="a") + mapper = fsspec.get_mapper(store_url) + if init and rank == 0: + group = zarr.open_group(mapper, mode="w") create_zarr_store(timesteps, group, ds) - - for variable in ds: + + if comm is None: + variables = VARIABLES + else: + comm.barrier() + variables = list(VARIABLES)[comm.rank::comm.size] + + # all processes open group + group = zarr.open_group(mapper, mode="a") + logger.info(f"Variables to process: {variables}") + for variable in ds[list(variables)]: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] group[variable][index] = np.asarray(ds[variable].transpose(*dims)) @@ -172,7 +181,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), variables=VARIAB ) fv3gfs.initialize() - state = fv3gfs.get_state(names=VARIABLES) + state = fv3gfs.get_state(names=VARIABLES + (TIME,)) if rank == 0: logger.info("Beginning steps") for i in range(fv3gfs.get_step_count()): @@ -180,13 +189,14 @@ def post_process(out_dir, url, index, init=False, timesteps=(), variables=VARIAB logger.info(f"step {i}") begin_monitor.store(state) fv3gfs.step_dynamics() - state = fv3gfs.get_state(names=VARIABLES) + state = fv3gfs.get_state(names=VARIABLES + (TIME,)) before_monitor.store(state) 
fv3gfs.step_physics() - state = fv3gfs.get_state(names=VARIABLES) + state = fv3gfs.get_state(names=VARIABLES + (TIME,)) after_monitor.store(state) MPI.COMM_WORLD.barrier() # parallelize across variables - post_process(RUN_DIR, variables=VARIABLES[rank::size], **config["one_step"]) + if rank == 0: + post_process(RUN_DIR, **config["one_step"], comm=None) fv3gfs.cleanup() From cad7e085eae18e6f40f04d8dccfd70b8ff58779c Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Fri, 20 Mar 2020 00:17:20 +0000 Subject: [PATCH 37/81] debug out of memory error --- workflows/one_step_jobs/runfile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 145321c7a7..6e5b3d2242 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -141,7 +141,8 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): for variable in ds[list(variables)]: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] - group[variable][index] = np.asarray(ds[variable].transpose(*dims)) + dask_arr = ds[variable].transpose(*dims).data + dask_arr.store(group[variable][index]) if __name__ == "__main__": From 49334f5252fa2fa0d80dd95e6083789721ee1048 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Fri, 20 Mar 2020 00:42:50 +0000 Subject: [PATCH 38/81] fix store call the operation actually wasn't storing anything! --- workflows/one_step_jobs/_run_steps.sh | 1 - workflows/one_step_jobs/runfile.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index e5620b5cde..49310166b4 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -9,7 +9,6 @@ workdir=$(pwd) cd ../../ python $workdir/orchestrate_submit_jobs.py \ $src $output $yaml $image -o \ - --n-steps 1 \ --config-version v0.3 ) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 6e5b3d2242..461bdd7e50 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -126,7 +126,8 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): mapper = fsspec.get_mapper(store_url) if init and rank == 0: - group = zarr.open_group(mapper, mode="w") + logging.info("initializing zarr store") + group = zarr.open_group(mapper, mode="a") create_zarr_store(timesteps, group, ds) if comm is None: @@ -142,7 +143,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] dask_arr = ds[variable].transpose(*dims).data - dask_arr.store(group[variable][index]) + dask_arr.store(group[variable], regions=(index,)) if __name__ == "__main__": @@ -201,3 +202,5 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): if rank == 0: post_process(RUN_DIR, **config["one_step"], comm=None) fv3gfs.cleanup() +else: + logger = logging.getLogger(__name__) From d0c77d52048361489164bc4f1538a85b2ce8ea37 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Fri, 20 Mar 2020 22:00:49 +0000 Subject: [PATCH 39/81] change versioning info in makefile --- Makefile | 22 +++++++++++++--------- docker/prognostic_run/Dockerfile | 12 ++++++++---- external/fv3config | 2 +- workflows/one_step_jobs/run_steps.sh | 0 4 files changed, 22 insertions(+), 14 deletions(-) create mode 100644 workflows/one_step_jobs/run_steps.sh diff --git a/Makefile b/Makefile index 27a45ce1fe..d1ad652916 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ ################################################################################# # GLOBALS # ################################################################################# -VERSION = 0.1.0 +VERSION = v0.1.0-a1 ENVIRONMENT_SCRIPTS = .environment-scripts PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://') @@ -24,22 +24,26 @@ endif # COMMANDS # ################################################################################# .PHONY: wheels build_images push_image -wheels: - pip wheel --no-deps . - pip wheel --no-deps external/vcm - pip wheel --no-deps external/fv3config +# wheels: +# pip wheel --no-deps . +# pip wheel --no-deps external/vcm +# pip wheel --no-deps external/fv3config # pattern rule for building docker images build_image_%: docker build -f docker/$*/Dockerfile . -t us.gcr.io/vcm-ml/$*:$(VERSION) -build_image_prognostic_run: wheels +enter_%: + docker run -ti -w /fv3net -v $(shell pwd):/fv3net us.gcr.io/vcm-ml/$*:$(VERSION) bash + +build_image_prognostic_run: build_images: build_image_fv3net build_image_prognostic_run -push_image: - docker push us.gcr.io/vcm-ml/fv3net:$(VERSION) - docker push us.gcr.io/vcm-ml/prognostic_run:$(VERSION) +push_images: push_image_prognostic_run push_image_fv3net + +push_image_%: + docker push us.gcr.io/vcm-ml/$*:$(VERSION) enter: build_image docker run -it -v $(shell pwd):/code \ diff --git a/docker/prognostic_run/Dockerfile b/docker/prognostic_run/Dockerfile index 06ba9b87ab..0746555eb9 100644 --- a/docker/prognostic_run/Dockerfile +++ b/docker/prognostic_run/Dockerfile @@ -4,7 +4,11 @@ FROM us.gcr.io/vcm-ml/fv3gfs-python:v0.3.1 COPY docker/prognostic_run/requirements.txt /tmp/requirements.txt RUN pip3 install -r /tmp/requirements.txt RUN pip3 install wheel -COPY fv3net-0.1.0-py3-none-any.whl /wheels/fv3net-0.1.0-py3-none-any.whl -COPY vcm-0.1.0-py3-none-any.whl /wheels/vcm-0.1.0-py3-none-any.whl -COPY external/fv3config /opt/fv3config -RUN pip3 install --no-deps /wheels/fv3net-0.1.0-py3-none-any.whl && pip3 install /wheels/vcm-0.1.0-py3-none-any.whl /opt/fv3config + +# cache external package installation +COPY external/fv3config /fv3net/external/fv3config +COPY external/vcm /fv3net/external/vcm +RUN pip3 install /fv3net/external/vcm /fv3net/external/fv3config + +COPY . /fv3net +RUN pip3 install --no-deps -e /fv3net diff --git a/external/fv3config b/external/fv3config index b598405973..6bde7b7354 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit b5984059734d3c1e9d4f60d554fa8a159b7ae85c +Subproject commit 6bde7b7354fa3c4c512a7178ee9b55b45189d76e diff --git a/workflows/one_step_jobs/run_steps.sh b/workflows/one_step_jobs/run_steps.sh new file mode 100644 index 0000000000..e69de29bb2 From cfed20f3fd5e94df343097dd78ac3b7bffa3dbc4 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Fri, 20 Mar 2020 22:15:58 +0000 Subject: [PATCH 40/81] add tracer variables --- docker/prognostic_run/Dockerfile | 2 +- workflows/one_step_jobs/runfile.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/prognostic_run/Dockerfile b/docker/prognostic_run/Dockerfile index 0746555eb9..362295b121 100644 --- a/docker/prognostic_run/Dockerfile +++ b/docker/prognostic_run/Dockerfile @@ -8,7 +8,7 @@ RUN pip3 install wheel # cache external package installation COPY external/fv3config /fv3net/external/fv3config COPY external/vcm /fv3net/external/vcm -RUN pip3 install /fv3net/external/vcm /fv3net/external/fv3config +RUN pip3 install -e /fv3net/external/vcm -e /fv3net/external/fv3config COPY . /fv3net RUN pip3 install --no-deps -e /fv3net diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 461bdd7e50..9b003c9fe5 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -12,10 +12,13 @@ DELP = "pressure_thickness_of_atmospheric_layer" TIME = "time" +TRACERS = ('specific_humidity', 'cloud_water_mixing_ratio', 'rain_mixing_ratio', 'cloud_ice_mixing_ratio', 'snow_mixing_ratio', 'graupel_mixing_ratio', 'ozone_mixing_ratio', 'cloud_amount') + VARIABLES = ( "x_wind", "y_wind", "air_temperature", + "specific_humidity", "pressure_thickness_of_atmospheric_layer", "vertical_wind", "vertical_thickness_of_atmospheric_layer", @@ -62,7 +65,7 @@ "soil_temperature", "total_soil_moisture", "liquid_soil_moisture" - ) + ) + TRACERS From ae2e38308dda957a378b1196dc448b93701f596d Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Fri, 20 Mar 2020 22:22:23 +0000 Subject: [PATCH 41/81] change image version and chunking --- workflows/one_step_jobs/_run_steps.sh | 2 +- workflows/one_step_jobs/runfile.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 49310166b4..00a3ef0ab8 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -1,7 +1,7 @@ workdir=$(pwd) src=gs://vcm-ml-data/orchestration-testing/test-andrep/coarsen_restarts_source-resolution_384_target-resolution_48/ output=gs://vcm-ml-data/testing-noah/one-step - image=us.gcr.io/vcm-ml/prognostic_run:0.1.0 + image=us.gcr.io/vcm-ml/prognostic_run:v0.1.0-a1 yaml=$PWD/deep-conv-off.yml gsutil -m rm -r $output > /dev/null diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 9b003c9fe5..495c5972bf 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -120,7 +120,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) + ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, 'step': 3}) if comm is not None: rank = comm.Get_rank() From 97e46270b592c56934f97ccd6c8f7a435d80d777 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Fri, 20 Mar 2020 22:26:01 +0000 Subject: [PATCH 42/81] change chunks size --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 495c5972bf..9b003c9fe5 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -120,7 +120,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, 'step': 3}) + ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) if comm is not None: rank = comm.Get_rank() From 34426c00c72afb82b234dc29afb771dac97fb80a Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Fri, 20 Mar 2020 22:46:22 +0000 Subject: [PATCH 43/81] fix out of memory errors --- workflows/one_step_jobs/_run_steps.sh | 1 + workflows/one_step_jobs/runfile.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 00a3ef0ab8..0fec150fd0 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -9,6 +9,7 @@ workdir=$(pwd) cd ../../ python $workdir/orchestrate_submit_jobs.py \ $src $output $yaml $image -o \ + --n-steps 1 \ --config-version v0.3 ) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 9b003c9fe5..1c35362796 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -5,6 +5,10 @@ import xarray as xr import numpy as np import logging +import dask + +# avoid out of memory errors +dask.config.set(scheduler='single-threaded') logging.basicConfig(level=logging.INFO) @@ -120,7 +124,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6}) + ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, "step": 3}) if comm is not None: rank = comm.Get_rank() @@ -130,7 +134,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): mapper = fsspec.get_mapper(store_url) if init and rank == 0: logging.info("initializing zarr store") - group = zarr.open_group(mapper, mode="a") + group = zarr.open_group(mapper, mode="w") create_zarr_store(timesteps, group, ds) if comm is None: @@ -202,8 +206,9 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): MPI.COMM_WORLD.barrier() # parallelize across variables - if rank == 0: - post_process(RUN_DIR, **config["one_step"], comm=None) + del state, begin_monitor, before_monitor, after_monitor + # if rank == 0: + post_process(RUN_DIR, **config["one_step"], comm=MPI.COMM_WORLD) fv3gfs.cleanup() else: logger = logging.getLogger(__name__) From fba67f5b9dae101f6c47cbf32691697a9fabcd57 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Fri, 20 Mar 2020 23:11:27 +0000 Subject: [PATCH 44/81] show dataset size in summary script --- workflows/one_step_jobs/_run_steps.sh | 1 - workflows/one_step_jobs/runfile.py | 3 ++- workflows/one_step_jobs/zarr_stat.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 0fec150fd0..00a3ef0ab8 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -9,7 +9,6 @@ workdir=$(pwd) cd ../../ python $workdir/orchestrate_submit_jobs.py \ $src $output $yaml $image -o \ - --n-steps 1 \ --config-version v0.3 ) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 1c35362796..a56d8bffff 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -161,10 +161,11 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() - logger = logging.getLogger(__file__ + f"({rank}/{size})") current_dir = os.getcwd() config = runtime.get_config() MPI.COMM_WORLD.barrier() # wait for master rank to write run directory + logger = logging.getLogger( + f"one_step:{rank}/{size}:{config['one_step']['index']}") partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index a9d0ccaf1f..f12f2bae2f 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -22,4 +22,9 @@ print(ds) print() print(ds.info()) -print(ds.air_temperature.std().compute()) + +print() +print("data size:", ds.isel(initial_time=0).nbytes/1e9, "GB/initial time") + + +#print(ds.air_temperature.std().compute()) From 4da489b9ddaeb30aaec92c4ec59c07a26d1901b1 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Sat, 21 Mar 2020 00:01:24 +0000 Subject: [PATCH 45/81] Add surface variables --- docker/prognostic_run/requirements.txt | 3 ++- fv3net/__init__.py | 10 ++++++++++ workflows/one_step_jobs/runfile.py | 20 ++++++++++++++++++-- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/docker/prognostic_run/requirements.txt b/docker/prognostic_run/requirements.txt index 9944d45ec4..e27e75c7dd 100644 --- a/docker/prognostic_run/requirements.txt +++ b/docker/prognostic_run/requirements.txt @@ -2,4 +2,5 @@ scikit-learn==0.22.1 dask joblib zarr -scikit-image \ No newline at end of file +scikit-image +google-cloud-logging diff --git a/fv3net/__init__.py b/fv3net/__init__.py index d2f8e47558..0e42f55521 100644 --- a/fv3net/__init__.py +++ b/fv3net/__init__.py @@ -4,3 +4,13 @@ COARSENED_DIAGS_ZARR_NAME = "gfsphysics_15min_coarse.zarr" __version__ = "0.1.0" + +try: + import google.cloud.logging + client = google.cloud.logging.Client() + client.setup_logging() +except ImportError: + pass +else: + print("set up google logging") + diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index a56d8bffff..f27486987e 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -72,6 +72,20 @@ ) + TRACERS +def rename_sfc_dt_atmos(sfc): + SFC_VARIABLES = [ + "DSWRFtoa", + "DSWRFsfc", + "USWRFtoa", + "USWRFsfc", + "DLWRFsfc", + "ULWRFtoa", + "ULWRFsfc", + ] + + DIMS = {"grid_xt": "x", "grid_yt": "y"} + return sfc[DIMS].rename(DIMS).drop(["tile", "x", "y"]) + def init_data_var(group, array, nt): logger.info(f"Initializing variable: {array.name}") @@ -108,6 +122,7 @@ def create_zarr_store(timesteps, group, template): def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): store_url = url logger.info("Post processing model outputs") + sfc = xr.open_mfdataset(f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim='tile', combine='nested').pipe(rename_sfc_dt_atmos) begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") before = xr.open_zarr(f"{out_dir}/before_physics.zarr") after = xr.open_zarr(f"{out_dir}/after_physics.zarr") @@ -124,6 +139,7 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) + ds = ds.merge(sfc) ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, "step": 3}) if comm is not None: @@ -205,11 +221,11 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): state = fv3gfs.get_state(names=VARIABLES + (TIME,)) after_monitor.store(state) - MPI.COMM_WORLD.barrier() # parallelize across variables + fv3gfs.cleanup() + MPI.COMM_WORLD.barrier() del state, begin_monitor, before_monitor, after_monitor # if rank == 0: post_process(RUN_DIR, **config["one_step"], comm=MPI.COMM_WORLD) - fv3gfs.cleanup() else: logger = logging.getLogger(__name__) From dc1f50240ddf7e8a48a6bad791fc3a8037932579 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 24 Mar 2020 00:34:13 +0000 Subject: [PATCH 46/81] merge in two d data --- fv3net/__init__.py | 16 +++--- workflows/one_step_jobs/runfile.py | 87 ++++++++++++++++-------------- 2 files changed, 56 insertions(+), 47 deletions(-) diff --git a/fv3net/__init__.py b/fv3net/__init__.py index 0e42f55521..daee341d8d 100644 --- a/fv3net/__init__.py +++ b/fv3net/__init__.py @@ -5,12 +5,12 @@ __version__ = "0.1.0" -try: - import google.cloud.logging - client = google.cloud.logging.Client() - client.setup_logging() -except ImportError: - pass -else: - print("set up google logging") +# try: +# import google.cloud.logging +# client = google.cloud.logging.Client() +# client.setup_logging() +# except ImportError: +# pass +# else: +# print("set up google logging") diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index f27486987e..d22eb0f4c5 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,17 +1,21 @@ import os from fv3net import runtime -import fsspec -import zarr -import xarray as xr -import numpy as np import logging import dask +import time +from multiprocessing import Process # avoid out of memory errors -dask.config.set(scheduler='single-threaded') +# dask.config.set(scheduler='single-threaded') + +import fsspec +import zarr +import xarray as xr +import numpy as np logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) DELP = "pressure_thickness_of_atmospheric_layer" TIME = "time" @@ -71,20 +75,19 @@ "liquid_soil_moisture" ) + TRACERS +SFC_VARIABLES = ( + "DSWRFtoa", + "DSWRFsfc", + "USWRFtoa", + "USWRFsfc", + "DLWRFsfc", + "ULWRFtoa", + "ULWRFsfc", +) def rename_sfc_dt_atmos(sfc): - SFC_VARIABLES = [ - "DSWRFtoa", - "DSWRFsfc", - "USWRFtoa", - "USWRFsfc", - "DLWRFsfc", - "ULWRFtoa", - "ULWRFsfc", - ] - - DIMS = {"grid_xt": "x", "grid_yt": "y"} - return sfc[DIMS].rename(DIMS).drop(["tile", "x", "y"]) + DIMS = {"grid_xt": "x", "grid_yt": "y", "time": "forecast_time"} + return sfc[list(SFC_VARIABLES)].rename(DIMS).transpose("forecast_time", "tile", "y", "x").drop(["forecast_time", "y", "x"]) def init_data_var(group, array, nt): @@ -119,10 +122,13 @@ def create_zarr_store(timesteps, group, template): dim.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] -def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): +def post_process(out_dir, url, index, init=False, timesteps=()): + + if init and len(timesteps) > 0 and index: + raise ValueError(f"To initialize the zarr store, {timesteps} must not be empty.") + store_url = url logger.info("Post processing model outputs") - sfc = xr.open_mfdataset(f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim='tile', combine='nested').pipe(rename_sfc_dt_atmos) begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") before = xr.open_zarr(f"{out_dir}/before_physics.zarr") after = xr.open_zarr(f"{out_dir}/after_physics.zarr") @@ -139,28 +145,17 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.merge(sfc) ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, "step": 3}) + sfc = xr.open_mfdataset(f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim='tile', combine='nested').pipe(rename_sfc_dt_atmos) + ds = ds.merge(sfc) - if comm is not None: - rank = comm.Get_rank() - else: - rank = 0 - - mapper = fsspec.get_mapper(store_url) - if init and rank == 0: + if init: 
+ mapper = fsspec.get_mapper(store_url) logging.info("initializing zarr store") group = zarr.open_group(mapper, mode="w") create_zarr_store(timesteps, group, ds) - - if comm is None: - variables = VARIABLES - else: - comm.barrier() - variables = list(VARIABLES)[comm.rank::comm.size] - - # all processes open group - group = zarr.open_group(mapper, mode="a") + + variables = VARIABLES + SFC_VARIABLES logger.info(f"Variables to process: {variables}") for variable in ds[list(variables)]: logger.info(f"Writing {variable} to {group}") @@ -169,6 +164,16 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): dask_arr.store(group[variable], regions=(index,)) +def run_post_process_in_new_process(outdir, c): + url = c.pop('url') + index = c.pop('index') + args = (outdir, url, index) + kwargs = c + p = Process(target=post_process, args=args, kwargs=kwargs) + p.start() + p.join() + + if __name__ == "__main__": import fv3gfs from mpi4py import MPI @@ -223,9 +228,13 @@ def post_process(out_dir, url, index, init=False, timesteps=(), comm=None): # parallelize across variables fv3gfs.cleanup() - MPI.COMM_WORLD.barrier() del state, begin_monitor, before_monitor, after_monitor - # if rank == 0: - post_process(RUN_DIR, **config["one_step"], comm=MPI.COMM_WORLD) + + if rank == 0: + # TODO it would be much cleaner to call this is a separate script, but that + # would be incompatible with the run_k8s api + # sleep a little while to allow all process to finish finalizing the netCDFs + time.sleep(2) + run_post_process_in_new_process(RUN_DIR, config['one_step']) else: logger = logging.getLogger(__name__) From 669df7e20e085c535c83a1ff07290b12547f0400 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 17:36:16 +0000 Subject: [PATCH 47/81] reformat --- external/fv3config | 2 +- fv3net/__init__.py | 1 - workflows/one_step_jobs/runfile.py | 55 +++++++++++++++++++--------- workflows/one_step_jobs/zarr_stat.py | 4 +- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/external/fv3config b/external/fv3config index 6bde7b7354..dfc541f155 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit 6bde7b7354fa3c4c512a7178ee9b55b45189d76e +Subproject commit dfc541f155d18ad94df930ddf025b21f7df37c18 diff --git a/fv3net/__init__.py b/fv3net/__init__.py index daee341d8d..684dfa0c5c 100644 --- a/fv3net/__init__.py +++ b/fv3net/__init__.py @@ -13,4 +13,3 @@ # pass # else: # print("set up google logging") - diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index d22eb0f4c5..839415e3c4 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,7 +1,6 @@ import os from fv3net import runtime import logging -import dask import time from multiprocessing import Process @@ -20,9 +19,18 @@ DELP = "pressure_thickness_of_atmospheric_layer" TIME = "time" -TRACERS = ('specific_humidity', 'cloud_water_mixing_ratio', 'rain_mixing_ratio', 'cloud_ice_mixing_ratio', 'snow_mixing_ratio', 'graupel_mixing_ratio', 'ozone_mixing_ratio', 'cloud_amount') +TRACERS = ( + "specific_humidity", + "cloud_water_mixing_ratio", + "rain_mixing_ratio", + "cloud_ice_mixing_ratio", + "snow_mixing_ratio", + "graupel_mixing_ratio", + "ozone_mixing_ratio", + "cloud_amount", +) -VARIABLES = ( +VARIABLES = ( "x_wind", "y_wind", "air_temperature", @@ -72,10 +80,10 @@ "snow_cover_in_fraction", "soil_temperature", "total_soil_moisture", - "liquid_soil_moisture" - ) + TRACERS + "liquid_soil_moisture", +) + TRACERS 
-SFC_VARIABLES = ( +SFC_VARIABLES = ( "DSWRFtoa", "DSWRFsfc", "USWRFtoa", @@ -85,9 +93,15 @@ "ULWRFsfc", ) + def rename_sfc_dt_atmos(sfc): DIMS = {"grid_xt": "x", "grid_yt": "y", "time": "forecast_time"} - return sfc[list(SFC_VARIABLES)].rename(DIMS).transpose("forecast_time", "tile", "y", "x").drop(["forecast_time", "y", "x"]) + return ( + sfc[list(SFC_VARIABLES)] + .rename(DIMS) + .transpose("forecast_time", "tile", "y", "x") + .drop(["forecast_time", "y", "x"]) + ) def init_data_var(group, array, nt): @@ -118,14 +132,16 @@ def create_zarr_store(timesteps, group, template): for name in ds.coords: init_coord(group, ds[name]) - dim = group.array('initial_time', data=timesteps) - dim.attrs['_ARRAY_DIMENSIONS'] = ['initial_time'] + dim = group.array("initial_time", data=timesteps) + dim.attrs["_ARRAY_DIMENSIONS"] = ["initial_time"] def post_process(out_dir, url, index, init=False, timesteps=()): if init and len(timesteps) > 0 and index: - raise ValueError(f"To initialize the zarr store, {timesteps} must not be empty.") + raise ValueError( + f"To initialize the zarr store, {timesteps} must not be empty." + ) store_url = url logger.info("Post processing model outputs") @@ -145,8 +161,12 @@ def post_process(out_dir, url, index, init=False, timesteps=()): ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) - ds = ds.rename({"time": "forecast_time"}).chunk({"forecast_time": 1, "tile": 6, "step": 3}) - sfc = xr.open_mfdataset(f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim='tile', combine='nested').pipe(rename_sfc_dt_atmos) + ds = ds.rename({"time": "forecast_time"}).chunk( + {"forecast_time": 1, "tile": 6, "step": 3} + ) + sfc = xr.open_mfdataset( + f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim="tile", combine="nested" + ).pipe(rename_sfc_dt_atmos) ds = ds.merge(sfc) if init: @@ -165,8 +185,8 @@ def post_process(out_dir, url, index, init=False, timesteps=()): def run_post_process_in_new_process(outdir, c): - url = c.pop('url') - index = c.pop('index') + url = c.pop("url") + index = c.pop("index") args = (outdir, url, index) kwargs = c p = Process(target=post_process, args=args, kwargs=kwargs) @@ -185,8 +205,7 @@ def run_post_process_in_new_process(outdir, c): current_dir = os.getcwd() config = runtime.get_config() MPI.COMM_WORLD.barrier() # wait for master rank to write run directory - logger = logging.getLogger( - f"one_step:{rank}/{size}:{config['one_step']['index']}") + logger = logging.getLogger(f"one_step:{rank}/{size}:{config['one_step']['index']}") partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) @@ -231,10 +250,10 @@ def run_post_process_in_new_process(outdir, c): del state, begin_monitor, before_monitor, after_monitor if rank == 0: - # TODO it would be much cleaner to call this is a separate script, but that + # TODO it would be much cleaner to call this is a separate script, but that # would be incompatible with the run_k8s api # sleep a little while to allow all process to finish finalizing the netCDFs time.sleep(2) - run_post_process_in_new_process(RUN_DIR, config['one_step']) + run_post_process_in_new_process(RUN_DIR, config["one_step"]) else: logger = logging.getLogger(__name__) diff --git a/workflows/one_step_jobs/zarr_stat.py b/workflows/one_step_jobs/zarr_stat.py index f12f2bae2f..07352298e6 100644 --- a/workflows/one_step_jobs/zarr_stat.py +++ b/workflows/one_step_jobs/zarr_stat.py @@ -24,7 +24,7 @@ print(ds.info()) print() -print("data size:", 
ds.isel(initial_time=0).nbytes/1e9, "GB/initial time") +print("data size:", ds.isel(initial_time=0).nbytes / 1e9, "GB/initial time") -#print(ds.air_temperature.std().compute()) +# print(ds.air_temperature.std().compute()) From 6f698bc69c263d70283b97ba2e678f286f7cf707 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 17:42:54 +0000 Subject: [PATCH 48/81] remove get_runfile_config --- fv3net/runtime/__init__.py | 2 +- fv3net/runtime/config.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fv3net/runtime/__init__.py b/fv3net/runtime/__init__.py index 4ce5adce29..d8af81d7a0 100644 --- a/fv3net/runtime/__init__.py +++ b/fv3net/runtime/__init__.py @@ -1,3 +1,3 @@ from . import sklearn_interface as sklearn from .state_io import init_writers, append_to_writers, CF_TO_RESTART_MAP -from .config import get_runfile_config, get_namelist, get_config +from .config import get_namelist, get_config diff --git a/fv3net/runtime/config.py b/fv3net/runtime/config.py index 4c459aa56b..b15524d94c 100644 --- a/fv3net/runtime/config.py +++ b/fv3net/runtime/config.py @@ -10,17 +10,11 @@ class dotdict(dict): __delattr__ = dict.__delitem__ -def get_runfile_config(): +def get_config(): with open("fv3config.yml") as f: config = yaml.safe_load(f) return config -def get_runfile_config(): - with open("fv3config.yml") as f: - config = yaml.safe_load(f) - return dotdict(config["scikit_learn"]) - - def get_namelist(): return f90nml.read("input.nml") From 56fe153691bbc5871c19195a4a8328872a793d93 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 19:57:52 +0000 Subject: [PATCH 49/81] update fv3config to master --- external/fv3config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/fv3config b/external/fv3config index dfc541f155..bb1c1fa607 160000 --- a/external/fv3config +++ b/external/fv3config @@ -1 +1 @@ -Subproject commit dfc541f155d18ad94df930ddf025b21f7df37c18 +Subproject commit bb1c1fa6079e5bb08708071f7a642a1da614bc34 From e57c543926f518e7beb753008bd5a1bcfc4796bf Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 21:12:08 +0000 Subject: [PATCH 50/81] fix time dimension --- workflows/one_step_jobs/runfile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 839415e3c4..b85509524b 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -156,8 +156,7 @@ def post_process(out_dir, url, index, init=False, timesteps=()): begin = begin.drop("time") # concat data - dt = np.timedelta64(15, "m") - time = np.arange(len(time)) * dt + time = time - time[0] ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) From b85d0e9bf7c0d27edc70d2b133a6b8bac4ce8602 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 24 Mar 2020 23:34:10 +0000 Subject: [PATCH 51/81] add test of runfile --- workflows/one_step_jobs/runfile.py | 21 +++++++++++++++++-- workflows/one_step_jobs/test_runfile.py | 28 +++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 workflows/one_step_jobs/test_runfile.py diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index b85509524b..85812e2359 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -117,7 +117,7 @@ def init_data_var(group, array, nt): def init_coord(group, coord): logger.info(f"Initializing coordinate: {coord.name}") - out_array = group.array(name=coord.name, data=np.asarray(coord)) + out_array = group.array(name=coord.name, data=np.asarray(coord), fill_value='NaN') out_array.attrs.update(coord.attrs) out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) @@ -136,6 +136,23 @@ def create_zarr_store(timesteps, group, template): dim.attrs["_ARRAY_DIMENSIONS"] = ["initial_time"] +def _get_forecast_time(time): + dt = np.asarray(time - time[0]) + return xr.DataArray( + _convert_time_delta_to_float_seconds(dt), + name='time', + dims=['time'], + attrs={ + 'units': 's' + } + ) + + +def _convert_time_delta_to_float_seconds(a): + ns_per_s = 1e9 + return a.astype('timedelta64[ns]').astype(float) / ns_per_s + + def post_process(out_dir, url, index, init=False, timesteps=()): if init and len(timesteps) > 0 and index: @@ -156,7 +173,7 @@ def post_process(out_dir, url, index, init=False, timesteps=()): begin = begin.drop("time") # concat data - time = time - time[0] + time = _get_forecast_time(time) ds = xr.concat([begin, before, after], dim="step").assign_coords( step=["begin", "after_dynamics", "after_physics"], time=time ) diff --git a/workflows/one_step_jobs/test_runfile.py b/workflows/one_step_jobs/test_runfile.py new file mode 100644 index 0000000000..4b9c9dbc4a --- /dev/null +++ b/workflows/one_step_jobs/test_runfile.py @@ -0,0 +1,28 @@ +import runfile +import zarr +import xarray as xr +import numpy as np + +def test_init_coord(): + + time = np.array(['2016-08-01T00:16:00.000000000', '2016-08-01T00:17:00.000000000', + '2016-08-01T00:18:00.000000000', '2016-08-01T00:19:00.000000000', + '2016-08-01T00:20:00.000000000', '2016-08-01T00:21:00.000000000', + '2016-08-01T00:22:00.000000000', '2016-08-01T00:23:00.000000000', + '2016-08-01T00:24:00.000000000', '2016-08-01T00:25:00.000000000', + '2016-08-01T00:26:00.000000000', '2016-08-01T00:27:00.000000000', + '2016-08-01T00:28:00.000000000', '2016-08-01T00:29:00.000000000', + '2016-08-01T00:30:00.000000000'], dtype='datetime64[ns]') + + ds = xr.Dataset(coords={'time': time}) + time = runfile._get_forecast_time(ds.time) + + ds_lead_time = ds.assign(time =time) + + store = {} + + group = zarr.open_group(store, mode='w') + runfile.init_coord(group, ds_lead_time['time']) + + loaded = xr.open_zarr(store) + np.testing.assert_equal(loaded.time.values, ds_lead_time.time.values) From 92f2353fbc5bc634105787429108233c6410cc76 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Tue, 24 Mar 2020 23:36:38 +0000 Subject: [PATCH 52/81] comment on need for fill_value=Nan --- workflows/one_step_jobs/runfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 85812e2359..434a51e7c0 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -117,6 +117,9 @@ def init_data_var(group, array, nt): def init_coord(group, coord): logger.info(f"Initializing coordinate: {coord.name}") + # fill_value=NaN is needed below for xr.open_zarr to succesfully load this + # coordinate if decode_cf=True. Otherwise, time=0 gets filled in as nan. very + # confusing... out_array = group.array(name=coord.name, data=np.asarray(coord), fill_value='NaN') out_array.attrs.update(coord.attrs) out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) From 1bc45b55624c2de007af4c27d9723690027ca3db Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 23:41:04 +0000 Subject: [PATCH 53/81] black --- workflows/one_step_jobs/runfile.py | 12 ++++---- workflows/one_step_jobs/test_runfile.py | 37 +++++++++++++++++-------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 434a51e7c0..8bc07195f4 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -120,7 +120,7 @@ def init_coord(group, coord): # fill_value=NaN is needed below for xr.open_zarr to succesfully load this # coordinate if decode_cf=True. Otherwise, time=0 gets filled in as nan. very # confusing... - out_array = group.array(name=coord.name, data=np.asarray(coord), fill_value='NaN') + out_array = group.array(name=coord.name, data=np.asarray(coord), fill_value="NaN") out_array.attrs.update(coord.attrs) out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) @@ -143,17 +143,15 @@ def _get_forecast_time(time): dt = np.asarray(time - time[0]) return xr.DataArray( _convert_time_delta_to_float_seconds(dt), - name='time', - dims=['time'], - attrs={ - 'units': 's' - } + name="time", + dims=["time"], + attrs={"units": "s"}, ) def _convert_time_delta_to_float_seconds(a): ns_per_s = 1e9 - return a.astype('timedelta64[ns]').astype(float) / ns_per_s + return a.astype("timedelta64[ns]").astype(float) / ns_per_s def post_process(out_dir, url, index, init=False, timesteps=()): diff --git a/workflows/one_step_jobs/test_runfile.py b/workflows/one_step_jobs/test_runfile.py index 4b9c9dbc4a..8f58533046 100644 --- a/workflows/one_step_jobs/test_runfile.py +++ b/workflows/one_step_jobs/test_runfile.py @@ -3,26 +3,39 @@ import xarray as xr import numpy as np + def test_init_coord(): - time = np.array(['2016-08-01T00:16:00.000000000', '2016-08-01T00:17:00.000000000', - '2016-08-01T00:18:00.000000000', '2016-08-01T00:19:00.000000000', - '2016-08-01T00:20:00.000000000', '2016-08-01T00:21:00.000000000', - '2016-08-01T00:22:00.000000000', '2016-08-01T00:23:00.000000000', - '2016-08-01T00:24:00.000000000', '2016-08-01T00:25:00.000000000', - '2016-08-01T00:26:00.000000000', '2016-08-01T00:27:00.000000000', - '2016-08-01T00:28:00.000000000', '2016-08-01T00:29:00.000000000', - '2016-08-01T00:30:00.000000000'], dtype='datetime64[ns]') + time = np.array( + [ + "2016-08-01T00:16:00.000000000", + "2016-08-01T00:17:00.000000000", + "2016-08-01T00:18:00.000000000", + "2016-08-01T00:19:00.000000000", + "2016-08-01T00:20:00.000000000", + "2016-08-01T00:21:00.000000000", + "2016-08-01T00:22:00.000000000", + 
"2016-08-01T00:23:00.000000000", + "2016-08-01T00:24:00.000000000", + "2016-08-01T00:25:00.000000000", + "2016-08-01T00:26:00.000000000", + "2016-08-01T00:27:00.000000000", + "2016-08-01T00:28:00.000000000", + "2016-08-01T00:29:00.000000000", + "2016-08-01T00:30:00.000000000", + ], + dtype="datetime64[ns]", + ) - ds = xr.Dataset(coords={'time': time}) + ds = xr.Dataset(coords={"time": time}) time = runfile._get_forecast_time(ds.time) - ds_lead_time = ds.assign(time =time) + ds_lead_time = ds.assign(time=time) store = {} - group = zarr.open_group(store, mode='w') - runfile.init_coord(group, ds_lead_time['time']) + group = zarr.open_group(store, mode="w") + runfile.init_coord(group, ds_lead_time["time"]) loaded = xr.open_zarr(store) np.testing.assert_equal(loaded.time.values, ds_lead_time.time.values) From b72a6708800bd7527665202d8d5349335c2a8e8c Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Tue, 24 Mar 2020 23:57:23 +0000 Subject: [PATCH 54/81] add provenance information to file --- docker/prognostic_run/requirements.txt | 1 + environment.yml | 1 + workflows/one_step_jobs/runfile.py | 15 +++++++++++++++ 3 files changed, 17 insertions(+) diff --git a/docker/prognostic_run/requirements.txt b/docker/prognostic_run/requirements.txt index e27e75c7dd..8071ce3dc1 100644 --- a/docker/prognostic_run/requirements.txt +++ b/docker/prognostic_run/requirements.txt @@ -4,3 +4,4 @@ joblib zarr scikit-image google-cloud-logging +gitpython diff --git a/environment.yml b/environment.yml index ae13d0cef8..db5daf4202 100644 --- a/environment.yml +++ b/environment.yml @@ -48,4 +48,5 @@ dependencies: - pip: - gsutil - nc-time-axis>=1.2.0 + - gitpython - yq diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 8bc07195f4..9e4a526401 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,7 +1,9 @@ import os from fv3net import runtime +import fv3net import logging import time +from git import Repo from multiprocessing import Process # avoid out of memory errors @@ -125,10 +127,23 @@ def init_coord(group, coord): out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) +def get_provenance_info(): + repo = Repo(search_parent_directories=True) + uncommited_changes = len(repo.index.diff(repo.head.commit)) + untracked_files = len(repo.untracked_file) + return { + 'fv3net_version': fv3net.__version__, + 'commit': repo.head.commit.hexsha, + 'index': 'dirty' if uncommited_changes > 0 else 'clean', + 'working-tree': 'dirty' if untracked_files > 0 else 'clean', + } + + def create_zarr_store(timesteps, group, template): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) + group.attrs.update(get_provenance_info()) nt = len(timesteps) for name in ds: init_data_var(group, ds[name], nt) From 08a7dc55e0f81c863295e965e65ad287629f78be Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 25 Mar 2020 00:08:53 +0000 Subject: [PATCH 55/81] add git provenance info --- workflows/one_step_jobs/runfile.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 9e4a526401..3ee27bb392 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -130,12 +130,14 @@ def init_coord(group, coord): def get_provenance_info(): repo = Repo(search_parent_directories=True) uncommited_changes = len(repo.index.diff(repo.head.commit)) - untracked_files = len(repo.untracked_file) + untracked_files = len(repo.untracked_files) + unstaged_files = len(repo.index.diff(None)) return { 'fv3net_version': fv3net.__version__, 'commit': repo.head.commit.hexsha, 'index': 'dirty' if uncommited_changes > 0 else 'clean', - 'working-tree': 'dirty' if untracked_files > 0 else 'clean', + 'working-tree': 'dirty' if unstaged_files > 0 else 'clean', + 'untracked_files': untracked_files } @@ -286,6 +288,6 @@ def run_post_process_in_new_process(outdir, c): # would be incompatible with the run_k8s api # sleep a little while to allow all process to finish finalizing the netCDFs time.sleep(2) - run_post_process_in_new_process(RUN_DIR, config["one_step"]) + run_post_process_in_new_process(RUN_DIR, config["one_step"], config) else: logger = logging.getLogger(__name__) From f2a4f0a5077aee80eec827d52d7dcbcee59395b7 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 25 Mar 2020 00:19:14 +0000 Subject: [PATCH 56/81] change order of arguments following upstream changes --- .dockerignore | 3 ++- workflows/one_step_jobs/_run_steps.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.dockerignore b/.dockerignore index dc16f10421..d09c3e155d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -33,4 +33,5 @@ tox.ini vcm-ml-data dataflow/*/env external/vcm/venv -Dockerfile \ No newline at end of file +Dockerfile +outdir/ diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh index 00a3ef0ab8..b688aed29d 100644 --- a/workflows/one_step_jobs/_run_steps.sh +++ b/workflows/one_step_jobs/_run_steps.sh @@ -8,7 +8,7 @@ workdir=$(pwd) ( cd ../../ python $workdir/orchestrate_submit_jobs.py \ - $src $output $yaml $image -o \ + $src $yaml $image $output -o \ --config-version v0.3 ) From 7a0d7b023f5479a44c1b581f41fdab67fa11fbb1 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 25 Mar 2020 00:25:59 +0000 Subject: [PATCH 57/81] lint --- workflows/one_step_jobs/runfile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 3ee27bb392..026007aece 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -133,11 +133,11 @@ def get_provenance_info(): untracked_files = len(repo.untracked_files) unstaged_files = len(repo.index.diff(None)) return { - 'fv3net_version': fv3net.__version__, - 'commit': repo.head.commit.hexsha, - 'index': 'dirty' if uncommited_changes > 0 else 'clean', - 'working-tree': 'dirty' if unstaged_files > 0 else 'clean', - 'untracked_files': untracked_files + "fv3net_version": fv3net.__version__, + "commit": repo.head.commit.hexsha, + "index": "dirty" if uncommited_changes > 0 else "clean", + "working-tree": "dirty" if unstaged_files > 0 else "clean", + "untracked_files": untracked_files, } From 09cc50053e2b1140148434b096c44bd76f0763ce Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 25 Mar 2020 01:09:36 +0000 Subject: [PATCH 58/81] fix runfile --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 026007aece..a8714d495b 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -288,6 +288,6 @@ def run_post_process_in_new_process(outdir, c): # would be incompatible with the run_k8s api # sleep a little while to allow all process to finish finalizing the netCDFs time.sleep(2) - run_post_process_in_new_process(RUN_DIR, config["one_step"], config) + run_post_process_in_new_process(RUN_DIR, config["one_step"]) else: logger = logging.getLogger(__name__) From 669bc685b6f291e97abd6606b408dd635c955c7d Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 25 Mar 2020 01:13:47 +0000 Subject: [PATCH 59/81] comment out git logging, discovering the reop fails --- workflows/one_step_jobs/runfile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index a8714d495b..052458c543 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -128,7 +128,6 @@ def init_coord(group, coord): def get_provenance_info(): - repo = Repo(search_parent_directories=True) uncommited_changes = len(repo.index.diff(repo.head.commit)) untracked_files = len(repo.untracked_files) unstaged_files = len(repo.index.diff(None)) @@ -145,7 +144,7 @@ def create_zarr_store(timesteps, group, template): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) - group.attrs.update(get_provenance_info()) + # group.attrs.update(get_provenance_info()) nt = len(timesteps) for name in ds: init_data_var(group, ds[name], nt) From 513562d76243f28bab0091ee317234ce76efd13a Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 25 Mar 2020 01:22:11 +0000 Subject: [PATCH 60/81] another runfile fix --- workflows/one_step_jobs/runfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 052458c543..a4c498f52f 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -208,6 +208,7 @@ def post_process(out_dir, url, index, init=False, timesteps=()): group = zarr.open_group(mapper, mode="w") create_zarr_store(timesteps, group, ds) + group = zarr.open_group(mapper, mode="a") variables = VARIABLES + SFC_VARIABLES logger.info(f"Variables to process: {variables}") for variable in ds[list(variables)]: From cf0e5ff8063a3ddefde43fc38e6cc7c2ce5bf41e Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Wed, 25 Mar 2020 01:42:42 +0000 Subject: [PATCH 61/81] another bug 30 mins later...ugh --- workflows/one_step_jobs/runfile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index a4c498f52f..6d329559d2 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -202,8 +202,9 @@ def post_process(out_dir, url, index, init=False, timesteps=()): ).pipe(rename_sfc_dt_atmos) ds = ds.merge(sfc) + mapper = fsspec.get_mapper(store_url) + if init: - mapper = fsspec.get_mapper(store_url) logging.info("initializing zarr store") group = zarr.open_group(mapper, mode="w") create_zarr_store(timesteps, group, ds) From f685dc9a3632c3958b36aec0c21dd292145cb077 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 25 Mar 2020 16:37:50 +0000 Subject: [PATCH 62/81] delete submit_jobs.py It seems redundant with orchestrate submit jobs --- fv3net/runtime/config.py | 5 +- workflows/one_step_jobs/runfile.py | 24 ++----- workflows/one_step_jobs/submit_jobs.py | 89 -------------------------- 3 files changed, 7 insertions(+), 111 deletions(-) delete mode 100644 workflows/one_step_jobs/submit_jobs.py diff --git a/fv3net/runtime/config.py b/fv3net/runtime/config.py index b15524d94c..b78f4faf17 100644 --- a/fv3net/runtime/config.py +++ b/fv3net/runtime/config.py @@ -1,3 +1,4 @@ +from typing import Dict import yaml import f90nml @@ -10,11 +11,11 @@ class dotdict(dict): __delattr__ = dict.__delitem__ -def get_config(): +def get_config() -> Dict: with open("fv3config.yml") as f: config = yaml.safe_load(f) return config -def get_namelist(): +def get_namelist() -> f90nml.Namelist: return f90nml.read("input.nml") diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 6d329559d2..52c6cb8594 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,9 +1,9 @@ import os +from typing import Dict, Any from fv3net import runtime import fv3net import logging import time -from git import Repo from multiprocessing import Process # avoid out of memory errors @@ -127,24 +127,10 @@ def init_coord(group, coord): out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) -def get_provenance_info(): - uncommited_changes = len(repo.index.diff(repo.head.commit)) - untracked_files = len(repo.untracked_files) - unstaged_files = len(repo.index.diff(None)) - return { - "fv3net_version": fv3net.__version__, - "commit": repo.head.commit.hexsha, - "index": "dirty" if uncommited_changes > 0 else "clean", - "working-tree": "dirty" if unstaged_files > 0 else "clean", - "untracked_files": untracked_files, - } - - def create_zarr_store(timesteps, group, template): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) - # group.attrs.update(get_provenance_info()) nt = len(timesteps) for name in ds: init_data_var(group, ds[name], nt) @@ -219,14 +205,12 @@ def post_process(out_dir, url, index, init=False, timesteps=()): dask_arr.store(group[variable], regions=(index,)) -def run_post_process_in_new_process(outdir, c): +def run_post_process_from_config(outdir: str, c: Dict[str, Any]): url = c.pop("url") index = c.pop("index") args = (outdir, url, index) kwargs = c - p = Process(target=post_process, args=args, kwargs=kwargs) - p.start() - p.join() + post_process(*args, **kwargs) if __name__ == "__main__": @@ -289,6 +273,6 @@ def run_post_process_in_new_process(outdir, c): # would be incompatible with the run_k8s api # sleep a little while to allow all process to finish finalizing the netCDFs time.sleep(2) - run_post_process_in_new_process(RUN_DIR, config["one_step"]) + run_post_process_from_config(RUN_DIR, config["one_step"]) else: logger = logging.getLogger(__name__) diff --git a/workflows/one_step_jobs/submit_jobs.py b/workflows/one_step_jobs/submit_jobs.py deleted file mode 100644 index 64da5d99c9..0000000000 --- a/workflows/one_step_jobs/submit_jobs.py +++ /dev/null @@ -1,89 +0,0 @@ -import logging -import os -import argparse -import yaml -from pathlib import Path -import fv3net.pipelines.kube_jobs.one_step as one_step - -logger = logging.getLogger("run_jobs") - -RUNDIRS_DIRECTORY_NAME = "one_step_output" -CONFIG_DIRECTORY_NAME = "one_step_config" -PWD = Path(os.path.abspath(__file__)).parent -LOCAL_VGRID_FILE = os.path.join(PWD, 
one_step.VERTICAL_GRID_FILENAME) - - -def _get_arg_parser(): - parser = argparse.ArgumentParser() - parser.add_argument( - "one-step-yaml", type=str, help="Path to local run configuration yaml.", - ) - parser.add_argument( - "input-url", - type=str, - help="Remote url to initial conditions. Initial conditions are assumed to be " - "stored as INPUT_URL/{timestamp}/{timestamp}.{restart_category}.tile*.nc", - ) - parser.add_argument( - "output-url", - type=str, - help="Remote url where model configuration and output will be saved. " - "Specifically, configuration files will be saved to OUTPUT_URL/" - f"{CONFIG_DIRECTORY_NAME} and model output to OUTPUT_URL/" - f"{RUNDIRS_DIRECTORY_NAME}", - ) - parser.add_argument( - "--n-steps", - type=int, - default=None, - help="Number of timesteps to process. By default all timesteps " - "found in INPUT_URL for which successful runs do not exist in " - "OUTPUT_URL will be processed. Useful for testing.", - ) - parser.add_argument( - "-o", - "--overwrite", - action="store_true", - help="Overwrite successful timesteps in OUTPUT_URL.", - ) - parser.add_argument( - "--init-frequency", - type=int, - required=False, - help="Frequency (in minutes) to initialize one-step jobs starting from" - " the first available timestep.", - ) - - return parser - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - parser = _get_arg_parser() - args = parser.parse_args() - - with open(args.one_step_yaml) as file: - one_step_config = yaml.load(file, Loader=yaml.FullLoader) - workflow_name = Path(args.one_step_yaml).with_suffix("").name - - output_url = os.path.join(args.output_url, RUNDIRS_DIRECTORY_NAME) - config_url = os.path.join(args.output_url, CONFIG_DIRECTORY_NAME) - - timestep_list = one_step.timesteps_to_process( - args.input_url, - args.output_url, - args.n_steps, - args.overwrite, - subsample_frequency=args.init_frequency, - ) - - one_step.submit_jobs( - timestep_list, - workflow_name, - one_step_config, - args.input_url, - output_url, - config_url, - local_vertical_grid_file=LOCAL_VGRID_FILE, - ) From 597aab253fbbb1bf1c6cd92b9261ac4032b0eac2 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 25 Mar 2020 16:52:43 +0000 Subject: [PATCH 63/81] add some type hints --- workflows/one_step_jobs/runfile.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 52c6cb8594..70ef9510f6 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,5 +1,5 @@ import os -from typing import Dict, Any +from typing import Dict, Any, Sequence from fv3net import runtime import fv3net import logging @@ -96,7 +96,7 @@ ) -def rename_sfc_dt_atmos(sfc): +def rename_sfc_dt_atmos(sfc: xr.Dataset) -> xr.Dataset: DIMS = {"grid_xt": "x", "grid_yt": "y", "time": "forecast_time"} return ( sfc[list(SFC_VARIABLES)] @@ -106,7 +106,7 @@ def rename_sfc_dt_atmos(sfc): ) -def init_data_var(group, array, nt): +def init_data_var(group: zarr.Group, array: xr.DataArray, nt: int): logger.info(f"Initializing variable: {array.name}") shape = (nt,) + array.data.shape chunks = (1,) + tuple(size[0] for size in array.data.chunks) @@ -117,7 +117,7 @@ def init_data_var(group, array, nt): out_array.attrs["_ARRAY_DIMENSIONS"] = ["initial_time"] + list(array.dims) -def init_coord(group, coord): +def init_coord(group: zarr.Group, coord): logger.info(f"Initializing coordinate: {coord.name}") # fill_value=NaN is needed below for xr.open_zarr to succesfully load this # coordinate if decode_cf=True. Otherwise, time=0 gets filled in as nan. very @@ -127,7 +127,7 @@ def init_coord(group, coord): out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) -def create_zarr_store(timesteps, group, template): +def create_zarr_store(timesteps: Sequence[str], group: zarr.Group, template: xr.Dataset): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) @@ -141,7 +141,7 @@ def create_zarr_store(timesteps, group, template): dim.attrs["_ARRAY_DIMENSIONS"] = ["initial_time"] -def _get_forecast_time(time): +def _get_forecast_time(time) -> xr.DataArray: dt = np.asarray(time - time[0]) return xr.DataArray( _convert_time_delta_to_float_seconds(dt), @@ -156,7 +156,7 @@ def _convert_time_delta_to_float_seconds(a): return a.astype("timedelta64[ns]").astype(float) / ns_per_s -def post_process(out_dir, url, index, init=False, timesteps=()): +def post_process(out_dir: str, url: str, index: int, init: bool=False, timesteps: Sequence=()): if init and len(timesteps) > 0 and index: raise ValueError( From 732128f123fb97487ce1f8132e927f3e37bb0caf Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Wed, 25 Mar 2020 16:54:48 +0000 Subject: [PATCH 64/81] lint --- workflows/one_step_jobs/runfile.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 70ef9510f6..31fc836dcb 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,10 +1,8 @@ import os from typing import Dict, Any, Sequence from fv3net import runtime -import fv3net import logging import time -from multiprocessing import Process # avoid out of memory errors # dask.config.set(scheduler='single-threaded') @@ -127,7 +125,9 @@ def init_coord(group: zarr.Group, coord): out_array.attrs["_ARRAY_DIMENSIONS"] = list(coord.dims) -def create_zarr_store(timesteps: Sequence[str], group: zarr.Group, template: xr.Dataset): +def create_zarr_store( + timesteps: Sequence[str], group: zarr.Group, template: xr.Dataset +): logger.info("Creating group") ds = template group.attrs.update(ds.attrs) @@ -156,7 +156,9 @@ def _convert_time_delta_to_float_seconds(a): return a.astype("timedelta64[ns]").astype(float) / ns_per_s -def post_process(out_dir: str, url: str, index: int, init: bool=False, timesteps: Sequence=()): +def post_process( + out_dir: str, url: str, index: int, init: bool = False, timesteps: Sequence = () +): if init and len(timesteps) > 0 and index: raise ValueError( From 558011de35ab074236b6b1ad791ce0cea981f258 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 00:41:05 +0000 Subject: [PATCH 65/81] unify the naming of the monitors and step names --- workflows/one_step_jobs/runfile.py | 114 ++++++++++++++--------------- 1 file changed, 53 insertions(+), 61 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 31fc836dcb..8e2a1d2acd 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,7 +1,8 @@ import os -from typing import Dict, Any, Sequence +from typing import Dict, Any, Sequence, Mapping from fv3net import runtime import logging +from collections import defaultdict import time # avoid out of memory errors @@ -156,63 +157,60 @@ def _convert_time_delta_to_float_seconds(a): return a.astype("timedelta64[ns]").astype(float) / ns_per_s +def _merge_monitor_data(paths: Mapping[str, str]) -> xr.Dataset: + datasets = {key: xr.open_zarr(val) for key, val in paths} + time = _get_forecast_time(datasets["begin"]) + datasets_no_time = [val.drop("time") for val in datasets.values()] + steps = list(datasets.keys()) + return xr.concat(datasets_no_time, dim="step").assign_coords(step=steps, time=time) + + +def _write_to_store(group: zarr.ABSStore, ds: xr.Dataset): + for variable in ds: + logger.info(f"Writing {variable} to {group}") + dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] + dask_arr = ds[variable].transpose(*dims).data + dask_arr.store(group[variable], regions=(index,)) + + def post_process( - out_dir: str, url: str, index: int, init: bool = False, timesteps: Sequence = () + monitor_paths: Mapping[str, str], + sfc_pattern: str, + store_url: str, + index: int, + init: bool = False, + timesteps: Sequence = (), ): if init and len(timesteps) > 0 and index: raise ValueError( f"To initialize the zarr store, {timesteps} must not be empty." 
) - - store_url = url logger.info("Post processing model outputs") - begin = xr.open_zarr(f"{out_dir}/begin_physics.zarr") - before = xr.open_zarr(f"{out_dir}/before_physics.zarr") - after = xr.open_zarr(f"{out_dir}/after_physics.zarr") - - # make the time dims consistent - time = begin.time - before = before.drop("time") - after = after.drop("time") - begin = begin.drop("time") - - # concat data - time = _get_forecast_time(time) - ds = xr.concat([begin, before, after], dim="step").assign_coords( - step=["begin", "after_dynamics", "after_physics"], time=time + + sfc = xr.open_mfdataset(sfc_pattern, concat_dim="tile", combine="nested").pipe( + rename_sfc_dt_atmos ) - ds = ds.rename({"time": "forecast_time"}).chunk( - {"forecast_time": 1, "tile": 6, "step": 3} + + ds = ( + _merge_monitor_data(monitor_paths) + .rename({"time": "forecast_time"}) + .chunk({"forecast_time": 1, "tile": 6, "step": 3}) ) - sfc = xr.open_mfdataset( - f"{out_dir}/sfc_dt_atmos.tile?.nc", concat_dim="tile", combine="nested" - ).pipe(rename_sfc_dt_atmos) - ds = ds.merge(sfc) + merged = xr.merge([sfc, ds]) mapper = fsspec.get_mapper(store_url) if init: logging.info("initializing zarr store") group = zarr.open_group(mapper, mode="w") - create_zarr_store(timesteps, group, ds) + create_zarr_store(timesteps, group, merged) group = zarr.open_group(mapper, mode="a") variables = VARIABLES + SFC_VARIABLES logger.info(f"Variables to process: {variables}") - for variable in ds[list(variables)]: - logger.info(f"Writing {variable} to {group}") - dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] - dask_arr = ds[variable].transpose(*dims).data - dask_arr.store(group[variable], regions=(index,)) - - -def run_post_process_from_config(outdir: str, c: Dict[str, Any]): - url = c.pop("url") - index = c.pop("index") - args = (outdir, url, index) - kwargs = c - post_process(*args, **kwargs) + dataset_to_write : xr.Dataset = ds[list(variables)] + _write_to_store(group, dataset_to_write) if __name__ == "__main__": @@ -230,26 +228,17 @@ def run_post_process_from_config(outdir: str, c: Dict[str, Any]): partitioner = fv3gfs.CubedSpherePartitioner.from_namelist(config["namelist"]) - before_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "before_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, + sfc_pattern = f"{RUN_DIR}/sfc_dt_atmos.tile?.nc" + paths = dict( + begin=os.path.join(RUN_DIR, "before_physics.zarr"), + after_physics=os.path.join(RUN_DIR, "after_physics.zarr"), + after_dynamics=os.path.join(RUN_DIR, "after_dynamics.zarr"), ) - after_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "after_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, - ) - - begin_monitor = fv3gfs.ZarrMonitor( - os.path.join(RUN_DIR, "begin_physics.zarr"), - partitioner, - mode="w", - mpi_comm=MPI.COMM_WORLD, - ) + monitors = { + key: fv3gfs.ZarrMonitor(path, partitioner, mode="w", mpi_comm=MPI.COMM_WORLD,) + for key, path in paths.items() + } fv3gfs.initialize() state = fv3gfs.get_state(names=VARIABLES + (TIME,)) @@ -258,23 +247,26 @@ def run_post_process_from_config(outdir: str, c: Dict[str, Any]): for i in range(fv3gfs.get_step_count()): if rank == 0: logger.info(f"step {i}") - begin_monitor.store(state) + monitors["before"].store(state) fv3gfs.step_dynamics() state = fv3gfs.get_state(names=VARIABLES + (TIME,)) - before_monitor.store(state) + monitors["after_dynamics"].store(state) fv3gfs.step_physics() state = fv3gfs.get_state(names=VARIABLES + (TIME,)) - after_monitor.store(state) + 
monitors["after_physics"].store(state) # parallelize across variables fv3gfs.cleanup() - del state, begin_monitor, before_monitor, after_monitor + del monitors if rank == 0: # TODO it would be much cleaner to call this is a separate script, but that # would be incompatible with the run_k8s api # sleep a little while to allow all process to finish finalizing the netCDFs time.sleep(2) - run_post_process_from_config(RUN_DIR, config["one_step"]) + c = config["one_step"] + url = c.pop("url") + index = c.pop("index") + post_process(paths, sfc_pattern, url, index, **c) else: logger = logging.getLogger(__name__) From 9f41b4bf73662a0fe7478eb50dac88ba09a224b8 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 00:47:14 +0000 Subject: [PATCH 66/81] Add out of memory troubleshooting info --- workflows/one_step_jobs/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workflows/one_step_jobs/README.md b/workflows/one_step_jobs/README.md index 8cef622d9f..d0f1ff615f 100644 --- a/workflows/one_step_jobs/README.md +++ b/workflows/one_step_jobs/README.md @@ -64,3 +64,13 @@ Use the following command to view your current configuration. It should point to ``` kubectl config view ``` + +### Out of Memory errors + +The one step jobs can be fail with OOMKilled errors if too many dask workers +are used. These errors can typically be avoided by using the single-threaded +dask scheduler. You can enable for this debugging purposes by adding the +following lines to the top of [runfile.py](./runfile.py): + + import dask + dask.config.set(scheduler='single-threaded') From a962ef995c13ebb7c254c677a498efa10934a091 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 00:49:25 +0000 Subject: [PATCH 67/81] Update info about submission --- workflows/one_step_jobs/README.md | 34 +++++-------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/workflows/one_step_jobs/README.md b/workflows/one_step_jobs/README.md index d0f1ff615f..6c534f6b79 100644 --- a/workflows/one_step_jobs/README.md +++ b/workflows/one_step_jobs/README.md @@ -10,35 +10,11 @@ microphysics) Both of these configurations use a one-minute timestep with no dynamics substepping and have a total duration of 15 minutes. -Workflow call signature: -``` -$ python submit_jobs.py -h -usage: submit_jobs.py [-h] INPUT_URL ONE_STEP_YAML OUTPUT_URL [--n-steps N_STEPS] [-o] - - -h, --help show this help message and exit - INPUT_URL Remote url to initial conditions. Initial conditions - are assumed to be stored as INPUT_URL/{timestamp}/{tim - estamp}.{restart_category}.tile*.nc - ONE_STEP_YAML Path to local run configuration yaml. - DOCKER_IMAGE fv3gfs-python model docker image. - OUTPUT_URL Remote url where model configuration and output will - be saved. Specifically, configuration files will be - saved to OUTPUT_URL/one_step_config and model output - to OUTPUT_URL/one_step_output - --n-steps N_STEPS Number of timesteps to process. By default all - timesteps found in INPUT_URL for which successful runs - do not exist in OUTPUT_URL will be processed. Useful - for testing. - -o, --overwrite Overwrite successful timesteps in OUTPUT_URL. - --init-frequency INIT_FREQUENCY - Frequency (in minutes) to initialize one-step jobs - starting from the first available timestep. - --config-version CONFIG_VERSION - Default fv3config.yml version to use as the base - configuration. This should be consistent with the - fv3gfs-python version in the specified docker image. 
- Defaults to fv3gfs-python v0.2 style configuration. -``` +This workflow can be submitted with the [orchestrate_submit_jobs.py] script. +This script is self-documenting and its help can be seen by running: + + python orchestrate_submit_jobs.py -h + ### Kubernetes VM access troubleshooting From 0e537bbf2555e7038b84fd357bcfcb187f2d898d Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 00:55:15 +0000 Subject: [PATCH 68/81] add comment clarifying the local upload dir --- fv3net/pipelines/kube_jobs/one_step.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index ab923cd787..f121986db6 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -313,8 +313,13 @@ def run_job(wait=False, **kwargs): uid = str(uuid.uuid4()) labels = assoc(job_labels, "jobid", uid) model_config_url = config_factory(**kwargs) + + # the one step workflow doesn't need to upload its run directories any longer + # since all the data is in the big zarr. Setting outdir to a pod-local path + # avoids this unecessary upload step. + local_tmp_dir = "/tmp/null" fv3config.run_kubernetes( - model_config_url, "/tmp/null", job_labels=labels, **kube_kwargs + model_config_url, local_tmp_dir, job_labels=labels, **kube_kwargs ) if wait: utils.wait_for_complete(job_labels, sleep_interval=10) From 3805291eeddb1bfb4515eeaa5a8fe21cf9d3a90b Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 00:56:34 +0000 Subject: [PATCH 69/81] lint --- workflows/one_step_jobs/runfile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 8e2a1d2acd..044f629998 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,8 +1,7 @@ import os -from typing import Dict, Any, Sequence, Mapping +from typing import Sequence, Mapping from fv3net import runtime import logging -from collections import defaultdict import time # avoid out of memory errors @@ -209,7 +208,7 @@ def post_process( group = zarr.open_group(mapper, mode="a") variables = VARIABLES + SFC_VARIABLES logger.info(f"Variables to process: {variables}") - dataset_to_write : xr.Dataset = ds[list(variables)] + dataset_to_write: xr.Dataset = ds[list(variables)] _write_to_store(group, dataset_to_write) From 9873e547ab3d7fc15ec313b0868810ce7e0932bc Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 01:27:26 +0000 Subject: [PATCH 70/81] fix typo --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 044f629998..5233b3cc07 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -246,7 +246,7 @@ def post_process( for i in range(fv3gfs.get_step_count()): if rank == 0: logger.info(f"step {i}") - monitors["before"].store(state) + monitors["begin"].store(state) fv3gfs.step_dynamics() state = fv3gfs.get_state(names=VARIABLES + (TIME,)) monitors["after_dynamics"].store(state) From eb11110bdf8777a2131efeea4e126c485aef2819 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 26 Mar 2020 01:41:05 +0000 Subject: [PATCH 71/81] Fix another bug --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 5233b3cc07..6d8f7e2c8a 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -157,7 +157,7 @@ def _convert_time_delta_to_float_seconds(a): def _merge_monitor_data(paths: Mapping[str, str]) -> xr.Dataset: - datasets = {key: xr.open_zarr(val) for key, val in paths} + datasets = {key: xr.open_zarr(val) for key, val in paths.items()} time = _get_forecast_time(datasets["begin"]) datasets_no_time = [val.drop("time") for val in datasets.values()] steps = list(datasets.keys()) From 3dd8cf0bf633e20bbc6d29d395b76b8c9dac89fa Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 01:46:01 +0000 Subject: [PATCH 72/81] another typo --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 6d8f7e2c8a..b6c421a5fc 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -158,7 +158,7 @@ def _convert_time_delta_to_float_seconds(a): def _merge_monitor_data(paths: Mapping[str, str]) -> xr.Dataset: datasets = {key: xr.open_zarr(val) for key, val in paths.items()} - time = _get_forecast_time(datasets["begin"]) + time = _get_forecast_time(datasets["begin"].time) datasets_no_time = [val.drop("time") for val in datasets.values()] steps = list(datasets.keys()) return xr.concat(datasets_no_time, dim="step").assign_coords(step=steps, time=time) From c144a5b788b45183b54290e5e154fae2a903d766 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 02:20:34 +0000 Subject: [PATCH 73/81] fix another bug --- workflows/one_step_jobs/runfile.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index b6c421a5fc..993e765ed3 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -197,7 +197,7 @@ def post_process( .chunk({"forecast_time": 1, "tile": 6, "step": 3}) ) - merged = xr.merge([sfc, ds]) + merged = xr.merge([sfc[SFC_VARIABLES], ds]) mapper = fsspec.get_mapper(store_url) if init: @@ -206,10 +206,7 @@ def post_process( create_zarr_store(timesteps, group, merged) group = zarr.open_group(mapper, mode="a") - variables = VARIABLES + SFC_VARIABLES - logger.info(f"Variables to process: {variables}") - dataset_to_write: xr.Dataset = ds[list(variables)] - _write_to_store(group, dataset_to_write) + _write_to_store(group, merged) if __name__ == "__main__": From 2c5f2fe43e15f2d8816ad2de316fa71749740189 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 26 Mar 2020 02:27:11 +0000 Subject: [PATCH 74/81] fix key --- workflows/one_step_jobs/runfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 993e765ed3..7f699e78f0 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -197,7 +197,7 @@ def post_process( .chunk({"forecast_time": 1, "tile": 6, "step": 3}) ) - merged = xr.merge([sfc[SFC_VARIABLES], ds]) + merged = xr.merge([sfc[list(SFC_VARIABLES)], ds]) mapper = fsspec.get_mapper(store_url) if init: From 643757f22f23b484e79e628ab45479bf9aced527 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 21:06:08 +0000 Subject: [PATCH 75/81] pass index to write_zarr_store --- workflows/one_step_jobs/runfile.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index 7f699e78f0..ba9e348755 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -1,5 +1,5 @@ import os -from typing import Sequence, Mapping +from typing import Sequence, Mapping, cast, Hashable from fv3net import runtime import logging import time @@ -164,7 +164,7 @@ def _merge_monitor_data(paths: Mapping[str, str]) -> xr.Dataset: return xr.concat(datasets_no_time, dim="step").assign_coords(step=steps, time=time) -def _write_to_store(group: zarr.ABSStore, ds: xr.Dataset): +def _write_to_store(group: zarr.ABSStore, index: int, ds: xr.Dataset): for variable in ds: logger.info(f"Writing {variable} to {group}") dims = group[variable].attrs["_ARRAY_DIMENSIONS"][1:] @@ -172,6 +172,20 @@ def _write_to_store(group: zarr.ABSStore, ds: xr.Dataset): dask_arr.store(group[variable], regions=(index,)) +def _safe_get_variables(ds: xr.Dataset, variables: Sequence[Hashable]) -> xr.Dataset: + """ds[...] is very confusing function from a typing perspective and should be + avoided in long-running pipeline codes. This function introduces a type-stable + alternative that works better with mypy. + + In particular, ds[('a' , 'b' ,'c')] looks for a variable named ('a', 'b', 'c') which + usually doesn't exist, so it causes a key error. but ds[['a', 'b', 'c']] makes a + dataset only consisting of the variables 'a', 'b', and 'c'. This causes tons of + hard to find errors. + """ + variables = list(variables) + return cast(xr.Dataset, ds[variables]) + + def post_process( monitor_paths: Mapping[str, str], sfc_pattern: str, @@ -190,6 +204,7 @@ def post_process( sfc = xr.open_mfdataset(sfc_pattern, concat_dim="tile", combine="nested").pipe( rename_sfc_dt_atmos ) + sfc = _safe_get_variables(sfc, SFC_VARIABLES) ds = ( _merge_monitor_data(monitor_paths) @@ -197,7 +212,7 @@ def post_process( .chunk({"forecast_time": 1, "tile": 6, "step": 3}) ) - merged = xr.merge([sfc[list(SFC_VARIABLES)], ds]) + merged = xr.merge([sfc, ds]) mapper = fsspec.get_mapper(store_url) if init: @@ -206,7 +221,7 @@ def post_process( create_zarr_store(timesteps, group, merged) group = zarr.open_group(mapper, mode="a") - _write_to_store(group, merged) + _write_to_store(group, index, merged) if __name__ == "__main__": From d40b044c39e82a02efb14bc3234e644e9de8107d Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 26 Mar 2020 21:12:48 +0000 Subject: [PATCH 76/81] remove prototyping functions --- workflows/one_step_jobs/README.md | 26 ++++++++++++++++++++++++-- workflows/one_step_jobs/_run_steps.sh | 14 -------------- workflows/one_step_jobs/run_steps.sh | 0 3 files changed, 24 insertions(+), 16 deletions(-) delete mode 100644 workflows/one_step_jobs/_run_steps.sh delete mode 100644 workflows/one_step_jobs/run_steps.sh diff --git a/workflows/one_step_jobs/README.md b/workflows/one_step_jobs/README.md index 6c534f6b79..2bcde6ca42 100644 --- a/workflows/one_step_jobs/README.md +++ b/workflows/one_step_jobs/README.md @@ -16,8 +16,30 @@ This script is self-documenting and its help can be seen by running: python orchestrate_submit_jobs.py -h +# Minimal example + +Here is a minimal exmaple for how to run this script on a limited set of sample images. + +```sh +workdir=$(pwd) +src=gs://vcm-ml-data/orchestration-testing/test-andrep/coarsen_restarts_source-resolution_384_target-resolution_48/ +output=gs://vcm-ml-data/testing-noah/one-step +VERSION= +image=us.gcr.io/vcm-ml/prognostic_run:$VERSION +yaml=$PWD/deep-conv-off.yml + +gsutil -m rm -r $output > /dev/null + ( + cd ../../ + python $workdir/orchestrate_submit_jobs.py \ + $src $yaml $image $output -o \ + --config-version v0.3 + ) -### Kubernetes VM access troubleshooting +``` + + +# Kubernetes VM access troubleshooting To process many (> around 40) runs at once, it is recommended to submit this workflow from a VM authorized with a service account. Users have had issues with API request errors @@ -41,7 +63,7 @@ Use the following command to view your current configuration. It should point to kubectl config view ``` -### Out of Memory errors +# Out of Memory errors The one step jobs can be fail with OOMKilled errors if too many dask workers are used. These errors can typically be avoided by using the single-threaded diff --git a/workflows/one_step_jobs/_run_steps.sh b/workflows/one_step_jobs/_run_steps.sh deleted file mode 100644 index b688aed29d..0000000000 --- a/workflows/one_step_jobs/_run_steps.sh +++ /dev/null @@ -1,14 +0,0 @@ -workdir=$(pwd) - src=gs://vcm-ml-data/orchestration-testing/test-andrep/coarsen_restarts_source-resolution_384_target-resolution_48/ - output=gs://vcm-ml-data/testing-noah/one-step - image=us.gcr.io/vcm-ml/prognostic_run:v0.1.0-a1 - yaml=$PWD/deep-conv-off.yml - - gsutil -m rm -r $output > /dev/null - ( - cd ../../ - python $workdir/orchestrate_submit_jobs.py \ - $src $yaml $image $output -o \ - --config-version v0.3 - ) - diff --git a/workflows/one_step_jobs/run_steps.sh b/workflows/one_step_jobs/run_steps.sh deleted file mode 100644 index e69de29bb2..0000000000 From 4a953dae1770fbbb161511670473addd618eede3 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 21:13:33 +0000 Subject: [PATCH 77/81] lint --- workflows/one_step_jobs/runfile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index ba9e348755..4770e52185 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -173,13 +173,13 @@ def _write_to_store(group: zarr.ABSStore, index: int, ds: xr.Dataset): def _safe_get_variables(ds: xr.Dataset, variables: Sequence[Hashable]) -> xr.Dataset: - """ds[...] is very confusing function from a typing perspective and should be - avoided in long-running pipeline codes. This function introduces a type-stable + """ds[...] 
From 4a953dae1770fbbb161511670473addd618eede3 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 21:13:33 +0000 Subject: [PATCH 77/81] lint --- workflows/one_step_jobs/runfile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/one_step_jobs/runfile.py b/workflows/one_step_jobs/runfile.py index ba9e348755..4770e52185 100644 --- a/workflows/one_step_jobs/runfile.py +++ b/workflows/one_step_jobs/runfile.py @@ -173,13 +173,13 @@ def _write_to_store(group: zarr.ABSStore, index: int, ds: xr.Dataset): def _safe_get_variables(ds: xr.Dataset, variables: Sequence[Hashable]) -> xr.Dataset: - """ds[...] is a very confusing function from a typing perspective and should be + avoided in long-running pipeline code. This function introduces a type-stable alternative that works better with mypy. - In particular, ds[('a', 'b', 'c')] looks for a variable named ('a', 'b', 'c'), which - usually doesn't exist, so it raises a KeyError, but ds[['a', 'b', 'c']] makes a - dataset consisting only of the variables 'a', 'b', and 'c'. This causes tons of + In particular, ds[('a', 'b', 'c')] looks for a variable named ('a', 'b', 'c'), which + usually doesn't exist, so it raises a KeyError, but ds[['a', 'b', 'c']] makes a + dataset consisting only of the variables 'a', 'b', and 'c'. This causes tons of hard-to-find errors. """ variables = list(variables) From 92df97397150144671e167edc3eeddf47c26c9b3 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 22:29:44 +0000 Subject: [PATCH 78/81] bake the runfile into the submission script --- workflows/one_step_jobs/deep-conv-off.yml | 1 - workflows/one_step_jobs/orchestrate_submit_jobs.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/deep-conv-off.yml b/workflows/one_step_jobs/deep-conv-off.yml index 0c29d6ade8..78e2196f37 100644 --- a/workflows/one_step_jobs/deep-conv-off.yml +++ b/workflows/one_step_jobs/deep-conv-off.yml @@ -1,6 +1,5 @@ kubernetes: docker_image: us.gcr.io/vcm-ml/fv3gfs-python:v0.2.1 - runfile: workflows/one_step_jobs/runfile.py fv3config: diag_table: workflows/one_step_jobs/diag_table_one_step namelist: diff --git a/workflows/one_step_jobs/orchestrate_submit_jobs.py b/workflows/one_step_jobs/orchestrate_submit_jobs.py index 305747e2f5..5e87a76875 100644 --- a/workflows/one_step_jobs/orchestrate_submit_jobs.py +++ b/workflows/one_step_jobs/orchestrate_submit_jobs.py @@ -11,6 +11,8 @@ PWD = os.path.dirname(os.path.abspath(__file__)) CONFIG_DIRECTORY_NAME = "one_step_config" +RUNFILE = os.path.join(PWD, 'runfile.py') + def _create_arg_parser(): parser = argparse.ArgumentParser() @@ -101,6 +103,7 @@ def _create_arg_parser(): subsample_frequency=args.init_frequency, ) + one_step_config["kubernetes"]["runfile"] = RUNFILE one_step_config["kubernetes"]["docker_image"] = args.docker_image local_vgrid_file = os.path.join(PWD, one_step.VERTICAL_GRID_FILENAME) From d4fa24f2d3533aab9bb1bdb02b116d7ee9810362 Mon Sep 17 00:00:00 2001 From: "Noah D. 
Brenowitz" Date: Thu, 26 Mar 2020 22:52:05 +0000 Subject: [PATCH 79/81] print logging information --- fv3net/pipelines/kube_jobs/one_step.py | 4 ++++ workflows/end_to_end/full-workflow-config.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fv3net/pipelines/kube_jobs/one_step.py b/fv3net/pipelines/kube_jobs/one_step.py index f121986db6..b1a65f24dd 100644 --- a/fv3net/pipelines/kube_jobs/one_step.py +++ b/fv3net/pipelines/kube_jobs/one_step.py @@ -1,6 +1,7 @@ import logging import os import fsspec +import pprint from toolz import assoc import uuid import yaml @@ -280,6 +281,9 @@ def submit_jobs( """Submit one-step job for all timesteps in timestep_list""" zarr_url = os.path.join(output_url, "big.zarr") + + logger.info("Working on one-step jobs with arguments:") + logger.info(pprint.pformat(locals())) # kube kwargs are shared by all jobs kube_kwargs = get_run_kubernetes_kwargs(one_step_config["kubernetes"], config_url) diff --git a/workflows/end_to_end/full-workflow-config.yaml b/workflows/end_to_end/full-workflow-config.yaml index b0ed9717b8..f9aef398c1 100644 --- a/workflows/end_to_end/full-workflow-config.yaml +++ b/workflows/end_to_end/full-workflow-config.yaml @@ -38,7 +38,7 @@ experiment: restart_data: from: coarsen_restarts experiment_yaml: ./workflows/one_step_jobs/all-physics-off.yml - docker_image: us.gcr.io/vcm-ml/prognostic-run-orchestration + docker_image: us.gcr.io/vcm-ml/prognostic-run-orchestration:v0.1.0-a1 create_training_data: command: python -m fv3net.pipelines.create_training_data From 71cc1ecd41e5d04ecad6f3cf5b8b66984f166e72 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 22:53:42 +0000 Subject: [PATCH 80/81] lint --- workflows/one_step_jobs/orchestrate_submit_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/one_step_jobs/orchestrate_submit_jobs.py b/workflows/one_step_jobs/orchestrate_submit_jobs.py index 5e87a76875..9b623252d5 100644 --- a/workflows/one_step_jobs/orchestrate_submit_jobs.py +++ b/workflows/one_step_jobs/orchestrate_submit_jobs.py @@ -11,7 +11,7 @@ PWD = os.path.dirname(os.path.abspath(__file__)) CONFIG_DIRECTORY_NAME = "one_step_config" -RUNFILE = os.path.join(PWD, 'runfile.py') +RUNFILE = os.path.join(PWD, "runfile.py") def _create_arg_parser(): From 5a6fa366e625ebe98803980928bac7fb79f758a0 Mon Sep 17 00:00:00 2001 From: "Noah D. Brenowitz" Date: Thu, 26 Mar 2020 22:56:01 +0000 Subject: [PATCH 81/81] update yaml with brian's code --- workflows/end_to_end/full-workflow-config.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/end_to_end/full-workflow-config.yaml b/workflows/end_to_end/full-workflow-config.yaml index f9aef398c1..9959eb7de0 100644 --- a/workflows/end_to_end/full-workflow-config.yaml +++ b/workflows/end_to_end/full-workflow-config.yaml @@ -38,8 +38,9 @@ experiment: restart_data: from: coarsen_restarts experiment_yaml: ./workflows/one_step_jobs/all-physics-off.yml - docker_image: us.gcr.io/vcm-ml/prognostic-run-orchestration:v0.1.0-a1 - + docker_image: us.gcr.io/vcm-ml/prognostic_run:v0.1.0-a1 + --config-version: v0.3 + create_training_data: command: python -m fv3net.pipelines.create_training_data args: