From 59005be8c94e28cabaea2a91a6eb737498f737cb Mon Sep 17 00:00:00 2001 From: Max Grover Date: Wed, 15 Sep 2021 10:51:46 -0600 Subject: [PATCH 1/2] add initial catalog gen notebook --- environment/environment.yml | 1 + notebooks/catalog-generation.ipynb | 195 +++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 notebooks/catalog-generation.ipynb diff --git a/environment/environment.yml b/environment/environment.yml index 4ff13fc..05371ad 100644 --- a/environment/environment.yml +++ b/environment/environment.yml @@ -54,4 +54,5 @@ dependencies: - pip: - git+https://github.com/NCAR/esmlab.git - git+https://github.com/NCAR/esmlab-regrid.git + - ecgtools \ No newline at end of file diff --git a/notebooks/catalog-generation.ipynb b/notebooks/catalog-generation.ipynb new file mode 100644 index 0000000..3bdc848 --- /dev/null +++ b/notebooks/catalog-generation.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc8d1154-c8a2-40f0-b218-035912eef869", + "metadata": {}, + "source": [ + "# Generate an [Intake-ESM](https://intake-esm.readthedocs.io/en/latest/) Catalog Using [ECGTools](https://ecgtools.readthedocs.io/en/latest/)\n", + "In this notebook, we use the data directory specified in `_config_calc.yml` to build a data catalog to be used throughout the analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "6809d32c-440c-4e19-9250-eea90f0e7e4e", + "metadata": {}, + "outputs": [], + "source": [ + "import yaml\n", + "\n", + "from ecgtools import Builder\n", + "from ecgtools.parsers.cesm import parse_cesm_timeseries" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "afd8b5bc-2307-423c-87ec-8a0bee002ea9", + "metadata": {}, + "outputs": [], + "source": [ + "with open('_config_calc.yml') as fid:\n", + " config_dict = yaml.load(fid, Loader=yaml.Loader)" + ] + }, + { + "cell_type": "markdown", + "id": "b4e00d81-a5ea-4b44-967e-18e3a8ffe4ce", + "metadata": {}, + "source": [ + "## 
Setup the Builder\n", + "We set up the builder object here - specifying the data directory within the `_config_calc.yml` file" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bd7a2533-2fa5-4d7e-8fca-4fdf981421c7", + "metadata": {}, + "outputs": [], + "source": [ + "b = Builder(config_dict['esm_data_dir'])" + ] + }, + { + "cell_type": "markdown", + "id": "3302e629-5a20-4b90-b805-ecbb57300086", + "metadata": {}, + "source": [ + "## Build the Catalog\n", + "When we build the catalog, we specify to use the `parse_cesm_timeseries` parser" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "52a229db-3b87-447d-a914-4aa41d69e385", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 1 out of 1 | elapsed: 0.0s finished\n", + "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 24 tasks | elapsed: 1.3s\n", + "[Parallel(n_jobs=-1)]: Done 43 out of 43 | elapsed: 1.7s finished\n" + ] + }, + { + "data": { + "text/plain": [ + "Builder(root_path=PosixPath('/glade/scratch/mclong/cesm2-marbl-data_nc'), extension='.nc', depth=0, exclude_patterns=None, njobs=-1)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b.build(parse_cesm_timeseries)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "95de1cf4-573f-4144-b3a2-a76d5b4d91cc", + "metadata": {}, + "outputs": [], + "source": [ + "def add_experiment_to_dataframe(df):\n", + " case_split = df.case.str.split('.', expand=True)\n", + " experiment = case_split.iloc[:, 1] + '.' 
+ case_split.iloc[:, 2]\n", + " df['experiment'] = experiment.fillna('historical')\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "fe73027a-4c34-4b37-89c1-13e29908e4db", + "metadata": {}, + "outputs": [], + "source": [ + "b.df = add_experiment_to_dataframe(b.df)" + ] + }, + { + "cell_type": "markdown", + "id": "c183e135-7809-48bb-9d04-67bf83f17fb6", + "metadata": {}, + "source": [ + "## Save the Catalog\n", + "Now that we have built the catalog, let's save it to disk, using the file name specified in `_config_calc.yml`" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "5f02662c-f43f-42c4-8a13-f867a63cc3b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved catalog location: data/cesm2-cmip6-timeseries.json and data/cesm2-cmip6-timeseries.json\n" + ] + } + ], + "source": [ + "b.save(\n", + " config_dict['esm_collection'],\n", + " # Column name including filepath\n", + " path_column_name='path',\n", + " # Column name including variables\n", + " variable_column_name='variable',\n", + " # Data file format - could be netcdf or zarr (in this case, netcdf)\n", + " data_format=\"netcdf\",\n", + " # Which attributes to groupby when reading in variables using intake-esm\n", + " groupby_attrs=[\"component\", \"experiment\", \"stream\"],\n", + " # Aggregations which are fed into xarray when reading in data using intake\n", + " aggregations=[\n", + " {\n", + " \"type\": \"join_existing\",\n", + " \"attribute_name\": \"time_range\",\n", + " \"options\": {\"dim\": \"time\", \"coords\": \"minimal\", \"compat\": \"override\"},\n", + " }\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fe95b3f-70c5-4c66-9731-3f5233b118fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:miniconda3-cesm2-marbl]", + "language": "python", + "name": 
"conda-env-miniconda3-cesm2-marbl-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5341c73c83f12cf7ac917dd7d1957c0a92535759 Mon Sep 17 00:00:00 2001 From: Max Grover Date: Wed, 15 Sep 2021 10:53:54 -0600 Subject: [PATCH 2/2] update config calc --- notebooks/_config_calc.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/notebooks/_config_calc.yml b/notebooks/_config_calc.yml index 0517af7..3878ef1 100644 --- a/notebooks/_config_calc.yml +++ b/notebooks/_config_calc.yml @@ -1,6 +1,7 @@ project_kernel: cesm2-marbl notebooks: pre_notebooks: + - generate-catalog - _data-Ncycle - _data-nutrient-plots - _data-mld-obs @@ -23,7 +24,8 @@ notebooks: - transient-fgco2 - transient-biological-pump -esm_collection: data/campaign-cesm2-cmip6-timeseries.json +esm_data_dir: /glade/scratch/mclong/cesm2-marbl-data_nc +esm_collection: data/cesm2-cmip6-timeseries.json cache_dir: /glade/p/cgd/oce/projects/cesm2-marbl/funnel-cache data_collections: epoch_mean: