diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/parquet/example_remote_dl_parquet.ipynb b/parquet/example_remote_dl_parquet.ipynb new file mode 100644 index 0000000..d6b8227 --- /dev/null +++ b/parquet/example_remote_dl_parquet.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pyarrow.parquet import ParquetFile\n", + "import dask.dataframe as dd\n", + "import os\n", + "import xarray as xr\n", + "import ujson\n", + "import pprint\n", + "\n", + "#%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# These fs options don't work for http... beware!\n", + "so = dict(mode=\"rb\", anon=True, default_fill_cache=False, default_cache_type=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def gen_json(u, fs, outf=None):\n", + " with fs.open(u, **so) as infile:\n", + " h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)\n", + " p = u.split(\"/\")\n", + " date = p[3]\n", + " fname = p[5]\n", + " if outf:\n", + " # outf = f'{json_dir}{date}.{fname}.json'\n", + " with open(outf, \"wb\") as f:\n", + " f.write(ujson.dumps(h5chunks.translate()).encode())\n", + " else:\n", + " return h5chunks.translate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# dir_files = [os.path.join(\"../short_range_18files\", files) for files in os.listdir(\"../short_range_18files\")]\n", + "# dir_files = [os.path.join(\"short_range_2files\", files) for files in os.listdir(\"short_range_2files\")]\n", + "# print(dir_files)\n", + "dir_files = [\n", + " \"nwm.t00z.short_range.channel_rt.f001.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f002.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f003.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f004.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f005.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f006.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f007.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f008.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f009.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f010.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f011.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f012.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f013.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f014.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f015.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f016.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f017.conus.nc\",\n", + " \"nwm.t00z.short_range.channel_rt.f018.conus.nc\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import fsspec\n", + "import xarray as xr\n", + "from kerchunk.hdf import SingleHdf5ToZarr\n", + "\n", + "fs = fsspec.filesystem(\"gcs\", anon=True)\n", + "\n", + "# https://storage.googleapis.com/national-water-model/nwm.20220911/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc\n", + "# gcs_url = \"gcs://national-water-model/nwm.20220911/short_range/nwm.t00z.short_range.channel_rt.f001.conus.nc\"\n", + "gcs_url = \"gcs://national-water-model/nwm.20220911/short_range/\"\n", + "\n", + "sr_h5 = []\n", + "for f in dir_files:\n", + " print(f)\n", + " sr_h5.append(gen_json(gcs_url + f, fs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "fds = []\n", + "for xj in sr_h5:\n", + " backend_args = {\n", + " \"consolidated\": False,\n", + " \"storage_options\": {\n", + " \"fo\": xj,\n", + " # Adding these options returns a properly dimensioned but otherwise null dataframe\n", + " # \"remote_protocol\": \"https\",\n", + " # \"remote_options\": {'anon':True}\n", + " },\n", + " }\n", + " fds.append(\n", + " xr.open_dataset(\n", + " \"reference://\",\n", + " engine=\"zarr\",\n", + " mask_and_scale=False,\n", + " backend_kwargs=backend_args,\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "ds = xr.concat(fds, dim=\"time\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "df = ds[\"streamflow\"].to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# df = df.streamflow\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "# df = pd.Series.to_frame(df)\n", + "\n", + "df.to_parquet(\n", + " \"../data/parquet_all_feature_ids.gzip\", engine=\"pyarrow\", compression=\"gzip\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "ParquetFile(\"../data/parquet_all_feature_ids.gzip\").metadata # num_columns: 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "%%time\n", + "data = dd.read_parquet(\n", + " \"../data/parquet_all_feature_ids.gzip\", storage_options={\"anon\": True}\n", + ")\n", + "data\n", + "result = data.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# result = result.loc[:, [101]]\n", + "result = result.loc[:, 100:1032]\n", + "# result= result.loc[:, :, 1000:11000]\n", + "# result= result.loc[:, :, 10000:110000]\n", + "result\n", + "r_xa = result.to_xarray()\n", + "r_xa\n", + "r_xa.plot.scatter(\"time\", \"streamflow\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/parquet/parquet-demo-short-range-18files.ipynb b/parquet/parquet-demo-short-range-18files.ipynb new file mode 100644 index 0000000..1ee1e8c --- /dev/null +++ b/parquet/parquet-demo-short-range-18files.ipynb @@ -0,0 +1,894 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pyarrow as pa\n", + "import dask\n", + "import fsspec\n", + "from datetime import datetime, timedelta\n", + "\n", + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "import os\n", + "import xarray as xr\n", + "import glob\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['short_range_parquet_2files/nwm.20220911_short_range_nwm.t00z.short_range.channel_rt.f001.conus.nc', 'short_range_parquet_2files/nwm.20220911_short_range_nwm.t00z.short_range.channel_rt.f002.conus.nc']\n" + ] + } + ], + "source": [ + "dir_files = [os.path.join(\"short_range_parquet_2files\", files) for files in os.listdir(\"short_range_parquet_2files\")]\n", + "print(dir_files)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
<xarray.Dataset>\n", + "Dimensions: (time: 2, reference_time: 1, feature_id: 2776738)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2022-09-11T01:00:00 2022-09-11T02:0...\n", + " * reference_time (reference_time) datetime64[ns] 2022-09-11\n", + " * feature_id (feature_id) int32 101 179 181 ... 1180001803 1180001804\n", + "Data variables:\n", + " crs (time) |S1 b'' b''\n", + " streamflow (time, feature_id) float64 0.18 0.01 0.01 ... 0.0 0.0 0.0\n", + " nudge (time, feature_id) float64 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0\n", + " velocity (time, feature_id) float64 0.08 0.1 0.1 ... 0.02 0.0 0.02\n", + " qSfcLatRunoff (time, feature_id) float64 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0\n", + " qBucket (time, feature_id) float64 0.00341 0.00858 ... 0.00016\n", + " qBtmVertRunoff (time, feature_id) float64 12.18 26.58 9.311 ... 0.531 0.758\n", + "Attributes: (12/19)\n", + " TITLE: OUTPUT FROM NWM v2.2\n", + " featureType: timeSeries\n", + " proj4: +proj=lcc +units=m +a=6370000.0 +b=6370000.0 ...\n", + " model_initialization_time: 2022-09-11_00:00:00\n", + " station_dimension: feature_id\n", + " model_output_valid_time: 2022-09-11_01:00:00\n", + " ... ...\n", + " model_configuration: short_range\n", + " dev_OVRTSWCRT: 1\n", + " dev_NOAH_TIMESTEP: 3600\n", + " dev_channel_only: 0\n", + " dev_channelBucket_only: 0\n", + " dev: dev_ prefix indicates development/internal me...
\n", + " | \n", + " | \n", + " | streamflow | \n", + "
---|---|---|---|
time | \n", + "reference_time | \n", + "feature_id | \n", + "\n", + " |
2022-09-11 01:00:00 | \n", + "2022-09-11 | \n", + "101 | \n", + "0.18 | \n", + "
179 | \n", + "0.01 | \n", + "||
181 | \n", + "0.01 | \n", + "||
183 | \n", + "0.01 | \n", + "||
185 | \n", + "0.01 | \n", + "||
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2022-09-11 02:00:00 | \n", + "2022-09-11 | \n", + "1180001800 | \n", + "0.00 | \n", + "
1180001801 | \n", + "0.00 | \n", + "||
1180001802 | \n", + "0.00 | \n", + "||
1180001803 | \n", + "0.00 | \n", + "||
1180001804 | \n", + "0.00 | \n", + "
5553476 rows × 1 columns
\n", + "\n", + " | \n", + " | \n", + " | streamflow | \n", + "
---|---|---|---|
time | \n", + "reference_time | \n", + "feature_id | \n", + "\n", + " |
2022-09-11 02:00:00 | \n", + "2022-09-11 | \n", + "101 | \n", + "0.18 | \n", + "
179 | \n", + "0.01 | \n", + "||
181 | \n", + "0.01 | \n", + "||
183 | \n", + "0.01 | \n", + "||
185 | \n", + "0.01 | \n", + "||
... | \n", + "... | \n", + "||
1180001800 | \n", + "0.00 | \n", + "||
1180001801 | \n", + "0.00 | \n", + "||
1180001802 | \n", + "0.00 | \n", + "||
1180001803 | \n", + "0.00 | \n", + "||
1180001804 | \n", + "0.00 | \n", + "
2776738 rows × 1 columns
\n", + "\n | \n | \n | streamflow | \n
---|---|---|---|
time | \nreference_time | \nfeature_id | \n\n |
2022-09-11 01:00:00 | \n2022-09-11 | \n101 | \n0.18 | \n
179 | \n0.01 | \n||
181 | \n0.01 | \n||
183 | \n0.01 | \n||
185 | \n0.01 | \n||
... | \n... | \n... | \n... | \n
2022-09-11 18:00:00 | \n2022-09-11 | \n1180001800 | \n0.00 | \n
1180001801 | \n0.00 | \n||
1180001802 | \n0.00 | \n||
1180001803 | \n0.00 | \n||
1180001804 | \n0.00 | \n
49981284 rows × 1 columns
\n