diff --git a/envs/icechunk_env.yaml b/envs/icechunk_env.yaml
new file mode 100644
index 0000000..3f3ad48
--- /dev/null
+++ b/envs/icechunk_env.yaml
@@ -0,0 +1,11 @@
+name: icechunk
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - pip
+  - fastparquet
+  - s3fs
+  - pip:
+    - icechunk
+    - git+https://github.com/zarr-developers/VirtualiZarr
diff --git a/envs/reader_env.yaml b/envs/reader_env.yaml
new file mode 100644
index 0000000..e77e26a
--- /dev/null
+++ b/envs/reader_env.yaml
@@ -0,0 +1,11 @@
+name: reader
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - h5py
+  - xarray
+  - virtualizarr
+  - kerchunk
+  - fastparquet
+  - s3fs
\ No newline at end of file
diff --git a/reference_generation/netcdf4_s3_icechunk.py b/reference_generation/netcdf4_s3_icechunk.py
new file mode 100644
index 0000000..2307c49
--- /dev/null
+++ b/reference_generation/netcdf4_s3_icechunk.py
@@ -0,0 +1,77 @@
+# Generate a kerchunk parquet reference for a set of NetCDF4 files on S3, then
+# write that reference into an icechunk store.
+#
+# dask distributed icechunk s3fs git+https://github.com/mpiannucci/kerchunk@v3 https://github.com/zarr-developers/numcodecs@zarr3-codecs
+# TMP BRANCH https://github.com/zarr-developers/VirtualiZarr/pull/278
+
+# TODO: We can generalize all these funcs and just have input filelists OR subsets of intake catalogs
+# https://pangeo-data.github.io/pangeo-cmip6-cloud/accessing_data.html#manually-searching-the-catalog
+
+# TODO: For now, we are just doing a time concat, but we can merge across vars later
+
+import xarray as xr
+from virtualizarr import open_virtual_dataset
+import dask
+from dask.distributed import Client
+import pandas as pd
+
+
+client = Client(n_workers=16)
+
+# Keep this list in chronological order: the datasets are concatenated along
+# "time" in list order and, with indexes={}, nothing re-sorts them afterwards.
+urls = [
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_186101-187012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_187101-188012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_188101-189012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_189101-190012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_190101-191012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_191101-192012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_192101-193012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_193101-194012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_194101-195012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_195101-196012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_196101-197012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_197101-198012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_198101-199012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_199101-200012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_200101-201012.nc',
+    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_201101-201412.nc',
+]
+
+
+def process(filename):
+    """Open a single file as a virtual dataset with no indexes loaded."""
+    return open_virtual_dataset(filename, indexes={})
+
+
+delayed_results = [dask.delayed(process)(filename) for filename in urls]
+
+# compute delayed objects; the result tuple is in the same order as `urls`
+results = dask.compute(*delayed_results)
+
+# concat virtual datasets
+combined_vds = xr.concat(results, dim="time", coords="minimal", compat="override")
+
+# once icechunk PR is in, we can write to Zarr v3
+# for now, we write to parquet, then will RT to icechunk with 'icechunk' env
+ref_parquet_path = "../refs/netcdf4_s3_icechunk.parquet"
+combined_vds.virtualize.to_kerchunk(ref_parquet_path, format="parquet")
+
+
+###### WRITE ICECHUNK - THIS REQUIRES envs/icechunk_env.yaml #####
+
+# 1. read existing ref from the same path the parquet reference was written to
+vds = open_virtual_dataset(ref_parquet_path, filetype="kerchunk", indexes={})
+
+# 2. create an icechunk store
+# (imported here rather than at the top: only envs/icechunk_env.yaml lists icechunk)
+import icechunk
+
+storage_config = icechunk.StorageConfig.filesystem("./netcdf4_s3_icechunk")
+store = icechunk.IcechunkStore.create(storage_config)
+
+# 3. write to icechunk store
+vds.virtualize.to_icechunk(store)
+store.commit("init")
diff --git a/refs/netcdf4_s3_icechunk.parquet/.zmetadata b/refs/netcdf4_s3_icechunk.parquet/.zmetadata
new file mode 100644
index 0000000..c640e1f
--- /dev/null
+++ b/refs/netcdf4_s3_icechunk.parquet/.zmetadata
@@ -0,0 +1 @@
+{"metadata":{".zattrs":{"CCCma_model_hash":"55f484f90aff0e32c5a8e92a42c6b9ae7ffe6224","CCCma_parent_runid":"rc3.1-pictrl","CCCma_pycmor_hash":"33c30511acc319a98240633965a04ca99c26427e","CCCma_runid":"rc3.1-his10","Conventions":"CF-1.7 CMIP-6.2","YMDH_branch_time_in_child":"1850:01:01:00","YMDH_branch_time_in_parent":"5651:01:01:00","activity_id":"CMIP","branch_method":"Spin-up documentation","branch_time_in_child":0.0,"branch_time_in_parent":1387365.0,"cmor_version":"3.4.0","contact":"ec.cccma.info-info.ccmac.ec@canada.ca","creation_date":"2019-05-02T01:31:29Z","data_specs_version":"01.00.29","experiment":"all-forcing simulation of the recent past","experiment_id":"historical","forcing_index":1,"frequency":"mon","further_info_url":"https:\/\/furtherinfo.es-doc.org\/CMIP6.CCCma.CanESM5.historical.none.r10i1p1f1","grid":"ORCA1 tripolar grid, 1 deg with refinement to 1\/3 deg within 20 degrees of the equator; 361 x 290 longitude\/latitude; 45 vertical levels; top grid cell 0-6.19 m","grid_label":"gn","history":"2019-05-02T01:31:29Z ;rewrote data to be consistent with CMIP for variable uo found in table Omon.;\nOutput from $runid","initialization_index":1,"institution":"Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, 
Canada","institution_id":"CCCma","license":"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https:\/\/creativecommons.org\/licenses). Consult https:\/\/pcmdi.llnl.gov\/CMIP6\/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:\/\/\/pcmdi.llnl.gov\/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.","mip_era":"CMIP6","nominal_resolution":"100 km","parent_activity_id":"CMIP","parent_experiment_id":"piControl","parent_mip_era":"CMIP6","parent_source_id":"CanESM5","parent_time_units":"days since 1850-01-01 0:0:0.0","parent_variant_label":"r1i1p1f1","physics_index":1,"product":"model-output","realization_index":10,"realm":"ocean","references":"Geophysical Model Development Special issue on CanESM5 (https:\/\/www.geosci-model-dev.net\/special_issues.html)","source":"CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude\/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6\/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1\/3 deg within 20 degrees of the equator; 361 x 290 longitude\/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP 
prescribed carbonate chemistry\nseaIce: LIM2","source_id":"CanESM5","source_type":"AOGCM","sub_experiment":"none","sub_experiment_id":"none","table_id":"Omon","table_info":"Creation Date:(20 February 2019) MD5:374fbe5a2bcca535c40f7f23da271e49","title":"CanESM5 output prepared for CMIP6","tracking_id":"hdl:21.14100\/f7acdf15-c432-4233-8b75-fd667c5ed52f","variable_id":"uo","variant_label":"r10i1p1f1","version":"v20190429","coordinates":"time j lev i"},".zgroup":{"zarr_format":2},"i\/.zarray":{"shape":[360],"chunks":[360],"dtype":"