Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Netcdf4-icechunk-example #13

Merged
merged 2 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions envs/icechunk_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Conda environment for the Icechunk write step; the "WRITE ICECHUNK" section
# of reference_generation/netcdf4_s3_icechunk.py states that it requires this file.
name: icechunk
channels:
  - conda-forge
  - nodefaults
dependencies:
  - pip
  - fastparquet
  - s3fs
  # Everything nested under `pip:` is installed with pip, not conda.
  - pip:
      - icechunk
      # No branch/tag/commit is given, so pip installs the repository's default branch.
      - git+https://github.com/zarr-developers/VirtualiZarr
11 changes: 11 additions & 0 deletions envs/reader_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Conda environment for reading the source NetCDF files and generating references.
# NOTE(review): presumably this is the env used for the first (kerchunk/parquet)
# half of reference_generation/netcdf4_s3_icechunk.py - confirm.
name: reader
channels:
  - conda-forge
  - nodefaults
dependencies:
  - h5py
  - xarray
  - virtualizarr
  - kerchunk
  - fastparquet
  - s3fs
  # Imported directly by reference_generation/netcdf4_s3_icechunk.py
  # (`import dask`, `from dask.distributed import Client`).
  - dask
  - distributed
87 changes: 87 additions & 0 deletions reference_generation/netcdf4_s3_icechunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@


# dask distributed icechunk s3fs git+https://github.com/mpiannucci/kerchunk@v3 https://github.com/zarr-developers/numcodecs@zarr3-codecs
# TMP BRANCH https://github.com/zarr-developers/VirtualiZarr/pull/278


# TODO: Generalize these functions to accept either explicit file lists OR subsets of intake catalogs:
# https://pangeo-data.github.io/pangeo-cmip6-cloud/accessing_data.html#manually-searching-the-catalog

# TODO:
# For now we only concatenate along time; merging across variables can be added later.

import xarray as xr
from virtualizarr import open_virtual_dataset
import dask
from dask.distributed import Client
import pandas as pd


# Start a local Dask cluster with 16 workers for the dask.delayed tasks below.
client = Client(n_workers=16)
# A bare `client` expression only renders in a notebook; print it so the
# cluster summary is also visible when this file is run as a script.
print(client)

# Input NetCDF files, one per time slice of the CanESM5 historical `uo` run.
# They are listed in chronological order because the virtual datasets are later
# concatenated along "time" in list order; an out-of-order entry would produce
# a non-monotonic time axis in the combined references.
urls = [
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_186101-187012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_187101-188012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_188101-189012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_189101-190012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_190101-191012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_191101-192012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_192101-193012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_193101-194012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_194101-195012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_195101-196012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_196101-197012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_197101-198012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_198101-199012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_199101-200012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_200101-201012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_201101-201412.nc',
]


def process(filename):
    """Open a single source file as a virtual dataset.

    Args:
        filename: URL or path passed straight through to
            ``open_virtual_dataset``.

    Returns:
        Whatever ``open_virtual_dataset(filename, indexes={})`` returns.
    """
    # NOTE(review): indexes={} presumably stops xarray from building in-memory
    # indexes for the coordinates - confirm against the VirtualiZarr docs.
    return open_virtual_dataset(filename, indexes={})


# Wrap `process` once, then build one lazy task per input URL.
_lazy_process = dask.delayed(process)
delayed_results = [_lazy_process(url) for url in urls]

# Execute all tasks; dask.compute hands back one result per task.
results = dask.compute(*delayed_results)

# Join the per-file virtual datasets along the time dimension, in list order.
combined_vds = xr.concat(
    list(results),
    dim="time",
    coords="minimal",
    compat="override",
)

# Once the icechunk PR is in, this can write Zarr v3 directly.
# Until then the references are written as parquet here and are converted to
# icechunk in a second step that runs in the 'icechunk' env.
ref_parquet_path = "../refs/netcdf4_s3_icechunk.parquet"
combined_vds.virtualize.to_kerchunk(ref_parquet_path, format="parquet")



###### WRITE ICECHUNK - THIS REQUIRES envs/icechunk_env.yaml #####

from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
import icechunk

# Both locations are anchored at ../refs so that this step reads exactly the
# parquet references written by the kerchunk step earlier in this script and
# places the Icechunk store next to them, regardless of which half is run.
ref_parquet_path = "../refs/netcdf4_s3_icechunk.parquet"
icechunk_store_path = "../refs/netcdf4_s3_icechunk"

# 1. read existing ref
vds = open_virtual_dataset(ref_parquet_path, filetype="kerchunk", indexes={})

# 2. create an icechunk store on the local filesystem
storage_config = icechunk.StorageConfig.filesystem(icechunk_store_path)
store = icechunk.IcechunkStore.create(storage_config)

# 3. write the virtual dataset into the store and commit it
vds.virtualize.to_icechunk(store)
store.commit("init")




1 change: 1 addition & 0 deletions refs/netcdf4_s3_icechunk.parquet/.zmetadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"metadata":{".zattrs":{"CCCma_model_hash":"55f484f90aff0e32c5a8e92a42c6b9ae7ffe6224","CCCma_parent_runid":"rc3.1-pictrl","CCCma_pycmor_hash":"33c30511acc319a98240633965a04ca99c26427e","CCCma_runid":"rc3.1-his10","Conventions":"CF-1.7 CMIP-6.2","YMDH_branch_time_in_child":"1850:01:01:00","YMDH_branch_time_in_parent":"5651:01:01:00","activity_id":"CMIP","branch_method":"Spin-up documentation","branch_time_in_child":0.0,"branch_time_in_parent":1387365.0,"cmor_version":"3.4.0","contact":"[email protected]","creation_date":"2019-05-02T01:31:29Z","data_specs_version":"01.00.29","experiment":"all-forcing simulation of the recent past","experiment_id":"historical","forcing_index":1,"frequency":"mon","further_info_url":"https:\/\/furtherinfo.es-doc.org\/CMIP6.CCCma.CanESM5.historical.none.r10i1p1f1","grid":"ORCA1 tripolar grid, 1 deg with refinement to 1\/3 deg within 20 degrees of the equator; 361 x 290 longitude\/latitude; 45 vertical levels; top grid cell 0-6.19 m","grid_label":"gn","history":"2019-05-02T01:31:29Z ;rewrote data to be consistent with CMIP for variable uo found in table Omon.;\nOutput from $runid","initialization_index":1,"institution":"Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada","institution_id":"CCCma","license":"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https:\/\/creativecommons.org\/licenses). Consult https:\/\/pcmdi.llnl.gov\/CMIP6\/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:\/\/\/pcmdi.llnl.gov\/. 
The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.","mip_era":"CMIP6","nominal_resolution":"100 km","parent_activity_id":"CMIP","parent_experiment_id":"piControl","parent_mip_era":"CMIP6","parent_source_id":"CanESM5","parent_time_units":"days since 1850-01-01 0:0:0.0","parent_variant_label":"r1i1p1f1","physics_index":1,"product":"model-output","realization_index":10,"realm":"ocean","references":"Geophysical Model Development Special issue on CanESM5 (https:\/\/www.geosci-model-dev.net\/special_issues.html)","source":"CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude\/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6\/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1\/3 deg within 20 degrees of the equator; 361 x 290 longitude\/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2","source_id":"CanESM5","source_type":"AOGCM","sub_experiment":"none","sub_experiment_id":"none","table_id":"Omon","table_info":"Creation Date:(20 February 2019) MD5:374fbe5a2bcca535c40f7f23da271e49","title":"CanESM5 output prepared for CMIP6","tracking_id":"hdl:21.14100\/f7acdf15-c432-4233-8b75-fd667c5ed52f","variable_id":"uo","variant_label":"r10i1p1f1","version":"v20190429","coordinates":"time j lev i"},".zgroup":{"zarr_format":2},"i\/.zarray":{"shape":[360],"chunks":[360],"dtype":"<i4","fill_value":-2147483647,"order":"C","compressor":null,"filters":null,"zarr_format":2},"i\/.zattrs":{"long_name":"cell 
index along first dimension","units":"1","_ARRAY_DIMENSIONS":["i"]},"j\/.zarray":{"shape":[291],"chunks":[291],"dtype":"<i4","fill_value":-2147483647,"order":"C","compressor":null,"filters":null,"zarr_format":2},"j\/.zattrs":{"long_name":"cell index along second dimension","units":"1","_ARRAY_DIMENSIONS":["j"]},"latitude\/.zarray":{"shape":[1980,291,360],"chunks":[1,291,360],"dtype":"<f8","fill_value":1e+20,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"latitude\/.zattrs":{"bounds":"vertices_latitude","long_name":"latitude","missing_value":1e+20,"standard_name":"latitude","units":"degrees_north","_ARRAY_DIMENSIONS":["time","j","i"]},"lev\/.zarray":{"shape":[45],"chunks":[45],"dtype":"<f8","fill_value":9.969209968386869e+36,"order":"C","compressor":null,"filters":null,"zarr_format":2},"lev\/.zattrs":{"axis":"Z","bounds":"lev_bnds","long_name":"ocean depth coordinate","positive":"down","standard_name":"depth","units":"m","_ARRAY_DIMENSIONS":["lev"]},"lev_bnds\/.zarray":{"shape":[1980,45,2],"chunks":[1,45,2],"dtype":"<f8","fill_value":9.969209968386869e+36,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"lev_bnds\/.zattrs":{"_ARRAY_DIMENSIONS":["time","lev","bnds"]},"longitude\/.zarray":{"shape":[1980,291,360],"chunks":[1,291,360],"dtype":"<f8","fill_value":1e+20,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"longitude\/.zattrs":{"bounds":"vertices_longitude","long_name":"longitude","missing_value":1e+20,"standard_name":"longitude","units":"degrees_east","_ARRAY_DIMENSIONS":["time","j","i"]},"time\/.zarray":{"shape":[1980],"chunks":[1],"dtype":"<f8","fill_value":NaN,"order":"C","compressor":null,"filters":null,"zarr_format":2},"time\/.zattrs":{"axis":"T","bounds":"time_bnds","calendar":"365_day","long_name":"time","standard_name":"time","units":"days since 1850-01-01 
0:0:0.0","_ARRAY_DIMENSIONS":["time"]},"time_bnds\/.zarray":{"shape":[1980,2],"chunks":[1,2],"dtype":"<f8","fill_value":9.969209968386869e+36,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"time_bnds\/.zattrs":{"_ARRAY_DIMENSIONS":["time","bnds"]},"uo\/.zarray":{"shape":[1980,45,291,360],"chunks":[1,21,145,109],"dtype":"<f4","fill_value":1.0000000200408773e+20,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"uo\/.zattrs":{"cell_methods":"time: mean","comment":"Prognostic x-ward velocity component resolved by the model.","coordinates":"latitude longitude","long_name":"Sea Water X Velocity","missing_value":1.0000000200408773e+20,"original_name":"vozocrtx","standard_name":"sea_water_x_velocity","units":"m s-1","_ARRAY_DIMENSIONS":["time","lev","j","i"]},"vertices_latitude\/.zarray":{"shape":[1980,291,360,4],"chunks":[1,291,360,2],"dtype":"<f8","fill_value":1e+20,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"vertices_latitude\/.zattrs":{"missing_value":1e+20,"units":"degrees_north","_ARRAY_DIMENSIONS":["time","j","i","vertices"]},"vertices_longitude\/.zarray":{"shape":[1980,291,360,4],"chunks":[1,291,360,2],"dtype":"<f8","fill_value":1e+20,"order":"C","compressor":null,"filters":[{"id":"zlib","level":1}],"zarr_format":2},"vertices_longitude\/.zattrs":{"missing_value":1e+20,"units":"degrees_east","_ARRAY_DIMENSIONS":["time","j","i","vertices"]}},"record_size":100000}
Binary file added refs/netcdf4_s3_icechunk.parquet/i/refs.0.parq
Binary file not shown.
Binary file added refs/netcdf4_s3_icechunk.parquet/j/refs.0.parq
Binary file not shown.
Binary file not shown.
Binary file added refs/netcdf4_s3_icechunk.parquet/lev/refs.0.parq
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added refs/netcdf4_s3_icechunk.parquet/time/refs.0.parq
Binary file not shown.
Binary file not shown.
Binary file added refs/netcdf4_s3_icechunk.parquet/uo/refs.0.parq
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions refs/netcdf4_s3_icechunk/refs/branch.main/ZZZZZZZY.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"M8XKZ85VEXS4XXBBR6N0"}
1 change: 1 addition & 0 deletions refs/netcdf4_s3_icechunk/refs/branch.main/ZZZZZZZZ.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"FPE0KG1ZFVQBWJNANW0G"}
Binary file not shown.
Binary file not shown.