Skip to content

Commit

Permalink
Merge pull request #923 from ayushnag/dmrpp_bugs_and_tests
Browse files Browse the repository at this point in the history
Fix small open_virtual_dataset bugs
  • Loading branch information
betolink authored Jan 17, 2025
2 parents 91866ac + f39b151 commit 4d0721d
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 51 deletions.
11 changes: 6 additions & 5 deletions earthaccess/dmrpp_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def open_virtual_mfdataset(
import xarray as xr

if access == "direct":
fs = earthaccess.get_s3_filesystem(results=granules[0])
fs.storage_options["anon"] = False # type: ignore
fs = earthaccess.get_s3_filesystem(results=granules) # type: ignore
fs.storage_options["anon"] = False
else:
fs = earthaccess.get_fsspec_https_session()
if parallel:
Expand All @@ -114,7 +114,7 @@ def open_virtual_mfdataset(
filetype="dmrpp", # type: ignore
group=group,
indexes={},
reader_options={"storage_options": fs.storage_options}, # type: ignore
reader_options={"storage_options": fs.storage_options},
)
)
if preprocess is not None:
Expand All @@ -127,6 +127,7 @@ def open_virtual_mfdataset(
vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
if load:
refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
protocol = "s3" if "s3" in fs.protocol else fs.protocol
return xr.open_dataset(
"reference://",
engine="zarr",
Expand All @@ -135,8 +136,8 @@ def open_virtual_mfdataset(
"consolidated": False,
"storage_options": {
"fo": refs, # codespell:ignore
"remote_protocol": fs.protocol,
"remote_options": fs.storage_options, # type: ignore
"remote_protocol": protocol,
"remote_options": fs.storage_options,
},
},
)
Expand Down
12 changes: 11 additions & 1 deletion earthaccess/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import fsspec.utils
import s3fs

# import ipdb
import earthaccess


Expand All @@ -15,12 +16,19 @@ def _get_chunk_metadata(
) -> list[dict]:
from kerchunk.hdf import SingleHdf5ToZarr

if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict):
# WHY: dask serialization is doing something weird, it serializes the granule as a simple dict
# we need to cast it back to a DataGranule to get the convenience methods for parsing the data links
# TODO: ask James what is going on
granule = earthaccess.DataGranule(granule)

metadata = []
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
# ipdb.set_trace()

for url in granule.data_links(access=access):
with fs.open(url) as inf:
h5chunks = SingleHdf5ToZarr(inf, url)
h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore
m = h5chunks.translate()
metadata.append(m)

Expand Down Expand Up @@ -50,6 +58,8 @@ def consolidate_metadata(

# Get metadata for each granule
get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore

# ipdb.set_trace()
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore
chunks = sum(chunks, start=[])

Expand Down
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,19 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md"

[project.optional-dependencies]
kerchunk = [
"numpy >=1.26.4",
"kerchunk",
"dask",
"h5py >=3.6.0",
"h5netcdf",
"xarray",
"zarr >=2.12.0, <3.0.0a",
]
virtualizarr = [
"virtualizarr >=1.2.0"
"numpy >=1.26.4",
"zarr >=2.12.0, <3.0.0a",
"virtualizarr >=1.2.0",
"dask",
"h5py >=3.6.0",
]
dev = [
"bump-my-version >=0.10.0",
Expand All @@ -75,6 +79,8 @@ dev = [
"uv >=0.4.7",
]
test = [
"zarr >=2.12.0, <3.0.0a",
"numpy >=1.26.4",
"mypy >=1.11.2",
"pytest >=8.3",
"pytest-cov >=5.0",
Expand Down
45 changes: 14 additions & 31 deletions tests/integration/test_virtualizarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,24 @@
logger.info(f"earthaccess version: {earthaccess.__version__}")


@pytest.fixture(scope="module", params=["MUR25-JPL-L4-GLOB-v04.2"])
@pytest.fixture(
    scope="module",
    params=[
        "MUR25-JPL-L4-GLOB-v04.2",
        "AVHRR_OI-NCEI-L4-GLOB-v2.1",
        "M2T1NXSLV",
    ],
)
def granule(request):
    """Module-scoped fixture: one search hit for each parametrized collection short name."""
    # NOTE(review): ("2024") is just the string "2024", not a 1-tuple — the
    # parentheses are redundant. Confirm earthaccess.search_data accepts a
    # bare year string for `temporal`; if a tuple was intended this needs a
    # trailing comma or an explicit (start, end) pair.
    granules = earthaccess.search_data(
        count=1, temporal=("2024"), short_name=request.param
    )
    return granules[0]


def test_dmrpp(granule):
    """Check that a virtual dataset parsed from the .dmrpp sidecar matches
    one parsed directly from the HDF5/NetCDF granule itself."""
    from virtualizarr import open_virtual_dataset  # type: ignore

    https_fs = earthaccess.get_fsspec_https_session()
    granule_url = granule.data_links(access="indirect")[0]
    reader_opts = {"storage_options": https_fs.storage_options}

    # Parse the lightweight .dmrpp metadata sidecar...
    from_dmrpp = open_virtual_dataset(
        granule_url + ".dmrpp",
        filetype="dmrpp",  # type: ignore
        indexes={},
        reader_options=reader_opts,  # type: ignore
    )
    # ...and the full data file, as the reference result.
    from_file = open_virtual_dataset(
        granule_url,
        indexes={},
        reader_options=reader_opts,  # type: ignore
    )

    # TODO: replace with xr.testing when virtualizarr fill_val is fixed (https://github.com/zarr-developers/VirtualiZarr/issues/287)
    # and dmrpp deflateLevel (zlib compression level) is always present (https://github.com/OPENDAP/bes/issues/954)
    for name in from_dmrpp.variables:
        assert name in from_file.variables
        for attr in ("dims", "shape", "dtype"):
            assert getattr(from_dmrpp[name], attr) == getattr(from_file[name], attr)
        assert from_dmrpp[name].data.manifest == from_file[name].data.manifest
    assert set(from_dmrpp.coords) == set(from_file.coords)
    assert from_dmrpp.attrs == from_file.attrs
def test_open_virtual_dataset(granule):
    """Smoke test: the dmrpp can be found, parsed, and loaded for *granule*.

    Actual parser correctness is checked in the virtualizarr test suite.
    """
    for load in (False, True):
        vds = earthaccess.open_virtual_dataset(granule, load=load)
        assert vds is not None
35 changes: 23 additions & 12 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 4d0721d

Please sign in to comment.