Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds open_datatree and load_datatree to the tutorial module #10082

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1590,6 +1590,8 @@ Tutorial

tutorial.open_dataset
tutorial.load_dataset
tutorial.open_datatree
tutorial.load_datatree

Testing
=======
Expand Down
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ v2025.02.0 (unreleased)

New Features
~~~~~~~~~~~~
- Added :py:func:`tutorial.open_datatree` and :py:func:`tutorial.load_datatree` (:pull:`10082`).
  By `Eni Awowale <https://github.com/eni-awowale>`_.
- Added :py:meth:`Coordinates.from_xindex` as convenience for creating a new :py:class:`Coordinates` object
directly from an existing Xarray index object if the latter supports it (:pull:`10000`)
By `Benoit Bovy <https://github.com/benbovy>`_.
Expand Down
37 changes: 26 additions & 11 deletions xarray/tests/test_tutorial.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
from __future__ import annotations

import pytest

from xarray import DataArray, tutorial
from xarray.tests import assert_identical, network
from xarray import DataArray, DataTree, tutorial
from xarray.testing import assert_identical
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated this to use the xarray.testing module's assert_identical because xarray.tests didn't support DataTree objects.

from xarray.tests import network


@network
class TestLoadDataset:
    """Network tests for ``tutorial.open_dataset`` using the 'tiny' dataset."""

    def test_download_from_github(self, tmp_path) -> None:
        # Download into a per-test cache dir so tests don't share state.
        cache_dir = tmp_path / tutorial._default_cache_dir_name
        ds = tutorial.open_dataset("tiny", cache_dir=cache_dir).load()
        tiny = DataArray(range(5), name="tiny").to_dataset()
        assert_identical(ds, tiny)

    def test_download_from_github_load_without_cache(
        self, tmp_path, monkeypatch
    ) -> None:
        cache_dir = tmp_path / tutorial._default_cache_dir_name

        # With cache=False the file is removed after loading; the result
        # must still match a cached open of the same dataset.
        ds_nocache = tutorial.open_dataset(
            "tiny", cache=False, cache_dir=cache_dir
        ).load()
        ds_cache = tutorial.open_dataset("tiny", cache_dir=cache_dir).load()
        assert_identical(ds_cache, ds_nocache)


@network
class TestLoadDataTree:
    """Network tests for ``tutorial.open_datatree`` using the 'tiny' dataset."""

    def test_download_from_github(self, tmp_path) -> None:
        # Download into a per-test cache dir so tests don't share state.
        cache_dir = tmp_path / tutorial._default_cache_dir_name
        ds = tutorial.open_datatree("tiny", cache_dir=cache_dir).load()
        # 'tiny' maps onto a single-node tree rooted at "/".
        tiny = DataTree.from_dict({"/": DataArray(range(5), name="tiny").to_dataset()})
        assert_identical(ds, tiny)

    def test_download_from_github_load_without_cache(
        self, tmp_path, monkeypatch
    ) -> None:
        cache_dir = tmp_path / tutorial._default_cache_dir_name

        # With cache=False the file is removed after loading; the result
        # must still match a cached open of the same dataset.
        ds_nocache = tutorial.open_datatree(
            "tiny", cache=False, cache_dir=cache_dir
        ).load()
        ds_cache = tutorial.open_datatree("tiny", cache_dir=cache_dir).load()
        assert_identical(ds_cache, ds_nocache)
139 changes: 139 additions & 0 deletions xarray/tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
import numpy as np

from xarray.backends.api import open_dataset as _open_dataset
from xarray.backends.api import open_datatree as _open_datatree
from xarray.core.dataarray import DataArray
from xarray.core.dataset import Dataset
from xarray.core.datatree import DataTree

if TYPE_CHECKING:
from xarray.backends.api import T_Engine
Expand Down Expand Up @@ -248,3 +250,140 @@ def scatter_example_dataset(*, seed: None | int = None) -> Dataset:
ds.B.attrs["units"] = "Bunits"

return ds


def open_datatree(
    name: str,
    cache: bool = True,
    cache_dir: None | str | os.PathLike = None,
    *,
    engine: T_Engine = None,
    **kws,
) -> DataTree:
    """
    Open a dataset as a `DataTree` from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Available datasets:

    * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z
    * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z
    * ``"air_temperature"``: NCEP reanalysis subset
    * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients
    * ``"basin_mask"``: Dataset with ocean basins marked using integers
    * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1
    * ``"rasm"``: Output of the Regional Arctic System Model (RASM)
    * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output
    * ``"tiny"``: small synthetic dataset with a 1D data variable
    * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK
    * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data
    * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages

    Parameters
    ----------
    name : str
        Name of the file containing the dataset.
        e.g. 'air_temperature'
    cache_dir : path-like, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls
    engine : str, optional
        The engine to use for opening the file; inferred from the file
        extension when not given.
    **kws : dict, optional
        Passed to xarray.open_datatree

    See Also
    --------
    tutorial.load_datatree
    open_datatree
    """
    try:
        import pooch
    except ImportError as e:
        raise ImportError(
            "tutorial.open_datatree depends on pooch to download and manage datasets."
            " To proceed please install pooch."
        ) from e

    # Silence pooch's per-download INFO messages.
    logger = pooch.get_logger()
    logger.setLevel("WARNING")

    cache_dir = _construct_cache_dir(cache_dir)
    if name in external_urls:
        url = external_urls[name]
    else:
        path = pathlib.Path(name)
        if not path.suffix:
            # process the name: extension-less names default to netCDF files,
            # so an installed netCDF-capable engine is required.
            default_extension = ".nc"
            if engine is None:
                _check_netcdf_engine_installed(name)
            path = path.with_suffix(default_extension)
        elif path.suffix == ".grib":
            if engine is None:
                engine = "cfgrib"
                try:
                    import cfgrib  # noqa: F401
                except ImportError as e:
                    raise ImportError(
                        "Reading this tutorial dataset requires the cfgrib package."
                    ) from e

        url = f"{base_url}/raw/{version}/{path.name}"

    # Identify ourselves to the server with the xarray version.
    headers = {"User-Agent": f"xarray {sys.modules['xarray'].__version__}"}
    downloader = pooch.HTTPDownloader(headers=headers)

    # retrieve the file (pooch reuses a locally cached copy when present)
    filepath = pooch.retrieve(
        url=url, known_hash=None, path=cache_dir, downloader=downloader
    )
    ds = _open_datatree(filepath, engine=engine, **kws)
    if not cache:
        # Load eagerly and delete the downloaded file so nothing is cached.
        ds = ds.load()
        pathlib.Path(filepath).unlink()

    return ds


def load_datatree(*args, **kwargs) -> DataTree:
    """
    Open, load into memory (as a `DataTree`), and close a dataset from the online repository
    (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Available datasets:

    * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z
    * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z
    * ``"air_temperature"``: NCEP reanalysis subset
    * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients
    * ``"basin_mask"``: Dataset with ocean basins marked using integers
    * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1
    * ``"rasm"``: Output of the Regional Arctic System Model (RASM)
    * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output
    * ``"tiny"``: small synthetic dataset with a 1D data variable
    * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK
    * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data
    * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages

    Parameters
    ----------
    name : str
        Name of the file containing the dataset.
        e.g. 'air_temperature'
    cache_dir : path-like, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls
    **kws : dict, optional
        Passed to xarray.open_datatree

    See Also
    --------
    tutorial.open_datatree
    open_datatree
    """
    # Open lazily, force everything into memory, then release the
    # underlying file handle regardless of whether load() raised.
    tree = open_datatree(*args, **kwargs)
    try:
        return tree.load()
    finally:
        tree.close()
Loading