Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip dirs in parse_directory #478

Merged
merged 1 commit into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ Changelog

v0.11.0 (unreleased)
--------------------
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`).
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`).

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`).
* New function ``xs.get_warming_level_from_period`` to get the warming level associated with a given time horizon. (:pull:`474`).
* Added the ability to skip whole folders in ``xs.parse_directory`` with the new ``skip_dirs`` argument. (:pull:`478`).

Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
15 changes: 14 additions & 1 deletion src/xscen/catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def _find_assets(
exts: set[str],
lengths: set[int],
dirglob: str | None = None,
skip_dirs: list[os.PathLike] | None = None,
):
"""Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions.

Expand All @@ -138,7 +139,10 @@ def _find_assets(
dirglob : str, optional
A glob pattern. If given, only parent folders matching this pattern are walked through.
This pattern can not include the asset's basename.
skip_dirs : list of Paths, optional
A list of directories to skip on the walk.
"""
skip_dirs = skip_dirs or []
root = str(Path(root)) # to be sure
for top, alldirs, files in os.walk(root):
# Split zarr subdirectories from next iteration
Expand All @@ -147,6 +151,8 @@ def _find_assets(
if dr.endswith(".zarr"):
zarrs.append(dr)
alldirs.remove(dr)
if Path(top).joinpath(dr) in skip_dirs:
alldirs.remove(dr)

if (
top != root
Expand Down Expand Up @@ -270,6 +276,7 @@ def _parse_dir( # noqa: C901
root: os.PathLike | str,
patterns: list[str],
dirglob: str | None = None,
skip_dirs: list[os.PathLike] | None = None,
checks: list[str] | None = None,
read_from_file: list[str] | dict | None = None,
attrs_map: dict | None = None,
Expand All @@ -289,6 +296,8 @@ def _parse_dir( # noqa: C901
dirglob : str
A glob pattern. If given, only parent folders matching this pattern are walked through.
This pattern can not include the asset's basename.
skip_dirs : list of strings or Paths, optional
A list of directories to skip in the walk.
checks: list of strings, optional
A list of checks to perform, available values are:
- "readable" : Check that the file is readable by the current user.
Expand Down Expand Up @@ -389,7 +398,7 @@ def parse_worker():

# Skip the checks if none are requested (save some overhead)
q = q_found if checks else q_checked
for path in _find_assets(Path(root), exts, lengths, dirglob):
for path in _find_assets(Path(root), exts, lengths, dirglob, skip_dirs):
q.put(path)

q_found.join()
Expand Down Expand Up @@ -465,6 +474,7 @@ def parse_directory( # noqa: C901
homogenous_info: dict | None = None,
cvs: str | os.PathLike | dict | None = None,
dirglob: str | None = None,
skip_dirs: list[str | os.PathLike] | None = None,
xr_open_kwargs: Mapping[str, Any] | None = None,
only_official_columns: bool = True,
progress: bool = False,
Expand Down Expand Up @@ -506,6 +516,8 @@ def parse_directory( # noqa: C901
dirglob : str, optional
A glob pattern for path matching to accelerate the parsing of a directory tree if only a subtree is needed.
Only folders matching the pattern are parsed to find datasets.
skip_dirs : list of str or Paths, optional
A list of folders that will be removed from the search.
xr_open_kwargs: dict
If needed, arguments to send xr.open_dataset() when opening the file to read the attributes.
only_official_columns: bool
Expand Down Expand Up @@ -597,6 +609,7 @@ def parse_directory( # noqa: C901
parse_kwargs = dict(
patterns=patterns,
dirglob=dirglob,
skip_dirs=[Path(d) for d in (skip_dirs or [])],
read_from_file=read_from_file if not read_file_groups else None,
attrs_map=attrs_map,
xr_open_kwargs=xr_open_kwargs,
Expand Down
37 changes: 37 additions & 0 deletions tests/test_catutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,43 @@ def test_parse_directory_idcols():
assert (df["id"] == "example-region_NCC").all()


@pytest.mark.requires_netcdf
def test_parse_directory_skipdirs():
    """Check that ``parse_directory`` leaves out the folders listed in ``skip_dirs``."""
    # Two experiment folders are excluded; one is written with a trailing
    # slash and the other without.
    excluded = [
        str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp126",
        str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp245/",
    ]
    path_patterns = [
        "{activity}/{domain}/{institution}/{source}/{experiment}/{member:rev}/{frequency}/{?:_}.nc"
    ]
    fixed_info = {
        "mip_era": "CMIP6",
        "type": "simulation",
        "processing_level": "raw",
    }
    vocab = {
        "domain": {"example-region": "exreg"},
        "attributes": {"version_id": "version"},
    }

    catalog = cu.parse_directory(
        directories=[str(SAMPLES_DIR)],
        skip_dirs=excluded,
        patterns=path_patterns,
        homogenous_info=fixed_info,
        read_from_file=["variable", "date_start", "date_end", "version"],
        xr_open_kwargs={"engine": "h5netcdf"},
        cvs=vocab,
        file_checks=["readable", "ncvalid"],
    )

    assert len(catalog) == 4

    # Columns filled from the path pattern and from ``homogenous_info``.
    assert (catalog["activity"] == "ScenarioMIP").all()
    assert (catalog["mip_era"] == "CMIP6").all()

    # Simple controlled-vocabulary replacement on the domain.
    assert (catalog["domain"] == "exreg").all()

    # The variable column is read from the file itself.
    fx_rows = catalog[catalog["frequency"] == "fx"]
    assert (fx_rows["variable"] == ("sftlf",)).all()

    assert catalog["date_start"].dtype == "<M8[ms]"
    assert catalog["date_end"].dtype == "<M8[ms]"

    # Only the experiments that were not skipped remain.
    assert set(catalog["experiment"].unique()) == {"ssp370", "ssp585"}


def test_parse_from_ds():
# Real ds
ds = xr.tutorial.open_dataset("air_temperature")
Expand Down
Loading