diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6d8b04c1..3523581c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,12 +4,13 @@ Changelog v0.11.0 (unreleased) -------------------- -Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`). +Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`). New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`). * New function ``xs.get_warming_level_from_period`` to get the warming level associated with a given time horizon. (:pull:`474`). +* Added ability to skip whole folders to ``xs.parse_directory`` with argument ``skip_dirs``. Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py index 44c8f2b3..5ed56357 100644 --- a/src/xscen/catutils.py +++ b/src/xscen/catutils.py @@ -124,6 +124,7 @@ def _find_assets( exts: set[str], lengths: set[int], dirglob: str | None = None, + skip_dirs: list[os.PathLike] | None = None, ): """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions. @@ -138,7 +139,10 @@ def _find_assets( dirglob : str, optional A glob pattern. If given, only parent folders matching this pattern are walked through. This pattern can not include the asset's basename. + skip_dirs : list of Paths, optional + A list of directories to skip on the walk. """ + skip_dirs = skip_dirs or [] root = str(Path(root)) # to be sure for top, alldirs, files in os.walk(root): # Split zarr subdirectories from next iteration @@ -147,6 +151,8 @@ def _find_assets( if dr.endswith(".zarr"): zarrs.append(dr) alldirs.remove(dr) + if Path(top).joinpath(dr) in skip_dirs: + alldirs.remove(dr) if ( top != root @@ -270,6 +276,7 @@ def _parse_dir( # noqa: C901 root: os.PathLike | str, patterns: list[str], dirglob: str | None = None, + skip_dirs: list[os.PathLike] | None = None, checks: list[str] | None = None, read_from_file: list[str] | dict | None = None, attrs_map: dict | None = None, @@ -289,6 +296,8 @@ def _parse_dir( # noqa: C901 dirglob : str A glob pattern. If given, only parent folders matching this pattern are walked through. This pattern can not include the asset's basename. + skip_dirs : list of strings or Paths, optional + A list of directories to skip in the walk. checks: list of strings, optional A list of checks to perform, available values are: - "readable" : Check that the file is readable by the current user. @@ -389,7 +398,7 @@ def parse_worker(): # Skip the checks if none are requested (save some overhead) q = q_found if checks else q_checked - for path in _find_assets(Path(root), exts, lengths, dirglob): + for path in _find_assets(Path(root), exts, lengths, dirglob, skip_dirs): q.put(path) q_found.join() @@ -465,6 +474,7 @@ def parse_directory( # noqa: C901 homogenous_info: dict | None = None, cvs: str | os.PathLike | dict | None = None, dirglob: str | None = None, + skip_dirs: list[str | os.PathLike] | None = None, xr_open_kwargs: Mapping[str, Any] | None = None, only_official_columns: bool = True, progress: bool = False, @@ -506,6 +516,8 @@ def parse_directory( # noqa: C901 dirglob : str, optional A glob pattern for path matching to accelerate the parsing of a directory tree if only a subtree is needed. Only folders matching the pattern are parsed to find datasets. + skip_dirs : list of str or Paths, optional + A list of folders that will be removed from the search. xr_open_kwargs: dict If needed, arguments to send xr.open_dataset() when opening the file to read the attributes. only_official_columns: bool @@ -597,6 +609,7 @@ def parse_directory( # noqa: C901 parse_kwargs = dict( patterns=patterns, dirglob=dirglob, + skip_dirs=[Path(d) for d in (skip_dirs or [])], read_from_file=read_from_file if not read_file_groups else None, attrs_map=attrs_map, xr_open_kwargs=xr_open_kwargs, diff --git a/tests/test_catutils.py b/tests/test_catutils.py index 8d5d792b..40c7c857 100644 --- a/tests/test_catutils.py +++ b/tests/test_catutils.py @@ -187,6 +187,43 @@ def test_parse_directory_idcols(): assert (df["id"] == "example-region_NCC").all() +@pytest.mark.requires_netcdf +def test_parse_directory_skipdirs(): + df = cu.parse_directory( + directories=[str(SAMPLES_DIR)], + skip_dirs=[ + str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp126", + str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp245/", + ], + patterns=[ + "{activity}/{domain}/{institution}/{source}/{experiment}/{member:rev}/{frequency}/{?:_}.nc" + ], + homogenous_info={ + "mip_era": "CMIP6", + "type": "simulation", + "processing_level": "raw", + }, + read_from_file=["variable", "date_start", "date_end", "version"], + xr_open_kwargs={"engine": "h5netcdf"}, + cvs={ + "domain": {"example-region": "exreg"}, + "attributes": {"version_id": "version"}, + }, + file_checks=["readable", "ncvalid"], + ) + + assert len(df) == 4 + assert (df["activity"] == "ScenarioMIP").all() + assert (df["mip_era"] == "CMIP6").all() + assert (df["domain"] == "exreg").all() # CVS simple + assert ( + df[df["frequency"] == "fx"]["variable"] == ("sftlf",) + ).all() # Read from file + assert df.date_start.dtype == "