Ouranosinc · aulemahal · Oct 18, 2024 · Oct 18, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -4,12 +4,13 @@ Changelog
 
 v0.11.0 (unreleased)
 --------------------
-Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`).
+Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`).
 
 New features and enhancements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 * ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`).
 * New function ``xs.get_warming_level_from_period`` to get the warming level associated with a given time horizon. (:pull:`474`).
+* ``xs.parse_directory`` can now skip whole folders through the new ``skip_dirs`` argument.
 
 Breaking changes
 ^^^^^^^^^^^^^^^^

diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py
@@ -124,6 +124,7 @@ def _find_assets(
     exts: set[str],
     lengths: set[int],
     dirglob: str | None = None,
+    skip_dirs: list[os.PathLike] | None = None,
 ):
     """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions.
 
@@ -138,7 +139,10 @@ def _find_assets(
     dirglob : str, optional
         A glob pattern. If given, only parent folders matching this pattern are walked through.
         This pattern can not include the asset's basename.
+    skip_dirs : list of Paths, optional
+        A list of directories to skip on the walk.
     """
+    skip_dirs = skip_dirs or []
     root = str(Path(root))  # to be sure
     for top, alldirs, files in os.walk(root):
         # Split zarr subdirectories from next iteration
@@ -147,6 +151,8 @@ def _find_assets(
             if dr.endswith(".zarr"):
                 zarrs.append(dr)
                 alldirs.remove(dr)
+            if Path(top).joinpath(dr) in skip_dirs:
+                alldirs.remove(dr)
 
         if (
             top != root
@@ -270,6 +276,7 @@ def _parse_dir(  # noqa: C901
     root: os.PathLike | str,
     patterns: list[str],
     dirglob: str | None = None,
+    skip_dirs: list[os.PathLike] | None = None,
     checks: list[str] | None = None,
     read_from_file: list[str] | dict | None = None,
     attrs_map: dict | None = None,
@@ -289,6 +296,8 @@ def _parse_dir(  # noqa: C901
     dirglob : str
         A glob pattern. If given, only parent folders matching this pattern are walked through.
         This pattern can not include the asset's basename.
+    skip_dirs : list of Paths, optional
+        A list of directories to skip in the walk. Passed as-is to ``_find_assets``, which compares them to ``Path`` objects: strings never match.
     checks: list of strings, optional
         A list of checks to perform, available values are:
         - "readable" : Check that the file is readable by the current user.
@@ -389,7 +398,7 @@ def parse_worker():
 
     # Skip the checks if none are requested (save some overhead)
     q = q_found if checks else q_checked
-    for path in _find_assets(Path(root), exts, lengths, dirglob):
+    for path in _find_assets(Path(root), exts, lengths, dirglob, skip_dirs):
         q.put(path)
 
     q_found.join()
@@ -465,6 +474,7 @@ def parse_directory(  # noqa: C901
     homogenous_info: dict | None = None,
     cvs: str | os.PathLike | dict | None = None,
     dirglob: str | None = None,
+    skip_dirs: list[str | os.PathLike] | None = None,
     xr_open_kwargs: Mapping[str, Any] | None = None,
     only_official_columns: bool = True,
     progress: bool = False,
@@ -506,6 +516,8 @@ def parse_directory(  # noqa: C901
     dirglob : str, optional
         A glob pattern for path matching to accelerate the parsing of a directory tree if only a subtree is needed.
         Only folders matching the pattern are parsed to find datasets.
+    skip_dirs : list of str or Paths, optional
+        A list of folders to skip during the search. Everything below a skipped folder is ignored as well.
     xr_open_kwargs: dict
         If needed, arguments to send xr.open_dataset() when opening the file to read the attributes.
     only_official_columns: bool
@@ -597,6 +609,7 @@ def parse_directory(  # noqa: C901
     parse_kwargs = dict(
         patterns=patterns,
         dirglob=dirglob,
+        skip_dirs=[Path(d) for d in (skip_dirs or [])],
         read_from_file=read_from_file if not read_file_groups else None,
         attrs_map=attrs_map,
         xr_open_kwargs=xr_open_kwargs,

diff --git a/tests/test_catutils.py b/tests/test_catutils.py
@@ -187,6 +187,43 @@ def test_parse_directory_idcols():
     assert (df["id"] == "example-region_NCC").all()
 
 
+@pytest.mark.requires_netcdf
+def test_parse_directory_skipdirs():  # folders listed in skip_dirs must not be parsed
+    df = cu.parse_directory(
+        directories=[str(SAMPLES_DIR)],
+        skip_dirs=[  # one path without and one with a trailing slash
+            str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp126",
+            str(SAMPLES_DIR) + "/ScenarioMIP/example-region/NCC/NorESM2-MM/ssp245/",
+        ],
+        patterns=[
+            "{activity}/{domain}/{institution}/{source}/{experiment}/{member:rev}/{frequency}/{?:_}.nc"
+        ],
+        homogenous_info={
+            "mip_era": "CMIP6",
+            "type": "simulation",
+            "processing_level": "raw",
+        },
+        read_from_file=["variable", "date_start", "date_end", "version"],
+        xr_open_kwargs={"engine": "h5netcdf"},
+        cvs={
+            "domain": {"example-region": "exreg"},
+            "attributes": {"version_id": "version"},
+        },
+        file_checks=["readable", "ncvalid"],
+    )
+
+    assert len(df) == 4
+    assert (df["activity"] == "ScenarioMIP").all()
+    assert (df["mip_era"] == "CMIP6").all()
+    assert (df["domain"] == "exreg").all()  # "example-region" renamed through cvs["domain"]
+    assert (
+        df[df["frequency"] == "fx"]["variable"] == ("sftlf",)
+    ).all()  # "variable" is listed in read_from_file
+    assert df.date_start.dtype == "<M8[ms]"
+    assert df.date_end.dtype == "<M8[ms]"
+    assert set(df.experiment.unique()) == {"ssp370", "ssp585"}  # ssp126 and ssp245 skipped
+
+
 def test_parse_from_ds():
     # Real ds
     ds = xr.tutorial.open_dataset("air_temperature")