Skip to content

Commit

Permalink
Implement catalog filter for KedroDataCatalog (#4449)
Browse files Browse the repository at this point in the history
* Fixed catalog list for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Replaced solution

Signed-off-by: Elena Khaustova <[email protected]>

* Updated solution and made it on the catalog side

Signed-off-by: Elena Khaustova <[email protected]>

* Updated internal datasets access for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed __getattribute__

Signed-off-by: Elena Khaustova <[email protected]>

* Added test template

Signed-off-by: Elena Khaustova <[email protected]>

* Updated solution and test

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Implemented a draft of filtering method

Signed-off-by: Elena Khaustova <[email protected]>

* Updated filter

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed lint

Signed-off-by: Elena Khaustova <[email protected]>

* Updated old list method

Signed-off-by: Elena Khaustova <[email protected]>

* Implemented tests for new filter

Signed-off-by: Elena Khaustova <[email protected]>

* Added tests for lazy datasets

Signed-off-by: Elena Khaustova <[email protected]>

* Added docstrings and usage examples

Signed-off-by: Elena Khaustova <[email protected]>

* Updated examples in the docstrings

Signed-off-by: Elena Khaustova <[email protected]>

* Updated lazy dataset representation

Signed-off-by: Elena Khaustova <[email protected]>

* Updated unit tests

Signed-off-by: Elena Khaustova <[email protected]>

* Updated tests to reach coverage

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated _LazyDataset representation

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Added default value to the docstrings

Signed-off-by: Elena Khaustova <[email protected]>

* Renamed _compile_pattern to _compile_regex_pattern

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated secrets baseline

Signed-off-by: Elena Khaustova <[email protected]>

* Added by_type filter

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed bugs found when testing

Signed-off-by: Elena Khaustova <[email protected]>

* Updated tests

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

* Updated docstring

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated function to accept compiled patterns

Signed-off-by: Elena Khaustova <[email protected]>

* Updated unit tests

Signed-off-by: Elena Khaustova <[email protected]>

* Removed bad regex test

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

---------

Signed-off-by: Elena Khaustova <[email protected]>
  • Loading branch information
ElenaKhaustova authored Feb 7, 2025
1 parent a47677d commit 06d5a69
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,9 @@
"filename": "tests/io/test_kedro_data_catalog.py",
"hashed_secret": "15dd2c9ccec914f1470b4dccb45789844e49cf70",
"is_verified": false,
"line_number": 501
"line_number": 560
}
]
},
"generated_at": "2025-01-28T14:51:20Z"
"generated_at": "2025-02-06T19:46:16Z"
}
2 changes: 2 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Upcoming Release

## Major features and improvements
* Added `KedroDataCatalog.filter()` to filter datasets by name and type.

## Bug fixes and other changes
* Updated `_LazyDataset` representation when printing `KedroDataCatalog`.

Expand Down
82 changes: 80 additions & 2 deletions kedro/io/kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,11 +551,88 @@ def add(
)
self.__setitem__(ds_name, dataset)

def filter(
self,
name_regex: re.Pattern[str] | str | None = None,
type_regex: re.Pattern[str] | str | None = None,
by_type: type | list[type] | None = None,
) -> List[str]: # noqa: UP006
"""Filter dataset names registered in the catalog based on name and/or type.
This method allows filtering datasets by their names and/or types. Regular expressions
should be precompiled before passing them to `name_regex` or `type_regex`, but plain
strings are also supported.
Args:
name_regex: Optional compiled regex pattern or string to filter dataset names.
type_regex: Optional compiled regex pattern or string to filter dataset types.
The provided regex is matched against the full dataset type path, for example:
`kedro_datasets.pandas.parquet_dataset.ParquetDataset`.
by_type: Optional dataset type(s) to filter by. This performs an instance type check
rather than a regex match. It can be a single dataset type or a list of types.
Returns:
A list of dataset names that match the filtering criteria.
Example:
::
>>> import re
>>> catalog = KedroDataCatalog()
>>> # get datasets where the substring 'raw' is present
>>> raw_data = catalog.filter(name_regex='raw')
>>> # get datasets where names start with 'model_' (precompiled regex)
>>> model_datasets = catalog.filter(name_regex=re.compile('^model_'))
>>> # get datasets of a specific type using type_regex
>>> csv_datasets = catalog.filter(type_regex='pandas.excel_dataset.ExcelDataset')
>>> # get datasets where names contain 'train' and type matches 'CSV' in the path
>>> catalog.filter(name_regex="train", type_regex="CSV")
>>> # get datasets where names include 'data' and are of a specific type
>>> from kedro_datasets.pandas import SQLQueryDataset
>>> catalog.filter(name_regex="data", by_type=SQLQueryDataset)
>>> # get datasets where names include 'data' and are of multiple specific types
>>> from kedro.io import MemoryDataset
>>> catalog.filter(name_regex="data", by_type=[MemoryDataset, SQLQueryDataset])
"""
filtered = self.keys()

# Apply name filter if specified
if name_regex:
filtered = [
ds_name for ds_name in filtered if re.search(name_regex, ds_name)
]

# Apply type filters if specified
by_type_set = set()
if by_type:
if not isinstance(by_type, list):
by_type = [by_type]
for _type in by_type:
by_type_set.add(f"{_type.__module__}.{_type.__qualname__}")

if by_type_set or type_regex:
filtered_types = []
for ds_name in filtered:
# Retrieve the dataset type
if ds_name in self._lazy_datasets:
str_type = str(self._lazy_datasets[ds_name])
else:
class_type = type(self.__datasets[ds_name])
str_type = f"{class_type.__module__}.{class_type.__qualname__}"
# Match against type_regex and apply by_type filtering
if (not type_regex or re.search(type_regex, str_type)) and (
not by_type_set or str_type in by_type_set
):
filtered_types.append(ds_name)

return filtered_types

return filtered

def list(
self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0
) -> List[str]: # noqa: UP006
# TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917
# TODO: make regex_search mandatory argument as we have catalog.keys() for listing all the datasets.
# TODO: remove when removing old catalog
"""List all dataset names registered in the catalog, optionally filtered by a regex pattern.
If a regex pattern is provided, only dataset names matching the pattern will be returned.
Expand Down Expand Up @@ -598,6 +675,7 @@ def list(
raise SyntaxError(
f"Invalid regular expression provided: '{regex_search}'"
) from exc

return [ds_name for ds_name in self.__iter__() if pattern.search(ds_name)]

def save(self, name: str, data: Any) -> None:
Expand Down
67 changes: 63 additions & 4 deletions tests/io/test_kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def conflicting_feed_dict():

@pytest.fixture
def multi_catalog():
csv = CSVDataset(filepath="abc.csv")
csv_1 = CSVDataset(filepath="abc.csv")
csv_2 = CSVDataset(filepath="def.csv")
parq = ParquetDataset(filepath="xyz.parq")
return KedroDataCatalog({"abc": csv, "xyz": parq})
return KedroDataCatalog({"abc": csv_1, "def": csv_2, "xyz": parq})


@pytest.fixture
Expand Down Expand Up @@ -159,8 +160,9 @@ def test_multi_catalog_list(self, multi_catalog):
[
("^a", ["abc"]),
("a|x", ["abc", "xyz"]),
("^(?!(a|x))", []),
("def", []),
("^(?!(a|d|x))", []),
("def", ["def"]),
("ghi", []),
("", []),
],
)
Expand All @@ -175,6 +177,63 @@ def test_multi_catalog_list_bad_regex(self, multi_catalog):
with pytest.raises(SyntaxError, match=pattern):
multi_catalog.list("((")

@pytest.mark.parametrize(
    "name_regex,type_regex,expected",
    [
        (re.compile("^a"), None, ["abc"]),
        (re.compile("^A"), None, []),
        (re.compile("^A", flags=re.IGNORECASE), None, ["abc"]),
        ("a|x", None, ["abc", "xyz"]),
        ("a|d|x", None, ["abc", "def", "xyz"]),
        ("a|d|x", "CSVDataset", ["abc", "def"]),
        ("a|d|x", "kedro_datasets", ["abc", "def", "xyz"]),
        (None, "ParquetDataset", ["xyz"]),
        ("^(?!(a|d|x))", None, []),
        ("def", None, ["def"]),
        (None, None, ["abc", "def", "xyz"]),
        ("a|d|x", "no_such_dataset", []),
    ],
)
def test_catalog_filter_regex(
    self, multi_catalog, name_regex, type_regex, expected
):
    """Filtering materialized datasets by name and/or type regex yields the expected names."""
    result = multi_catalog.filter(name_regex=name_regex, type_regex=type_regex)
    assert result == expected

@pytest.mark.parametrize(
    "name_regex,type_regex,by_type,expected",
    [
        ("b|m", None, None, ["boats", "materialized"]),
        (None, None, None, ["boats", "cars", "materialized"]),
        (None, "CSVDataset", None, ["boats", "cars"]),
        (None, "ParquetDataset", None, ["materialized"]),
        ("b|c", "ParquetDataset", None, []),
        (None, None, ParquetDataset, ["materialized"]),
        (
            None,
            None,
            [CSVDataset, ParquetDataset],
            ["boats", "cars", "materialized"],
        ),
        (None, "ParquetDataset", [CSVDataset, ParquetDataset], ["materialized"]),
        ("b|m", None, [CSVDataset, ParquetDataset], ["boats", "materialized"]),
    ],
)
def test_from_config_catalog_filter_regex(
    self, data_catalog_from_config, name_regex, type_regex, by_type, expected
):
    """Filtering mixes of lazy and materialized datasets respects all three filters."""
    # Add a materialized dataset alongside the lazy ones loaded from config.
    data_catalog_from_config["materialized"] = ParquetDataset(filepath="xyz.parq")
    filtered_names = data_catalog_from_config.filter(
        name_regex=name_regex, type_regex=type_regex, by_type=by_type
    )
    assert filtered_names == expected

def test_eq(self, multi_catalog, data_catalog):
assert multi_catalog == multi_catalog.shallow_copy()
assert multi_catalog != data_catalog
Expand Down

0 comments on commit 06d5a69

Please sign in to comment.