Skip to content

Commit

Permalink
Implement catalog filter for KedroDataCatalog (#4449)
Browse files Browse the repository at this point in the history
* Fixed catalog list for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Replaced solution

Signed-off-by: Elena Khaustova <[email protected]>

* Updated solution and made it on the catalog side

Signed-off-by: Elena Khaustova <[email protected]>

* Updated internal datasets access for KedroDataCatalog

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed __getattribute__

Signed-off-by: Elena Khaustova <[email protected]>

* Added test template

Signed-off-by: Elena Khaustova <[email protected]>

* Updated solution and test

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Implemented a draft of filtering method

Signed-off-by: Elena Khaustova <[email protected]>

* Updated filter

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed lint

Signed-off-by: Elena Khaustova <[email protected]>

* Updated old list method

Signed-off-by: Elena Khaustova <[email protected]>

* Implemented tests for new filter

Signed-off-by: Elena Khaustova <[email protected]>

* Added tests for lazy datasets

Signed-off-by: Elena Khaustova <[email protected]>

* Added docstrings and usage examples

Signed-off-by: Elena Khaustova <[email protected]>

* Updated examples in the docstrings

Signed-off-by: Elena Khaustova <[email protected]>

* Updated lazy dataset representation

Signed-off-by: Elena Khaustova <[email protected]>

* Updated unit tests

Signed-off-by: Elena Khaustova <[email protected]>

* Updated tests to reach coverage

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated _LazyDataset representation

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Added default value to the docstrings

Signed-off-by: Elena Khaustova <[email protected]>

* Renamed _compile_pattern to _compile_regex_pattern

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated secrets baseline

Signed-off-by: Elena Khaustova <[email protected]>

* Added by_type filter

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed bugs found when testing

Signed-off-by: Elena Khaustova <[email protected]>

* Updated tests

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

* Updated docstring

Signed-off-by: Elena Khaustova <[email protected]>

* Updated release notes

Signed-off-by: Elena Khaustova <[email protected]>

* Updated function to accept compiled patterns

Signed-off-by: Elena Khaustova <[email protected]>

* Updated unit tests

Signed-off-by: Elena Khaustova <[email protected]>

* Removed bad regex test

Signed-off-by: Elena Khaustova <[email protected]>

* Fixed linter

Signed-off-by: Elena Khaustova <[email protected]>

---------

Signed-off-by: Elena Khaustova <[email protected]>
  • Loading branch information
ElenaKhaustova authored Feb 7, 2025
1 parent a47677d commit 06d5a69
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -215,9 +215,9 @@
"filename": "tests/io/test_kedro_data_catalog.py",
"hashed_secret": "15dd2c9ccec914f1470b4dccb45789844e49cf70",
"is_verified": false,
"line_number": 501
"line_number": 560
}
]
},
"generated_at": "2025-01-28T14:51:20Z"
"generated_at": "2025-02-06T19:46:16Z"
}
2 changes: 2 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Upcoming Release

## Major features and improvements
* Added `KedroDataCatalog.filter()` to filter datasets by name and type.

## Bug fixes and other changes
* Updated `_LazyDataset` representation when printing `KedroDataCatalog`.

Expand Down
82 changes: 80 additions & 2 deletions kedro/io/kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,11 +551,88 @@ def add(
)
self.__setitem__(ds_name, dataset)

def filter(
self,
name_regex: re.Pattern[str] | str | None = None,
type_regex: re.Pattern[str] | str | None = None,
by_type: type | list[type] | None = None,
) -> List[str]: # noqa: UP006
"""Filter dataset names registered in the catalog based on name and/or type.
This method allows filtering datasets by their names and/or types. Regular expressions
should be precompiled before passing them to `name_regex` or `type_regex`, but plain
strings are also supported.
Args:
name_regex: Optional compiled regex pattern or string to filter dataset names.
type_regex: Optional compiled regex pattern or string to filter dataset types.
The provided regex is matched against the full dataset type path, for example:
`kedro_datasets.pandas.parquet_dataset.ParquetDataset`.
by_type: Optional dataset type(s) to filter by. This performs an instance type check
rather than a regex match. It can be a single dataset type or a list of types.
Returns:
A list of dataset names that match the filtering criteria.
Example:
::
>>> import re
>>> catalog = KedroDataCatalog()
>>> # get datasets where the substring 'raw' is present
>>> raw_data = catalog.filter(name_regex='raw')
>>> # get datasets where names start with 'model_' (precompiled regex)
>>> model_datasets = catalog.filter(name_regex=re.compile('^model_'))
>>> # get datasets of a specific type using type_regex
>>> csv_datasets = catalog.filter(type_regex='pandas.excel_dataset.ExcelDataset')
>>> # get datasets where names contain 'train' and type matches 'CSV' in the path
>>> catalog.filter(name_regex="train", type_regex="CSV")
>>> # get datasets where names include 'data' and are of a specific type
>>> from kedro_datasets.pandas import SQLQueryDataset
>>> catalog.filter(name_regex="data", by_type=SQLQueryDataset)
>>> # get datasets where names include 'data' and are of multiple specific types
>>> from kedro.io import MemoryDataset
>>> catalog.filter(name_regex="data", by_type=[MemoryDataset, SQLQueryDataset])
"""
filtered = self.keys()

# Apply name filter if specified
if name_regex:
filtered = [
ds_name for ds_name in filtered if re.search(name_regex, ds_name)
]

# Apply type filters if specified
by_type_set = set()
if by_type:
if not isinstance(by_type, list):
by_type = [by_type]
for _type in by_type:
by_type_set.add(f"{_type.__module__}.{_type.__qualname__}")

if by_type_set or type_regex:
filtered_types = []
for ds_name in filtered:
# Retrieve the dataset type
if ds_name in self._lazy_datasets:
str_type = str(self._lazy_datasets[ds_name])
else:
class_type = type(self.__datasets[ds_name])
str_type = f"{class_type.__module__}.{class_type.__qualname__}"
# Match against type_regex and apply by_type filtering
if (not type_regex or re.search(type_regex, str_type)) and (
not by_type_set or str_type in by_type_set
):
filtered_types.append(ds_name)

return filtered_types

return filtered

def list(
self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0
) -> List[str]: # noqa: UP006
# TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917
# TODO: make regex_search mandatory argument as we have catalog.keys() for listing all the datasets.
# TODO: remove when removing old catalog
"""List all dataset names registered in the catalog, optionally filtered by a regex pattern.
If a regex pattern is provided, only dataset names matching the pattern will be returned.
Expand Down Expand Up @@ -598,6 +675,7 @@ def list(
raise SyntaxError(
f"Invalid regular expression provided: '{regex_search}'"
) from exc

return [ds_name for ds_name in self.__iter__() if pattern.search(ds_name)]

def save(self, name: str, data: Any) -> None:
Expand Down
67 changes: 63 additions & 4 deletions tests/io/test_kedro_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def conflicting_feed_dict():

@pytest.fixture
def multi_catalog():
csv = CSVDataset(filepath="abc.csv")
csv_1 = CSVDataset(filepath="abc.csv")
csv_2 = CSVDataset(filepath="def.csv")
parq = ParquetDataset(filepath="xyz.parq")
return KedroDataCatalog({"abc": csv, "xyz": parq})
return KedroDataCatalog({"abc": csv_1, "def": csv_2, "xyz": parq})


@pytest.fixture
Expand Down Expand Up @@ -159,8 +160,9 @@ def test_multi_catalog_list(self, multi_catalog):
[
("^a", ["abc"]),
("a|x", ["abc", "xyz"]),
("^(?!(a|x))", []),
("def", []),
("^(?!(a|d|x))", []),
("def", ["def"]),
("ghi", []),
("", []),
],
)
Expand All @@ -175,6 +177,63 @@ def test_multi_catalog_list_bad_regex(self, multi_catalog):
with pytest.raises(SyntaxError, match=pattern):
multi_catalog.list("((")

@pytest.mark.parametrize(
    "name_regex,type_regex,expected",
    [
        (re.compile("^a"), None, ["abc"]),
        (re.compile("^A"), None, []),
        (re.compile("^A", flags=re.IGNORECASE), None, ["abc"]),
        ("a|x", None, ["abc", "xyz"]),
        ("a|d|x", None, ["abc", "def", "xyz"]),
        ("a|d|x", "CSVDataset", ["abc", "def"]),
        ("a|d|x", "kedro_datasets", ["abc", "def", "xyz"]),
        (None, "ParquetDataset", ["xyz"]),
        ("^(?!(a|d|x))", None, []),
        ("def", None, ["def"]),
        (None, None, ["abc", "def", "xyz"]),
        ("a|d|x", "no_such_dataset", []),
    ],
)
def test_catalog_filter_regex(
    self, multi_catalog, name_regex, type_regex, expected
):
    """Filtering materialized datasets by name and/or type regex yields the expected names."""
    result = multi_catalog.filter(name_regex=name_regex, type_regex=type_regex)
    assert result == expected

@pytest.mark.parametrize(
    "name_regex,type_regex,by_type,expected",
    [
        ("b|m", None, None, ["boats", "materialized"]),
        (None, None, None, ["boats", "cars", "materialized"]),
        (None, "CSVDataset", None, ["boats", "cars"]),
        (None, "ParquetDataset", None, ["materialized"]),
        ("b|c", "ParquetDataset", None, []),
        (None, None, ParquetDataset, ["materialized"]),
        (
            None,
            None,
            [CSVDataset, ParquetDataset],
            ["boats", "cars", "materialized"],
        ),
        (None, "ParquetDataset", [CSVDataset, ParquetDataset], ["materialized"]),
        ("b|m", None, [CSVDataset, ParquetDataset], ["boats", "materialized"]),
    ],
)
def test_from_config_catalog_filter_regex(
    self, data_catalog_from_config, name_regex, type_regex, by_type, expected
):
    """Filtering mixes of lazy and materialized datasets respects all three filters."""
    # Add a materialized dataset alongside the lazy ones loaded from config.
    data_catalog_from_config["materialized"] = ParquetDataset(filepath="xyz.parq")
    filtered_names = data_catalog_from_config.filter(
        name_regex=name_regex, type_regex=type_regex, by_type=by_type
    )
    assert filtered_names == expected

def test_eq(self, multi_catalog, data_catalog):
assert multi_catalog == multi_catalog.shallow_copy()
assert multi_catalog != data_catalog
Expand Down

0 comments on commit 06d5a69

Please sign in to comment.