From 1f3289486e266a200094c4064456217fc8827e83 Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 11:25:56 +0100 Subject: [PATCH 1/6] update __init__.py --- lazy_dataset/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lazy_dataset/__init__.py b/lazy_dataset/__init__.py index d7fecab..a0ca21b 100644 --- a/lazy_dataset/__init__.py +++ b/lazy_dataset/__init__.py @@ -1,3 +1,4 @@ +from . import core from .core import ( new, concatenate, @@ -7,6 +8,7 @@ from_dict, from_list, from_dataset, + from_file, FilterException, ) -from.core import _zip as zip +from .core import _zip as zip From d7fe68ba24156d5a57ba1e342d08df46e6aaaf99 Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 11:27:25 +0100 Subject: [PATCH 2/6] fix hiding of bug in from_dataset --- lazy_dataset/core.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/lazy_dataset/core.py b/lazy_dataset/core.py index b9ab2d2..4bfd3db 100644 --- a/lazy_dataset/core.py +++ b/lazy_dataset/core.py @@ -201,6 +201,21 @@ def from_dataset( >>> ds = from_dataset(new({'a': 1, 'b': 2, 'c': 3, 'd': 4}).filter(lambda x: x%2)) >>> dict(ds) {'a': 1, 'c': 3} + + # Works with concatenated datasets and duplicated keys + >>> ds = new({'a': 1, 'b': 2}) + >>> ds = concatenate(ds, ds) + >>> ds + DictDataset(len=2) + MapDataset(_pickle.loads) + DictDataset(len=2) + MapDataset(_pickle.loads) + ConcatenateDataset() + >>> from_dataset(ds) + ListDataset(len=4) + MapDataset(_pickle.loads) + >>> list(ds.items()) + """ try: items = list(examples.items()) @@ -208,7 +223,9 @@ def from_dataset( return from_list(list(examples), immutable_warranty=immutable_warranty, name=name) else: - return from_dict(dict(items), + new = dict(items) + assert len(new) == len(items), f'{len(new)} != {len(items)}\nYou found a bug!\n{examples!r}' + return from_dict(new, immutable_warranty=immutable_warranty, name=name) From 073f4c1da6325205b5ce2666a266bbbb6d2ee43a 
Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 11:29:16 +0100 Subject: [PATCH 3/6] fix ConcatenateDataset to yield ItemsNotDefined, if some keys are duplicated --- lazy_dataset/core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lazy_dataset/core.py b/lazy_dataset/core.py index 4bfd3db..4716095 100644 --- a/lazy_dataset/core.py +++ b/lazy_dataset/core.py @@ -2703,6 +2703,12 @@ def ordered(self) -> bool: return all(ds.ordered for ds in self.input_datasets) def __iter__(self, with_key=False): + if with_key: + try: + self.keys() + except AssertionError: + raise _ItemsNotDefined(self.__class__.__name__) from None + for input_dataset in self.input_datasets: if with_key: iterable = input_dataset.__iter__(with_key=True) From 1c99a4b24bd8b9dad177cd8f908e736e5c8780a0 Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 12:26:03 +0100 Subject: [PATCH 4/6] improve exceptions and remove wip code to test the exception --- lazy_dataset/core.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lazy_dataset/core.py b/lazy_dataset/core.py index 4716095..c916037 100644 --- a/lazy_dataset/core.py +++ b/lazy_dataset/core.py @@ -214,7 +214,6 @@ def from_dataset( >>> from_dataset(ds) ListDataset(len=4) MapDataset(_pickle.loads) - >>> list(ds.items()) """ try: @@ -434,7 +433,10 @@ def copy(self, freeze: bool = False) -> 'Dataset': Returns: A copy of this dataset """ - raise NotImplementedError + raise NotImplementedError( + f'copy is not implemented for {self.__class__}.\n' + f'self: \n{repr(self)}' + ) def __iter__(self, with_key=False): if with_key: @@ -2706,8 +2708,8 @@ def __iter__(self, with_key=False): if with_key: try: self.keys() - except AssertionError: - raise _ItemsNotDefined(self.__class__.__name__) from None + except AssertionError as e: + raise _ItemsNotDefined(self.__class__.__name__) from e for input_dataset in self.input_datasets: if with_key: @@ -2996,6 +2998,7 @@ def __init__(self, 
*input_datasets): ] raise AssertionError( f'Expect that all input_datasets have the same keys. ' + f'Missing: {lengths} of {len(keys)}\n' f'Missing keys: ' f'{missing_keys}\n{self.input_datasets}' ) From 1ffb9b47fd8e464a17ae082783178291e66f17bd Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 13:13:17 +0100 Subject: [PATCH 5/6] add back support of duplicated keys in ConcatenateDataset and change from_dataset --- lazy_dataset/core.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lazy_dataset/core.py b/lazy_dataset/core.py index c916037..549159a 100644 --- a/lazy_dataset/core.py +++ b/lazy_dataset/core.py @@ -223,9 +223,13 @@ def from_dataset( immutable_warranty=immutable_warranty, name=name) else: new = dict(items) - assert len(new) == len(items), f'{len(new)} != {len(items)}\nYou found a bug!\n{examples!r}' - return from_dict(new, - immutable_warranty=immutable_warranty, name=name) + if len(new) == len(items): + return from_dict(new, + immutable_warranty=immutable_warranty, name=name) + else: + # Duplicates in keys + return from_list(list(map(operator.itemgetter(1), items)), + immutable_warranty=immutable_warranty, name=name) def concatenate(*datasets): @@ -2705,12 +2709,6 @@ def ordered(self) -> bool: return all(ds.ordered for ds in self.input_datasets) def __iter__(self, with_key=False): - if with_key: - try: - self.keys() - except AssertionError as e: - raise _ItemsNotDefined(self.__class__.__name__) from e - for input_dataset in self.input_datasets: if with_key: iterable = input_dataset.__iter__(with_key=True) @@ -3093,8 +3091,8 @@ class ItemsDataset(Dataset): >>> ds_nokeys_rng = ds_plain.shuffle(True, rng=np.random.RandomState(0)) # No keys >>> list(ds_nokeys.map(lambda x: x + 10).items()) [('a', 11), ('b', 12), ('c', 13)] - >>> list(ds_nokeys.concatenate(ds_plain).items()) - [('a', 1), ('b', 2), ('c', 3), ('a', 1), ('b', 2), ('c', 3)] + >>> list(ds_nokeys.map(lambda x: x + 
10).concatenate(ds_plain).filter(lambda x: x in [1, 12, 13]).items()) + [('b', 12), ('c', 13), ('a', 1)] >>> list(ds_nokeys_rng.intersperse(ds_nokeys_rng).items()) [('c', 3), ('a', 1), ('c', 3), ('c', 3), ('b', 2), ('b', 2)] >>> list(ds_plain.key_zip(ds_plain).items()) From 7d7e8d8539cd607f88f0900fa3d98ca7b5cf89ea Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Tue, 3 Dec 2024 13:16:14 +0100 Subject: [PATCH 6/6] add py311 and py312 to tests --- .github/workflows/run_python_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_python_tests.yml b/.github/workflows/run_python_tests.yml index 5f985ee..e8239cc 100644 --- a/.github/workflows/run_python_tests.yml +++ b/.github/workflows/run_python_tests.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2