From 9d7a4276fbf392b4c2ca31701cb54f59e6450d5e Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 27 Jul 2022 15:59:23 -0700 Subject: [PATCH 01/18] Add key validation methods (the methods themselves still gotta be revised) --- sdv/metadata/single_table.py | 103 +++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index ca3bec9f6..a7fe4a670 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -250,6 +250,22 @@ def _validate_datatype(id): """Check whether id is a string or a tuple of strings.""" return isinstance(id, str) or isinstance(id, tuple) and all(isinstance(i, str) for i in id) + def _validate_primary_key(self, id): + if not self._validate_datatype(id): + raise ValueError("'primary_key' must be a string or tuple of strings.") + + invalid_ids = {} + if isinstance(id, str) and id not in self._columns: + invalid_ids = {id} + elif isinstance(id, tuple): + invalid_ids = {invalid_id for invalid_id in id if invalid_id in self._columns} + + if invalid_ids: + raise ValueError( + f'Unknown primary key values {invalid_ids}.' + ' Keys should be columns that exist in the table.' + ) + def set_primary_key(self, id): """Set the metadata primary key. @@ -257,8 +273,7 @@ def set_primary_key(self, id): id (str, tuple): Name (or tuple of names) of the primary key column(s). """ - if not self._validate_datatype(id): - raise ValueError("'primary_key' must be a string or tuple of strings.") + self._validate_primary_key(id) if self._metadata['primary_key'] is not None: warnings.warn( @@ -268,6 +283,25 @@ def set_primary_key(self, id): self._metadata['primary_key'] = id + def _validate_alternate_keys(self, ids): + if not isinstance(ids, list) or not all(self._validate_datatype(id) for id in ids): + raise ValueError( + "'alternate_keys' must be a list of strings or a list of tuples of strings." + ) + + invalid_ids = {} + for id in ids: + if isinstance(id, str) and id not in self._columns: + invalid_ids.add(id) + elif isinstance(id, tuple): + invalid_ids.update({invalid_id for invalid_id in id if invalid_id in self._columns}) + + if invalid_ids: + raise ValueError( + f'Unknown alternate key values {invalid_ids}.' + ' Keys should be columns that exist in the table.' + ) + def set_alternate_keys(self, ids): """Set the metadata alternate keys. @@ -275,13 +309,25 @@ def set_alternate_keys(self, ids): ids (list[str], list[tuple]): List of names (or tuple of names) of the alternate key columns. """ - if not isinstance(ids, list) or not all(self._validate_datatype(id) for id in ids): + self._validate_alternate_keys(ids) + self._metadata['alternate_keys'] = ids + + def _validate_sequence_key(self, id): + if not self._validate_datatype(id): + raise ValueError("'sequence_key' must be a string or tuple of strings.") + + invalid_ids = {} + if isinstance(id, str) and id not in self._columns: + invalid_ids = {id} + elif isinstance(id, tuple): + invalid_ids = {invalid_id for invalid_id in id if invalid_id in self._columns} + + if invalid_ids: raise ValueError( - "'alternate_keys' must be a list of strings or a list of tuples of strings." + f'Unknown sequence key values {invalid_ids}.' + ' Keys should be columns that exist in the table.' ) - self._metadata['alternate_keys'] = ids - def set_sequence_key(self, id): """Set the metadata sequence key. @@ -289,8 +335,7 @@ def set_sequence_key(self, id): id (str, tuple): Name (or tuple of names) of the sequence key column(s). """ - if not self._validate_datatype(id): - raise ValueError("'sequence_key' must be a string or tuple of strings.") + self._validate_sequence_key(id) if self._metadata['sequence_key'] is not None: warnings.warn( @@ -300,6 +345,16 @@ def set_sequence_key(self, id): self._metadata['sequence_key'] = id + def _validate_sequence_index(self, column_name): + if not isinstance(column_name, str): + raise ValueError("'sequence_index' must be a string.") + + if column_name not in self._columns: + raise ValueError( + f'Unknown sequence key value {column_name}.' + ' Keys should be columns that exist in the table.' + ) + def set_sequence_index(self, column_name): """Set the metadata sequence index. @@ -307,11 +362,37 @@ def set_sequence_index(self, column_name): column_name (str): Name of the sequence index column. """ - if not isinstance(column_name, str): - raise ValueError("'sequence_index' must be a string.") - + self._validate_sequence_index(column_name) self._metadata['sequence_index'] = column_name + def _validate_sequence_key_not_sequence_index(self): + """Check that ``_sequence_index`` and ``_sequence_key`` don't overlap.""" + if ( + (isinstance(self._sequence_key, tuple) and self._sequence_index in self._sequence_key) + or self._sequence_index == self._sequence_key + ): + raise ValueError( + f'sequence_index and sequence_key have the same value {self._sequence_index}.' + ' These columns must be different.' + ) + + def validate(self): + """Validate the metadata. + + Raises: + - ``InvalidMetadataError`` if the metadata is invalid. + """ + # Validate keys + self._validate_primary_key(self._primary_key) + self._validate_alternate_keys(self._alternate_keys) + self._validate_sequence_key(self._sequence_key) + self._validate_sequence_index(self._sequence_index) + self._validate_sequence_key_not_sequence_index() + + # Validate constraints + + # Validate columns + def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" metadata = {} From b6da51431294714c5c31150b61ee6012c57663c4 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Thu, 28 Jul 2022 13:29:54 -0700 Subject: [PATCH 02/18] Combined errors implementation --- sdv/metadata/single_table.py | 124 +++++++++++------------ tests/unit/metadata/test_single_table.py | 16 ++- 2 files changed, 75 insertions(+), 65 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index a7fe4a670..80a768c9a 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -8,6 +8,7 @@ from pathlib import Path import pandas as pd +from sdv.constraints.errors import MultipleConstraintsErrors from sdv.constraints import Constraint from sdv.metadata.errors import MetadataError @@ -107,6 +108,10 @@ def _validate_text(column_name, **kwargs): def __init__(self): self._columns = {} self._constraints = [] + self._primary_key = None + self._alternate_keys = [] + self._sequence_key = None + self._sequence_index = None self._version = self.SCHEMA_VERSION self._metadata = { 'columns': self._columns, @@ -250,22 +255,23 @@ def _validate_datatype(id): """Check whether id is a string or a tuple of strings.""" return isinstance(id, str) or isinstance(id, tuple) and all(isinstance(i, str) for i in id) - def _validate_primary_key(self, id): + def _validate_key(self, id, key_type): + """Validate the primary and sequence keys.""" + errors = [] if not self._validate_datatype(id): - raise ValueError("'primary_key' must be a string or tuple of strings.") - - invalid_ids = {} - if isinstance(id, str) and id not in self._columns: - invalid_ids = {id} - elif isinstance(id, tuple): - invalid_ids = {invalid_id for invalid_id in id if invalid_id in self._columns} + errors.append(f"'{key_type}_key' must be a string or tuple of strings.") + keys = {id} if isinstance(id, str) else set(id) + invalid_ids = keys - set(self._columns) if invalid_ids: - raise ValueError( - f'Unknown primary key values {invalid_ids}.' + errors.append( + f'Unknown {key_type} key values {invalid_ids}.' ' Keys should be columns that exist in the table.' ) + if errors: + raise MultipleConstraintsErrors(errors) + def set_primary_key(self, id): """Set the metadata primary key. @@ -273,7 +279,7 @@ def set_primary_key(self, id): id (str, tuple): Name (or tuple of names) of the primary key column(s). """ - self._validate_primary_key(id) + self._validate_key(id, 'primary') if self._metadata['primary_key'] is not None: warnings.warn( @@ -283,25 +289,44 @@ def set_primary_key(self, id): self._metadata['primary_key'] = id + def set_sequence_key(self, id): + """Set the metadata sequence key. + + Args: + id (str, tuple): + Name (or tuple of names) of the sequence key column(s). + """ + self._validate_key(id, 'sequence') + + if self._metadata['sequence_key'] is not None: + warnings.warn( + f"There is an existing sequence key {self._metadata['sequence_key']}." + ' This key will be removed.' + ) + + self._metadata['sequence_key'] = id + def _validate_alternate_keys(self, ids): + errors = [] if not isinstance(ids, list) or not all(self._validate_datatype(id) for id in ids): - raise ValueError( + errors.append( "'alternate_keys' must be a list of strings or a list of tuples of strings." ) - - invalid_ids = {} + + keys = set() for id in ids: - if isinstance(id, str) and id not in self._columns: - invalid_ids.add(id) - elif isinstance(id, tuple): - invalid_ids.update({invalid_id for invalid_id in id if invalid_id in self._columns}) + keys.update({id} if isinstance(id, str) else set(id)) + invalid_ids = keys - set(self._columns) if invalid_ids: - raise ValueError( + errors.append( f'Unknown alternate key values {invalid_ids}.' ' Keys should be columns that exist in the table.' ) + if errors: + raise MultipleConstraintsErrors(errors) + def set_alternate_keys(self, ids): """Set the metadata alternate keys. @@ -312,49 +337,20 @@ def set_alternate_keys(self, ids): self._validate_alternate_keys(ids) self._metadata['alternate_keys'] = ids - def _validate_sequence_key(self, id): - if not self._validate_datatype(id): - raise ValueError("'sequence_key' must be a string or tuple of strings.") - - invalid_ids = {} - if isinstance(id, str) and id not in self._columns: - invalid_ids = {id} - elif isinstance(id, tuple): - invalid_ids = {invalid_id for invalid_id in id if invalid_id in self._columns} - - if invalid_ids: - raise ValueError( - f'Unknown sequence key values {invalid_ids}.' - ' Keys should be columns that exist in the table.' - ) - - def set_sequence_key(self, id): - """Set the metadata sequence key. - - Args: - id (str, tuple): - Name (or tuple of names) of the sequence key column(s). - """ - self._validate_sequence_key(id) - - if self._metadata['sequence_key'] is not None: - warnings.warn( - f"There is an existing sequence key {self._metadata['sequence_key']}." - ' This key will be removed.' - ) - - self._metadata['sequence_key'] = id - def _validate_sequence_index(self, column_name): + errors = [] if not isinstance(column_name, str): - raise ValueError("'sequence_index' must be a string.") + errors.append("'sequence_index' must be a string.") if column_name not in self._columns: - raise ValueError( + errors.append( f'Unknown sequence key value {column_name}.' ' Keys should be columns that exist in the table.' ) + if errors: + raise MultipleConstraintsErrors(errors) + def set_sequence_index(self, column_name): """Set the metadata sequence index. @@ -365,12 +361,11 @@ def set_sequence_index(self, column_name): self._validate_sequence_index(column_name) self._metadata['sequence_index'] = column_name - def _validate_sequence_key_not_sequence_index(self): + def _validate_sequence_index_not_in_sequence_key(self): """Check that ``_sequence_index`` and ``_sequence_key`` don't overlap.""" - if ( - (isinstance(self._sequence_key, tuple) and self._sequence_index in self._sequence_key) - or self._sequence_index == self._sequence_key - ): + sk = self._sequence_key + sequence_key = {sk} if isinstance(sk, str) else set(sk) + if self._sequence_index in sequence_key: raise ValueError( f'sequence_index and sequence_key have the same value {self._sequence_index}.' ' These columns must be different.' @@ -383,15 +378,20 @@ def validate(self): - ``InvalidMetadataError`` if the metadata is invalid. """ # Validate keys - self._validate_primary_key(self._primary_key) + self._validate_key(self._primary_key, 'primary') self._validate_alternate_keys(self._alternate_keys) - self._validate_sequence_key(self._sequence_key) + self._validate_key(self._sequence_key, 'sequence') self._validate_sequence_index(self._sequence_index) - self._validate_sequence_key_not_sequence_index() + self._validate_sequence_index_not_in_sequence_key() # Validate constraints + for constraint, args in self._constraints.items(): + constraint._validate_metadata(self, **args) # Validate columns + for column, kwargs in self._columns: + sdtype = kwargs.get('sdtype') + self._validate_column(column, sdtype, **kwargs) def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 221bc4252..40588ddfb 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -12,6 +12,8 @@ from sdv.metadata.errors import MetadataError from sdv.metadata.single_table import SingleTableMetadata +from sdv.constraints.errors import MultipleConstraintsErrors + class TestSingleTableMetadata: @@ -863,6 +865,9 @@ def test__validate_dataype_invalid_tuple(self): def test_set_primary_key_validation(self): """Test that ``set_primary_key`` crashes for invalid arguments. + Setup: + - A ``SingleTableMetadata`` instance with ``_columns`` set. + Input: - A tuple with non-string values. @@ -871,11 +876,16 @@ def test_set_primary_key_validation(self): """ # Setup instance = SingleTableMetadata() + instance._columns = {'a', 'd'} - err_msg = "'primary_key' must be a string or tuple of strings." + err_msg = ( + "'primary_key' must be a string or tuple of strings." + "\n\nUnknown primary key values {'b', 'c'}." + ' Keys should be columns that exist in the table.' + ) # Run / Assert - with pytest.raises(ValueError, match=err_msg): - instance.set_primary_key(('1', 2, '3')) + with pytest.raises(MultipleConstraintsErrors, match=err_msg): + instance.set_primary_key(('a', 'b', 'd', 'c')) def test_set_primary_key(self): """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value.""" From d0db252270249cc961322e69e71949643e7c5a61 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Fri, 29 Jul 2022 09:18:06 -0700 Subject: [PATCH 03/18] Finish constraints --- sdv/metadata/single_table.py | 28 ++-- tests/unit/metadata/test_single_table.py | 165 +++++++++++++++++++---- 2 files changed, 147 insertions(+), 46 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 80a768c9a..4496ff91f 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -257,21 +257,17 @@ def _validate_datatype(id): def _validate_key(self, id, key_type): """Validate the primary and sequence keys.""" - errors = [] if not self._validate_datatype(id): - errors.append(f"'{key_type}_key' must be a string or tuple of strings.") + raise ValueError(f"'{key_type}_key' must be a string or tuple of strings.") keys = {id} if isinstance(id, str) else set(id) invalid_ids = keys - set(self._columns) if invalid_ids: - errors.append( + raise ValueError( f'Unknown {key_type} key values {invalid_ids}.' ' Keys should be columns that exist in the table.' ) - if errors: - raise MultipleConstraintsErrors(errors) - def set_primary_key(self, id): """Set the metadata primary key. @@ -307,9 +303,8 @@ def set_sequence_key(self, id): self._metadata['sequence_key'] = id def _validate_alternate_keys(self, ids): - errors = [] if not isinstance(ids, list) or not all(self._validate_datatype(id) for id in ids): - errors.append( + raise ValueError( "'alternate_keys' must be a list of strings or a list of tuples of strings." ) @@ -319,14 +314,11 @@ def _validate_alternate_keys(self, ids): invalid_ids = keys - set(self._columns) if invalid_ids: - errors.append( + raise ValueError( f'Unknown alternate key values {invalid_ids}.' ' Keys should be columns that exist in the table.' ) - if errors: - raise MultipleConstraintsErrors(errors) - def set_alternate_keys(self, ids): """Set the metadata alternate keys. @@ -338,19 +330,16 @@ def set_alternate_keys(self, ids): self._metadata['alternate_keys'] = ids def _validate_sequence_index(self, column_name): - errors = [] if not isinstance(column_name, str): - errors.append("'sequence_index' must be a string.") + raise ValueError("'sequence_index' must be a string.") if column_name not in self._columns: - errors.append( + column_name = {column_name} + raise ValueError( f'Unknown sequence key value {column_name}.' ' Keys should be columns that exist in the table.' ) - if errors: - raise MultipleConstraintsErrors(errors) - def set_sequence_index(self, column_name): """Set the metadata sequence index. @@ -366,8 +355,9 @@ def _validate_sequence_index_not_in_sequence_key(self): sk = self._sequence_key sequence_key = {sk} if isinstance(sk, str) else set(sk) if self._sequence_index in sequence_key: + index = {self._sequence_index} raise ValueError( - f'sequence_index and sequence_key have the same value {self._sequence_index}.' + f"'sequence_index' and 'sequence_key' have the same value {index}." ' These columns must be different.' ) diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 40588ddfb..4d86e0db0 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -12,8 +12,6 @@ from sdv.metadata.errors import MetadataError from sdv.metadata.single_table import SingleTableMetadata -from sdv.constraints.errors import MultipleConstraintsErrors - class TestSingleTableMetadata: @@ -862,14 +860,33 @@ def test__validate_dataype_invalid_tuple(self): # Assert assert out is False - def test_set_primary_key_validation(self): + def test_set_primary_key_validation_dtype(self): + """Test that ``set_primary_key`` crashes for invalid arguments. + + Input: + - A tuple with non-string values. + + Side Effect: + - A ``ValueError`` should be raised. + """ + # Setup + instance = SingleTableMetadata() + + err_msg = ( + "'primary_key' must be a string or tuple of strings." + ) + # Run / Assert + with pytest.raises(ValueError, match=err_msg): + instance.set_primary_key(('1', 2, '3')) + + def test_set_primary_key_validation_columns(self): """Test that ``set_primary_key`` crashes for invalid arguments. Setup: - A ``SingleTableMetadata`` instance with ``_columns`` set. Input: - - A tuple with non-string values. + - A tuple with columns not present in ``_columns``. Side Effect: - A ``ValueError`` should be raised. @@ -879,18 +896,18 @@ def test_set_primary_key_validation(self): instance._columns = {'a', 'd'} err_msg = ( - "'primary_key' must be a string or tuple of strings." - "\n\nUnknown primary key values {'b', 'c'}." + "Unknown primary key values {'b', 'c'}." ' Keys should be columns that exist in the table.' ) # Run / Assert - with pytest.raises(MultipleConstraintsErrors, match=err_msg): + with pytest.raises(ValueError, match=err_msg): instance.set_primary_key(('a', 'b', 'd', 'c')) def test_set_primary_key(self): """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value.""" # Setup instance = SingleTableMetadata() + instance._columns = {'column'} # Run instance.set_primary_key('column') @@ -902,6 +919,7 @@ def test_set_primary_key_tuple(self): """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value for tuples.""" # Setup instance = SingleTableMetadata() + instance._columns = {'column1', 'column2'} # Run instance.set_primary_key(('column1', 'column2')) @@ -924,6 +942,7 @@ def test_set_primary_key_warning(self, warning_mock): """ # Setup instance = SingleTableMetadata() + instance._columns = {'column1'} instance._metadata['primary_key'] = 'column0' # Run @@ -934,11 +953,11 @@ def test_set_primary_key_warning(self, warning_mock): assert warning_mock.warn.called_once_with(warning_msg) assert instance._metadata['primary_key'] == 'column1' - def test_set_alternate_keys_validation(self): - """Test that ``set_alternate_keys`` crashes for invalid arguments. + def test_set_sequence_key_validation_dtype(self): + """Test that ``set_sequence_key`` crashes for invalid arguments. Input: - - A list with tuples with non-string values. + - A tuple with non-string values. Side Effect: - A ``ValueError`` should be raised. @@ -946,43 +965,40 @@ def test_set_alternate_keys_validation(self): # Setup instance = SingleTableMetadata() - err_msg = "'alternate_keys' must be a list of strings or a list of tuples of strings." + err_msg = "'sequence_key' must be a string or tuple of strings." # Run / Assert with pytest.raises(ValueError, match=err_msg): - instance.set_alternate_keys(['col1', ('1', 2, '3'), 'col3']) - - def test_set_alternate_keys(self): - """Test that ``set_alternate_keys`` sets the ``_metadata['alternate_keys']`` value.""" - # Setup - instance = SingleTableMetadata() - - # Run - instance.set_alternate_keys(['column1', ('column2', 'column3')]) - - # Assert - assert instance._metadata['alternate_keys'] == ['column1', ('column2', 'column3')] + instance.set_sequence_key(('1', 2, '3')) - def test_set_sequence_key_validation(self): + def test_set_sequence_key_validation_columns(self): """Test that ``set_sequence_key`` crashes for invalid arguments. + Setup: + - A ``SingleTableMetadata`` instance with ``_columns`` set. + Input: - - A tuple with non-string values. + - A tuple with columns not present in ``_columns``. Side Effect: - A ``ValueError`` should be raised. """ # Setup instance = SingleTableMetadata() + instance._columns = {'a', 'd'} - err_msg = "'sequence_key' must be a string or tuple of strings." + err_msg = ( + "Unknown sequence key values {'b', 'c'}." + ' Keys should be columns that exist in the table.' + ) # Run / Assert with pytest.raises(ValueError, match=err_msg): - instance.set_sequence_key(('1', 2, '3')) + instance.set_sequence_key(('a', 'b', 'd', 'c')) def test_set_sequence_key(self): """Test that ``set_sequence_key`` sets the ``_metadata['sequence_key']`` value.""" # Setup instance = SingleTableMetadata() + instance._columns = {'column'} # Run instance.set_sequence_key('column') @@ -994,6 +1010,7 @@ def test_set_sequence_key_tuple(self): """Test that ``set_sequence_key`` sets ``_metadata['sequence_key']`` for tuples.""" # Setup instance = SingleTableMetadata() + instance._columns = {'column1', 'column2'} # Run instance.set_sequence_key(('column1', 'column2')) @@ -1016,6 +1033,7 @@ def test_set_sequence_key_warning(self, warning_mock): """ # Setup instance = SingleTableMetadata() + instance._columns = {'column1'} instance._metadata['sequence_key'] = 'column0' # Run @@ -1026,6 +1044,59 @@ def test_set_sequence_key_warning(self, warning_mock): assert warning_mock.warn.called_once_with(warning_msg) assert instance._metadata['sequence_key'] == 'column1' + def test_set_alternate_keys_validation_dtype(self): + """Test that ``set_alternate_keys`` crashes for invalid arguments. + + Input: + - A list with tuples with non-string values. + + Side Effect: + - A ``ValueError`` should be raised. + """ + # Setup + instance = SingleTableMetadata() + + err_msg = "'alternate_keys' must be a list of strings or a list of tuples of strings." + # Run / Assert + with pytest.raises(ValueError, match=err_msg): + instance.set_alternate_keys(['col1', ('1', 2, '3'), 'col3']) + + def test_set_alternate_keys_validation_columns(self): + """Test that ``set_alternate_keys`` crashes for invalid arguments. + + Setup: + - A ``SingleTableMetadata`` instance with ``_columns`` set. + + Input: + - A tuple with non-string values. + + Side Effect: + - A ``ValueError`` should be raised. + """ + # Setup + instance = SingleTableMetadata() + instance._columns = {'abc', '213', '312'} + + err_msg = ( + "Unknown alternate key values {'bca', '123'}." + ' Keys should be columns that exist in the table.' + ) + # Run / Assert + with pytest.raises(ValueError, match=err_msg): + instance.set_alternate_keys(['abc', ('123', '213', '312'), 'bca']) + + def test_set_alternate_keys(self): + """Test that ``set_alternate_keys`` sets the ``_metadata['alternate_keys']`` value.""" + # Setup + instance = SingleTableMetadata() + instance._columns = {'column1', 'column2', 'column3'} + + # Run + instance.set_alternate_keys(['column1', ('column2', 'column3')]) + + # Assert + assert instance._metadata['alternate_keys'] == ['column1', ('column2', 'column3')] + def test_set_sequence_index_validation(self): """Test that ``set_sequence_index`` crashes for invalid arguments. @@ -1043,16 +1114,56 @@ def test_set_sequence_index_validation(self): with pytest.raises(ValueError, match=err_msg): instance.set_sequence_index(('column1', 'column2')) + def test_set_sequence_index_validation_columns(self): + """Test that ``set_sequence_index`` crashes for invalid arguments. + + Setup: + - A ``SingleTableMetadata`` instance with ``_columns`` set. + + Input: + - A string not present in ``_columns``. + + Side Effect: + - A ``ValueError`` should be raised. + """ + # Setup + instance = SingleTableMetadata() + instance._columns = {'a', 'd'} + + err_msg = ( + "Unknown sequence key value {'column'}." + ' Keys should be columns that exist in the table.' + ) + # Run / Assert + with pytest.raises(ValueError, match=err_msg): + instance.set_sequence_index('column') + def test_set_sequence_index(self): """Test that ``set_sequence_index`` sets the ``_metadata['sequence_index']`` value.""" # Setup instance = SingleTableMetadata() + instance._columns = {'column'} # Run instance.set_sequence_index('column') # Assert assert instance._metadata['sequence_index'] == 'column' + + def test_validate_sequence_index_not_in_sequence_key(self): + """Test the ``_validate_sequence_index_not_in_sequence_key`` method.""" + # Setup + instance = SingleTableMetadata() + instance._sequence_key = ('abc', 'def') + instance._sequence_index = 'abc' + + err_msg = ( + "'sequence_index' and 'sequence_key' have the same value {'abc'}." + ' These columns must be different.' + ) + # Run / Assert + with pytest.raises(ValueError, match=err_msg): + instance._validate_sequence_index_not_in_sequence_key() def test_to_dict(self): """Test the ``to_dict`` method from ``SingleTableMetadata``. From 3d9a5f7c576f8ee50573f3eda9c1df3b928cd93d Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Mon, 1 Aug 2022 19:05:04 -0700 Subject: [PATCH 04/18] Add silly constraint solution --- sdv/metadata/single_table.py | 16 +++++++++++----- tests/unit/metadata/test_single_table.py | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 4496ff91f..81211abc3 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -8,11 +8,12 @@ from pathlib import Path import pandas as pd -from sdv.constraints.errors import MultipleConstraintsErrors from sdv.constraints import Constraint from sdv.metadata.errors import MetadataError +import inspect + class SingleTableMetadata: """Single Table Metadata class.""" @@ -354,7 +355,7 @@ def _validate_sequence_index_not_in_sequence_key(self): """Check that ``_sequence_index`` and ``_sequence_key`` don't overlap.""" sk = self._sequence_key sequence_key = {sk} if isinstance(sk, str) else set(sk) - if self._sequence_index in sequence_key: + if self._sequence_index in sequence_key or sk is None: index = {self._sequence_index} raise ValueError( f"'sequence_index' and 'sequence_key' have the same value {index}." @@ -363,7 +364,7 @@ def _validate_sequence_index_not_in_sequence_key(self): def validate(self): """Validate the metadata. - + Raises: - ``InvalidMetadataError`` if the metadata is invalid. """ @@ -375,8 +376,13 @@ def validate(self): self._validate_sequence_index_not_in_sequence_key() # Validate constraints - for constraint, args in self._constraints.items(): - constraint._validate_metadata(self, **args) + for constraint in self._constraints: + params = inspect.signature(constraint.__init__).parameters.keys() + params = ['_' + arg for arg in params] + kwargs = {arg[1:]: constraint.arg for arg in params} + constraint._validate_metadata(self, **kwargs) + + assert 1 == 2 # Validate columns for column, kwargs in self._columns: diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 4d86e0db0..63dac1e82 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -1164,6 +1164,27 @@ def test_validate_sequence_index_not_in_sequence_key(self): # Run / Assert with pytest.raises(ValueError, match=err_msg): instance._validate_sequence_index_not_in_sequence_key() + + def test_validate(self): + """Test ``validate``.""" + # Setup + instance = SingleTableMetadata() + instance._columns = {'col1': {'sdtype': 'numerical'}, 'col2': {'sdtype': 'numerical'}} + instance._constraints = [] + instance._primary_key = 'col1' + instance._alternate_keys = ['col2'] + instance._sequence_key = 'col1' + instance._sequence_index = 'col2' + + instance.add_constraint( + constraint_name='Inequality', + low_column_name='col1', + high_column_name='col2' + ) + + # Run + instance.validate() + def test_to_dict(self): """Test the ``to_dict`` method from ``SingleTableMetadata``. From 46117007c31a11cd0bb8da88f0d2481e957f7e03 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 10:50:08 -0700 Subject: [PATCH 05/18] Add try/catches for validation + add unit and integration tests --- sdv/metadata/errors.py | 4 ++ sdv/metadata/single_table.py | 70 +++++++++++------- .../integration/metadata/test_single_table.py | 72 +++++++++++++++++++ tests/unit/metadata/test_single_table.py | 51 ++++++++----- 4 files changed, 154 insertions(+), 43 deletions(-) diff --git a/sdv/metadata/errors.py b/sdv/metadata/errors.py index 1ba8f3bd6..ceb7e1aa8 100644 --- a/sdv/metadata/errors.py +++ b/sdv/metadata/errors.py @@ -5,5 +5,9 @@ class MetadataError(Exception): """Error to raise when Metadata is not valid.""" +class InvalidMetadataError(Exception): + """Error to raise when Metadata is not valid.""" + + class MetadataNotFittedError(MetadataError): """Error to raise when Metadata is used before fitting.""" diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 81211abc3..325d20af3 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -10,9 +10,7 @@ import pandas as pd from sdv.constraints import Constraint -from sdv.metadata.errors import MetadataError - -import inspect +from sdv.metadata.errors import InvalidMetadataError, MetadataError class SingleTableMetadata: @@ -54,13 +52,16 @@ def _validate_numerical(self, column_name, **kwargs): representation = kwargs.get('representation') if representation and representation not in self._NUMERICAL_REPRESENTATIONS: raise ValueError( - f"Invalid value for 'representation' {representation} for column '{column_name}'.") + f"Invalid value for 'representation' '{representation}'" + f" for column '{column_name}'." + ) @staticmethod def _validate_datetime(column_name, **kwargs): datetime_format = kwargs.get('datetime_format') - if datetime_format: + if datetime_format is not None: try: + # NOTE: I don't know if this ever crashes, it just returns the string as is formated_date = datetime.now().strftime(datetime_format) except Exception as exception: raise ValueError( @@ -79,12 +80,12 @@ def _validate_datetime(column_name, **kwargs): def _validate_categorical(column_name, **kwargs): order = kwargs.get('order') order_by = kwargs.get('order_by') - if order and order_by: + if order is not None and order_by is not None: raise ValueError( f"Categorical column '{column_name}' has both an 'order' and 'order_by' " 'attribute. Only 1 is allowed.' ) - if order_by and order_by not in ('numerical_value', 'alphabetical'): + if order_by is not None and order_by not in ('numerical_value', 'alphabetical'): raise ValueError( f"Unknown ordering method '{order_by}' provided for categorical column " f"'{column_name}'. Ordering method must be 'numerical_value' or 'alphabetical'." @@ -284,7 +285,7 @@ def set_primary_key(self, id): ' This key will be removed.' ) - self._metadata['primary_key'] = id + self._primary_key = id def set_sequence_key(self, id): """Set the metadata sequence key. @@ -301,7 +302,7 @@ def set_sequence_key(self, id): ' This key will be removed.' ) - self._metadata['sequence_key'] = id + self._sequence_key = id def _validate_alternate_keys(self, ids): if not isinstance(ids, list) or not all(self._validate_datatype(id) for id in ids): @@ -349,7 +350,7 @@ def set_sequence_index(self, column_name): Name of the sequence index column. """ self._validate_sequence_index(column_name) - self._metadata['sequence_index'] = column_name + self._sequence_index = column_name def _validate_sequence_index_not_in_sequence_key(self): """Check that ``_sequence_index`` and ``_sequence_key`` don't overlap.""" @@ -369,25 +370,44 @@ def validate(self): - ``InvalidMetadataError`` if the metadata is invalid. """ # Validate keys - self._validate_key(self._primary_key, 'primary') - self._validate_alternate_keys(self._alternate_keys) - self._validate_key(self._sequence_key, 'sequence') - self._validate_sequence_index(self._sequence_index) - self._validate_sequence_index_not_in_sequence_key() + errors = [] + try: + self._validate_key(self._primary_key, 'primary') + except ValueError as e: + errors.append(e) - # Validate constraints - for constraint in self._constraints: - params = inspect.signature(constraint.__init__).parameters.keys() - params = ['_' + arg for arg in params] - kwargs = {arg[1:]: constraint.arg for arg in params} - constraint._validate_metadata(self, **kwargs) + try: + self._validate_key(self._sequence_key, 'sequence') + except ValueError as e: + errors.append(e) - assert 1 == 2 + try: + self._validate_alternate_keys(self._alternate_keys) + except ValueError as e: + errors.append(e) + + try: + self._validate_sequence_index(self._sequence_index) + except ValueError as e: + errors.append(e) + + try: + self._validate_sequence_index_not_in_sequence_key() + except ValueError as e: + errors.append(e) # Validate columns - for column, kwargs in self._columns: - sdtype = kwargs.get('sdtype') - self._validate_column(column, sdtype, **kwargs) + for column, kwargs in self._columns.items(): + try: + self._validate_column(column, **kwargs) + except ValueError as e: + errors.append(e) + + if errors: + raise InvalidMetadataError( + 'The following errors were found in the metadata:\n\n' + + '\n'.join([str(e) for e in errors]) + ) def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index af015b2a1..1be7b6bd4 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -1,6 +1,11 @@ """Integration tests for Single Table Metadata.""" +import re + +import pytest + from sdv.metadata import SingleTableMetadata +from sdv.metadata.errors import InvalidMetadataError def test_single_table_metadata(): @@ -28,3 +33,70 @@ def test_single_table_metadata(): 'constraints': [], 'SCHEMA_VERSION': 'SINGLE_TABLE_V1' } + + +def test_validate(): + """Test ``SingleTableMetadata.validate``. + + Ensure the method doesn't crash for a valid metadata. + """ + # Setup + instance = SingleTableMetadata() + instance.add_column('col1', sdtype='numerical') + instance.add_column('col2', sdtype='numerical') + instance.add_constraint( + constraint_name='Inequality', + low_column_name='col1', + high_column_name='col2' + ) + instance.set_primary_key('col1') + instance.set_alternate_keys([('col1', 'col2')]) + instance.set_sequence_index('col1') + instance.set_sequence_key('col2') + + # Run + instance.validate() + + +def test_validate_errors(): + """Test Test ``SingleTableMetadata.validate`` raises the correct errors.""" + # Setup + instance = SingleTableMetadata() + instance._columns = { + 'col1': {'sdtype': 'numerical'}, + 'col2': {'sdtype': 'numerical'}, + 'col4': {'sdtype': 'categorical', 'invalid1': 'value'}, + 'col5': {'sdtype': 'categorical', 'order': ''}, + 'col6': {'sdtype': 'categorical', 'order_by': ''}, + 'col7': {'sdtype': 'categorical', 'order': '', 'order_by': ''}, + 'col8': {'sdtype': 'numerical', 'representation': 'value'}, + 'col9': {'sdtype': 'datetime', 'datetime_format': '%1-%Y-%m-%d-%'}, + 'col10': {'sdtype': 'text', 'regex_format': '[A-{6}'}, + } + instance._primary_key = 10 + instance._alternate_keys = 'col1' + instance._sequence_key = ('col3', 'col1') + instance._sequence_index = 'col3' + + err_msg = re.escape( + 'The following errors were found in the metadata:' + "\n\n'primary_key' must be a string or tuple of strings." + "\nUnknown sequence key values {'col3'}. Keys should be columns that exist in the table." + "\n'alternate_keys' must be a list of strings or a list of tuples of strings." + "\nUnknown sequence key value {'col3'}. Keys should be columns that exist in the table." + "\n'sequence_index' and 'sequence_key' have the same value {'col3'}." + ' These columns must be different.' + "\nInvalid values '(invalid1)' for categorical column 'col4'." + "\nInvalid order value provided for categorical column 'col5'." + " The 'order' must be a list with 1 or more elements." + "\nUnknown ordering method '' provided for categorical column 'col6'." + " Ordering method must be 'numerical_value' or 'alphabetical'." + "\nCategorical column 'col7' has both an 'order' and 'order_by' attribute." + ' Only 1 is allowed.' + "\nInvalid value for 'representation' 'value' for column 'col8'." + "\nInvalid datetime format string '%1-%Y-%m-%d-%' for datetime column 'col9'." + "\nInvalid regex format string '[A-{6}' for text column 'col10'." + ) + # Run / Assert + with pytest.raises(InvalidMetadataError, match=err_msg): + instance.validate() diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 63dac1e82..2d9e66c0d 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -89,7 +89,7 @@ def test__validate_numerical_default_and_invalid(self): # Run / Assert instance._validate_numerical('age') - error_msg = re.escape("Invalid value for 'representation' 36 for column 'age'.") + error_msg = re.escape("Invalid value for 'representation' '36' for column 'age'.") with pytest.raises(ValueError, match=error_msg): instance._validate_numerical('age', representation=36) @@ -913,7 +913,7 @@ def test_set_primary_key(self): instance.set_primary_key('column') # Assert - assert instance._metadata['primary_key'] == 'column' + assert instance._primary_key == 'column' def test_set_primary_key_tuple(self): """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value for tuples.""" @@ -925,7 +925,7 @@ def test_set_primary_key_tuple(self): instance.set_primary_key(('column1', 'column2')) # Assert - assert instance._metadata['primary_key'] == ('column1', 'column2') + assert instance._primary_key == ('column1', 'column2') @patch('sdv.tabular.utils.warnings') def test_set_primary_key_warning(self, warning_mock): @@ -943,7 +943,7 @@ def test_set_primary_key_warning(self, warning_mock): # Setup instance = SingleTableMetadata() instance._columns = {'column1'} - instance._metadata['primary_key'] = 'column0' + instance._primary_key = 'column0' # Run instance.set_primary_key('column1') @@ -951,7 +951,7 @@ def test_set_primary_key_warning(self, warning_mock): # Assert warning_msg = "There is an existing primary key 'column0'. This key will be removed." assert warning_mock.warn.called_once_with(warning_msg) - assert instance._metadata['primary_key'] == 'column1' + assert instance._primary_key == 'column1' def test_set_sequence_key_validation_dtype(self): """Test that ``set_sequence_key`` crashes for invalid arguments. @@ -1004,7 +1004,7 @@ def test_set_sequence_key(self): instance.set_sequence_key('column') # Assert - assert instance._metadata['sequence_key'] == 'column' + assert instance._sequence_key == 'column' def test_set_sequence_key_tuple(self): """Test that ``set_sequence_key`` sets ``_metadata['sequence_key']`` for tuples.""" @@ -1016,7 +1016,7 @@ def test_set_sequence_key_tuple(self): instance.set_sequence_key(('column1', 'column2')) # Assert - assert instance._metadata['sequence_key'] == ('column1', 'column2') + assert instance._sequence_key == ('column1', 'column2') @patch('sdv.tabular.utils.warnings') def test_set_sequence_key_warning(self, warning_mock): @@ -1042,7 +1042,7 @@ def test_set_sequence_key_warning(self, warning_mock): # Assert warning_msg = "There is an existing sequence key 'column0'. This key will be removed." assert warning_mock.warn.called_once_with(warning_msg) - assert instance._metadata['sequence_key'] == 'column1' + assert instance._sequence_key == 'column1' def test_set_alternate_keys_validation_dtype(self): """Test that ``set_alternate_keys`` crashes for invalid arguments. @@ -1148,8 +1148,8 @@ def test_set_sequence_index(self): instance.set_sequence_index('column') # Assert - assert instance._metadata['sequence_index'] == 'column' - + assert instance._sequence_index == 'column' + def test_validate_sequence_index_not_in_sequence_key(self): """Test the ``_validate_sequence_index_not_in_sequence_key`` method.""" # Setup @@ -1164,9 +1164,19 @@ def test_validate_sequence_index_not_in_sequence_key(self): # Run / Assert with pytest.raises(ValueError, match=err_msg): instance._validate_sequence_index_not_in_sequence_key() - + def test_validate(self): - """Test ``validate``.""" + """Test the ``validate`` method. + + Ensure the method calls the correct methods with the correct parameters. + + Setup: + - A ``SingleTableMetadata`` instance with: + - ``_columns``, ``_constraints``, ``_primary_key``, ``_alternate_keys``, + ``_sequence_key`` and ``_sequence_index`` defined. + - ``_validate_key``, ``_validate_alternate_keys``, ``_validate_sequence_index`` + and ``_validate_sequence_index_not_in_sequence_key`` mocked. + """ # Setup instance = SingleTableMetadata() instance._columns = {'col1': {'sdtype': 'numerical'}, 'col2': {'sdtype': 'numerical'}} @@ -1175,16 +1185,21 @@ def test_validate(self): instance._alternate_keys = ['col2'] instance._sequence_key = 'col1' instance._sequence_index = 'col2' - - instance.add_constraint( - constraint_name='Inequality', - low_column_name='col1', - high_column_name='col2' - ) + instance._validate_key = Mock() + instance._validate_alternate_keys = Mock() + instance._validate_sequence_index = Mock() + instance._validate_sequence_index_not_in_sequence_key = Mock() # Run instance.validate() + # Assert + instance._validate_key.assert_has_calls( + [call(instance._primary_key, 'primary'), call(instance._sequence_key, 'sequence')] + ) + instance._validate_alternate_keys.assert_called_once_with(instance._alternate_keys) + instance._validate_sequence_index.assert_called_once_with(instance._sequence_index) + instance._validate_sequence_index_not_in_sequence_key.assert_called_once() def test_to_dict(self): """Test the ``to_dict`` method from ``SingleTableMetadata``. From 96a1c0edb8acf21ba91e44afd8f3964ac8fc82f8 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 12:42:48 -0700 Subject: [PATCH 06/18] Change MetadataError to InvalidMetadataError --- sdv/metadata/__init__.py | 4 ++-- sdv/metadata/dataset.py | 10 +++++----- sdv/metadata/errors.py | 6 +----- sdv/metadata/single_table.py | 4 ++-- sdv/metadata/table.py | 4 ++-- tests/unit/metadata/test_single_table.py | 6 +++--- 6 files changed, 15 insertions(+), 19 deletions(-) diff --git a/sdv/metadata/__init__.py b/sdv/metadata/__init__.py index f6f0e34df..27a6ff0f2 100644 --- a/sdv/metadata/__init__.py +++ b/sdv/metadata/__init__.py @@ -2,14 +2,14 @@ from sdv.metadata import visualization from sdv.metadata.dataset import Metadata -from sdv.metadata.errors import MetadataError, MetadataNotFittedError +from sdv.metadata.errors import InvalidMetadataError, MetadataNotFittedError from sdv.metadata.multi_table import MultiTableMetadata from sdv.metadata.single_table import SingleTableMetadata from sdv.metadata.table import Table __all__ = ( 'Metadata', - 'MetadataError', + 'InvalidMetadataError', 'MetadataNotFittedError', 'MultiTableMetadata', 'SingleTableMetadata', diff --git a/sdv/metadata/dataset.py b/sdv/metadata/dataset.py index bd77b95a5..3e6bb4dbb 100644 --- a/sdv/metadata/dataset.py +++ b/sdv/metadata/dataset.py @@ -12,7 +12,7 @@ from sdv.constraints import Constraint from sdv.metadata import visualization -from sdv.metadata.errors import MetadataError +from sdv.metadata.errors import InvalidMetadataError LOGGER = logging.getLogger(__name__) @@ -551,7 +551,7 @@ def _validate_table(self, table_name, table_meta, table_data=None, errors=None): on the metadata. Raises: - MetadataError: + InvalidMetadataError: If there is any error in the metadata or the data does not match the metadata description. """ @@ -635,7 +635,7 @@ def validate(self, tables=None): """ tables_meta = self._metadata.get('tables') if not tables_meta: - raise MetadataError('"tables" entry not found in Metadata.') + raise InvalidMetadataError('"tables" entry not found in Metadata.') if tables and not isinstance(tables, dict): tables = self.load_tables() @@ -654,7 +654,7 @@ def validate(self, tables=None): self._validate_circular_relationships(table_name, errors=errors) if errors: - raise MetadataError('Invalid Metadata specification:\n - ' + '\n - '.join(errors)) + raise InvalidMetadataError('Invalid Metadata specification:\n - ' + '\n - '.join(errors)) def _check_field(self, table, field, exists=False): """Validate the existance of the table and existance (or not) of field.""" @@ -832,7 +832,7 @@ def add_relationship(self, parent, child, foreign_key=None, validate=True): if validate: try: self.validate() - except MetadataError: + except InvalidMetadataError: self._metadata = metadata_backup raise diff --git a/sdv/metadata/errors.py b/sdv/metadata/errors.py index ceb7e1aa8..08f62ca8a 100644 --- a/sdv/metadata/errors.py +++ b/sdv/metadata/errors.py @@ -1,13 +1,9 @@ """Metadata Exceptions.""" -class MetadataError(Exception): - """Error to raise when Metadata is not valid.""" - - class InvalidMetadataError(Exception): """Error to raise when Metadata is not valid.""" -class MetadataNotFittedError(MetadataError): +class MetadataNotFittedError(InvalidMetadataError): """Error to raise when Metadata is used before fitting.""" diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 325d20af3..f1d7cd963 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -10,7 +10,7 @@ import pandas as pd from sdv.constraints import Constraint -from sdv.metadata.errors import InvalidMetadataError, MetadataError +from sdv.metadata.errors import InvalidMetadataError class SingleTableMetadata: @@ -457,7 +457,7 @@ def add_constraint(self, constraint_name, **kwargs): try: constraint_class = Constraint._get_class_from_dict(constraint_name) except KeyError: - raise MetadataError(f"Invalid constraint ('{constraint_name}').") + raise InvalidMetadataError(f"Invalid constraint ('{constraint_name}').") constraint_class._validate_metadata(self, **kwargs) constraint = constraint_class(**kwargs) diff --git a/sdv/metadata/table.py b/sdv/metadata/table.py index 6ef4751d2..aa67d1156 100644 --- a/sdv/metadata/table.py +++ b/sdv/metadata/table.py @@ -13,7 +13,7 @@ from sdv.constraints import Constraint from sdv.constraints.errors import ( FunctionError, MissingConstraintColumnError, MultipleConstraintsErrors) -from sdv.metadata.errors import MetadataError, MetadataNotFittedError +from sdv.metadata.errors import InvalidMetadataError, MetadataNotFittedError from sdv.metadata.utils import strings_from_regex LOGGER = logging.getLogger(__name__) @@ -306,7 +306,7 @@ def _get_field_dtype(self, field_name, field_metadata): field_subtype = field_metadata.get('subtype') dtype = self._TYPES_TO_DTYPES.get((field_type, field_subtype)) if not dtype: - raise MetadataError( + raise InvalidMetadataError( 'Invalid type and subtype combination for field {}: ({}, {})'.format( field_name, field_type, field_subtype) ) diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 2d9e66c0d..59621e4b3 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -10,7 +10,7 @@ import pandas as pd import pytest -from sdv.metadata.errors import MetadataError +from sdv.metadata.errors import InvalidMetadataError from sdv.metadata.single_table import SingleTableMetadata @@ -1549,12 +1549,12 @@ def test_add_constraint_bad_constraint(self): - Fakse constraint name Side effect: - - MetadataError should be raised + - InvalidMetadataError should be raised """ # Setup metadata = SingleTableMetadata() # Run error_message = re.escape("Invalid constraint ('fake_constraint').") - with pytest.raises(MetadataError, match=error_message): + with pytest.raises(InvalidMetadataError, match=error_message): metadata.add_constraint(constraint_name='fake_constraint') From a54f3570b199c822d86eddd27425fe6b5dc72e40 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 12:52:24 -0700 Subject: [PATCH 07/18] Simplify tests to one element + fix lint --- sdv/metadata/dataset.py | 3 ++- sdv/metadata/single_table.py | 1 - tests/unit/metadata/test_single_table.py | 15 +++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sdv/metadata/dataset.py b/sdv/metadata/dataset.py index 3e6bb4dbb..b9cacc14a 100644 --- a/sdv/metadata/dataset.py +++ b/sdv/metadata/dataset.py @@ -654,7 +654,8 @@ def validate(self, tables=None): self._validate_circular_relationships(table_name, errors=errors) if errors: - raise InvalidMetadataError('Invalid Metadata specification:\n - ' + '\n - '.join(errors)) + raise InvalidMetadataError( + 'Invalid Metadata specification:\n - ' + '\n - '.join(errors)) def _check_field(self, table, field, exists=False): """Validate the existance of the table and existance (or not) of field.""" diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index f1d7cd963..f5bf4ea14 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -61,7 +61,6 @@ def _validate_datetime(column_name, **kwargs): datetime_format = kwargs.get('datetime_format') if datetime_format is not None: try: - # NOTE: I don't know if this ever crashes, it just returns the string as is formated_date = datetime.now().strftime(datetime_format) except Exception as exception: raise ValueError( diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 59621e4b3..800e0df2b 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -896,12 +896,13 @@ def test_set_primary_key_validation_columns(self): instance._columns = {'a', 'd'} err_msg = ( - "Unknown primary key values {'b', 'c'}." + "Unknown primary key values {'b'}." ' Keys should be columns that exist in the table.' ) # Run / Assert with pytest.raises(ValueError, match=err_msg): - instance.set_primary_key(('a', 'b', 'd', 'c')) + instance.set_primary_key(('a', 'b', 'd')) + # NOTE: used to be ('a', 'b', 'd', 'c') def test_set_primary_key(self): """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value.""" @@ -987,12 +988,13 @@ def test_set_sequence_key_validation_columns(self): instance._columns = {'a', 'd'} err_msg = ( - "Unknown sequence key values {'b', 'c'}." + "Unknown sequence key values {'b'}." ' Keys should be columns that exist in the table.' ) # Run / Assert with pytest.raises(ValueError, match=err_msg): - instance.set_sequence_key(('a', 'b', 'd', 'c')) + instance.set_sequence_key(('a', 'b', 'd')) + # NOTE: used to be ('a', 'b', 'd', 'c') def test_set_sequence_key(self): """Test that ``set_sequence_key`` sets the ``_metadata['sequence_key']`` value.""" @@ -1078,12 +1080,13 @@ def test_set_alternate_keys_validation_columns(self): instance._columns = {'abc', '213', '312'} err_msg = ( - "Unknown alternate key values {'bca', '123'}." + "Unknown alternate key values {'123'}." ' Keys should be columns that exist in the table.' ) # Run / Assert with pytest.raises(ValueError, match=err_msg): - instance.set_alternate_keys(['abc', ('123', '213', '312'), 'bca']) + instance.set_alternate_keys(['abc', ('123', '213', '312')]) + # NOTE: used to be ['abc', ('123', '213', '312'), 'bca'] def test_set_alternate_keys(self): """Test that ``set_alternate_keys`` sets the ``_metadata['alternate_keys']`` value.""" From 9097c5e7814970ff6e9cd13e12619fa63f2b9ea2 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 13:47:40 -0700 Subject: [PATCH 08/18] Remove _metadata --- sdv/metadata/single_table.py | 39 +++++----- .../integration/metadata/test_single_table.py | 13 +--- tests/unit/metadata/test_single_table.py | 76 ++++++++----------- 3 files changed, 57 insertions(+), 71 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index f5bf4ea14..c8cce7c7e 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -114,15 +114,6 @@ def __init__(self): self._sequence_key = None self._sequence_index = None self._version = self.SCHEMA_VERSION - self._metadata = { - 'columns': self._columns, - 'primary_key': None, - 'alternate_keys': [], - 'sequence_key': None, - 'sequence_index': None, - 'constraints': self._constraints, - 'SCHEMA_VERSION': self.SCHEMA_VERSION - } def _validate_unexpected_kwargs(self, column_name, sdtype, **kwargs): expected_kwargs = self._EXPECTED_KWARGS.get(sdtype, ['pii']) @@ -278,9 +269,9 @@ def set_primary_key(self, id): """ self._validate_key(id, 'primary') - if self._metadata['primary_key'] is not None: + if self._primary_key is not None: warnings.warn( - f"There is an existing primary key {self._metadata['primary_key']}." + f"There is an existing primary key {self._primary_key}." ' This key will be removed.' ) @@ -295,9 +286,9 @@ def set_sequence_key(self, id): """ self._validate_key(id, 'sequence') - if self._metadata['sequence_key'] is not None: + if self._sequence_key is not None: warnings.warn( - f"There is an existing sequence key {self._metadata['sequence_key']}." + f"There is an existing sequence key {self._sequence_key}." ' This key will be removed.' ) @@ -328,7 +319,7 @@ def set_alternate_keys(self, ids): List of names (or tuple of names) of the alternate key columns. """ self._validate_alternate_keys(ids) - self._metadata['alternate_keys'] = ids + self._alternate_keys = ids def _validate_sequence_index(self, column_name): if not isinstance(column_name, str): @@ -408,10 +399,21 @@ def validate(self): + '\n'.join([str(e) for e in errors]) ) + def _metadata_dict(self): + return { + 'columns': self._columns, + 'constraints': self._constraints, + 'primary_key': self._primary_key, + 'alternate_keys': self._alternate_keys, + 'sequence_key': self._sequence_key, + 'sequence_index': self._sequence_index, + 'SCHEMA_VERSION': self._version + } + def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" metadata = {} - for key, value in self._metadata.items(): + for key, value in self._metadata_dict().items(): if key == 'constraints' and value: constraints = [] for constraint in value: @@ -427,8 +429,8 @@ def to_dict(self): return deepcopy(metadata) - def _set_metadata_dict(self, metadata): - """Set the ``metadata`` dictionary to the current instance. + def _set_metadata_attributes(self, metadata): + """Set the metadata attributes to the current instance. Args: metadata (dict): @@ -440,7 +442,6 @@ def _set_metadata_dict(self, metadata): value = [Constraint.from_dict(constraint_dict) for constraint_dict in value] if value: - self._metadata[key] = value setattr(self, f'_{key}', value) def add_constraint(self, constraint_name, **kwargs): @@ -474,7 +475,7 @@ def _load_from_dict(cls, metadata): Instance of ``SingleTableMetadata``. """ instance = cls() - instance._set_metadata_dict(metadata) + instance._set_metadata_attributes(metadata) return instance @classmethod diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index 1be7b6bd4..291cee176 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -24,15 +24,10 @@ def test_single_table_metadata(): assert instance._columns == {} assert instance._constraints == [] assert instance._version == 'SINGLE_TABLE_V1' - assert instance._metadata == { - 'columns': {}, - 'primary_key': None, - 'alternate_keys': [], - 'sequence_key': None, - 'sequence_index': None, - 'constraints': [], - 'SCHEMA_VERSION': 'SINGLE_TABLE_V1' - } + assert instance._primary_key is None + assert instance._sequence_key is None + assert instance._alternate_keys == [] + assert instance._sequence_index is None def test_validate(): diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 800e0df2b..2b6df1126 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -55,17 +55,12 @@ def test___init__(self): # Assert assert instance._columns == {} + assert instance._primary_key is None + assert instance._sequence_key is None + assert instance._alternate_keys == [] + assert instance._sequence_index is None assert instance._constraints == [] assert instance._version == 'SINGLE_TABLE_V1' - assert instance._metadata == { - 'columns': {}, - 'primary_key': None, - 'alternate_keys': [], - 'sequence_key': None, - 'sequence_index': None, - 'constraints': [], - 'SCHEMA_VERSION': 'SINGLE_TABLE_V1' - } def test__validate_numerical_default_and_invalid(self): """Test the ``_validate_numerical`` method. @@ -905,7 +900,7 @@ def test_set_primary_key_validation_columns(self): # NOTE: used to be ('a', 'b', 'd', 'c') def test_set_primary_key(self): - """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value.""" + """Test that ``set_primary_key`` sets the ``_primary_key`` value.""" # Setup instance = SingleTableMetadata() instance._columns = {'column'} @@ -917,7 +912,7 @@ def test_set_primary_key(self): assert instance._primary_key == 'column' def test_set_primary_key_tuple(self): - """Test that ``set_primary_key`` sets the ``_metadata['primary_key']`` value for tuples.""" + """Test that ``set_primary_key`` sets the ``_primary_key`` value for tuples.""" # Setup instance = SingleTableMetadata() instance._columns = {'column1', 'column2'} @@ -933,7 +928,7 @@ def test_set_primary_key_warning(self, warning_mock): """Test that ``set_primary_key`` raises a warning when a primary key already exists. Setup: - - An instance of ``SingleTableMetadata`` with ``_metadata['primary_key']`` set. + - An instance of ``SingleTableMetadata`` with ``_primary_key`` set. Input: - String. @@ -997,7 +992,7 @@ def test_set_sequence_key_validation_columns(self): # NOTE: used to be ('a', 'b', 'd', 'c') def test_set_sequence_key(self): - """Test that ``set_sequence_key`` sets the ``_metadata['sequence_key']`` value.""" + """Test that ``set_sequence_key`` sets the ``_sequence_key`` value.""" # Setup instance = SingleTableMetadata() instance._columns = {'column'} @@ -1009,7 +1004,7 @@ def test_set_sequence_key(self): assert instance._sequence_key == 'column' def test_set_sequence_key_tuple(self): - """Test that ``set_sequence_key`` sets ``_metadata['sequence_key']`` for tuples.""" + """Test that ``set_sequence_key`` sets ``_sequence_key`` for tuples.""" # Setup instance = SingleTableMetadata() instance._columns = {'column1', 'column2'} @@ -1025,7 +1020,7 @@ def test_set_sequence_key_warning(self, warning_mock): """Test that ``set_sequence_key`` raises a warning when a sequence key already exists. Setup: - - An instance of ``SingleTableMetadata`` with ``_metadata['sequence_key']`` set. + - An instance of ``SingleTableMetadata`` with ``_sequence_key`` set. Input: - String. @@ -1036,7 +1031,7 @@ def test_set_sequence_key_warning(self, warning_mock): # Setup instance = SingleTableMetadata() instance._columns = {'column1'} - instance._metadata['sequence_key'] = 'column0' + instance._sequence_key = 'column0' # Run instance.set_sequence_key('column1') @@ -1089,7 +1084,7 @@ def test_set_alternate_keys_validation_columns(self): # NOTE: used to be ['abc', ('123', '213', '312'), 'bca'] def test_set_alternate_keys(self): - """Test that ``set_alternate_keys`` sets the ``_metadata['alternate_keys']`` value.""" + """Test that ``set_alternate_keys`` sets the ``_alternate_keys`` value.""" # Setup instance = SingleTableMetadata() instance._columns = {'column1', 'column2', 'column3'} @@ -1098,7 +1093,7 @@ def test_set_alternate_keys(self): instance.set_alternate_keys(['column1', ('column2', 'column3')]) # Assert - assert instance._metadata['alternate_keys'] == ['column1', ('column2', 'column3')] + assert instance._alternate_keys == ['column1', ('column2', 'column3')] def test_set_sequence_index_validation(self): """Test that ``set_sequence_index`` crashes for invalid arguments. @@ -1142,7 +1137,7 @@ def test_set_sequence_index_validation_columns(self): instance.set_sequence_index('column') def test_set_sequence_index(self): - """Test that ``set_sequence_index`` sets the ``_metadata['sequence_index']`` value.""" + """Test that ``set_sequence_index`` sets the ``_sequence_index`` value.""" # Setup instance = SingleTableMetadata() instance._columns = {'column'} @@ -1242,8 +1237,8 @@ def test_to_dict(self): result['columns']['my_column'] = 1 assert instance._columns['my_column'] == 'value' - def test__set_metadata_dict(self): - """Test the ``_set_metadata_dict`` to a instance. + def test__set_metadata_attributes(self): + """Test the ``_set_metadata_attributes`` to a instance. Setup: - Instance of ``SingleTableMetadata``. @@ -1260,20 +1255,16 @@ def test__set_metadata_dict(self): } # Run - instance._set_metadata_dict(metadata) + instance._set_metadata_attributes(metadata) # Assert - assert instance._metadata == { - 'columns': {'my_column': 'value'}, - 'primary_key': None, - 'alternate_keys': [], - 'sequence_key': None, - 'sequence_index': None, - 'constraints': [], - 'SCHEMA_VERSION': 'SINGLE_TABLE_V1' - } - assert instance._columns == {'my_column': 'value'} + assert instance._primary_key is None + assert instance._sequence_key is None + assert instance._alternate_keys == [] + assert instance._sequence_index is None + assert instance._constraints == [] + assert instance._version == 'SINGLE_TABLE_V1' def test__load_from_dict(self): """Test that ``_load_from_dict`` returns a instance with the ``dict`` updated objects.""" @@ -1292,9 +1283,13 @@ def test__load_from_dict(self): instance = SingleTableMetadata._load_from_dict(my_metadata) # Assert - assert instance._metadata == my_metadata assert instance._columns == {'my_column': 'value'} + assert instance._primary_key == 'pk' + assert instance._sequence_key is None + assert instance._alternate_keys == [] + assert instance._sequence_index is None assert instance._constraints == [] + assert instance._version == 'SINGLE_TABLE_V1' @patch('sdv.metadata.single_table.Path') def test_load_from_json_path_does_not_exist(self, mock_path): @@ -1408,18 +1403,13 @@ def test_load_from_json(self, mock_json, mock_path, mock_open, mock_constraint): instance = SingleTableMetadata.load_from_json('filepath.json') # Assert - expected_metadata = { - 'columns': {'animals': {'type': 'categorical'}}, - 'primary_key': 'animals', - 'alternate_keys': [], - 'sequence_key': None, - 'sequence_index': None, - 'constraints': [{'my_constraint': 'my_params'}], - 'SCHEMA_VERSION': 'SINGLE_TABLE_V1' - } assert instance._columns == {'animals': {'type': 'categorical'}} + assert instance._primary_key == 'animals' + assert instance._sequence_key is None + assert instance._alternate_keys == [] + assert instance._sequence_index is None assert instance._constraints == [{'my_constraint': 'my_params'}] - assert instance._metadata == expected_metadata + assert instance._version == 'SINGLE_TABLE_V1' mock_constraint.from_dict.assert_called_once() @patch('sdv.metadata.single_table.Path') From 544928f03ad8bbfd800b82814ec170c5d77a73e1 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 13:57:49 -0700 Subject: [PATCH 09/18] Change way _metadata implemented in to_dict --- sdv/metadata/single_table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index c8cce7c7e..635f5bdd6 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -413,7 +413,8 @@ def _metadata_dict(self): def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" metadata = {} - for key, value in self._metadata_dict().items(): + for key in self._KEYS: + value = getattr(self, f'_{key}') if key != 'SCHEMA_VERSION' else self._version if key == 'constraints' and value: constraints = [] for constraint in value: From 73be37e60aac363450bbf4d676d8abc842066fe9 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 15:08:22 -0700 Subject: [PATCH 10/18] Add constraint validation + integration test + change _constraints to be list of tuple --- sdv/constraints/base.py | 3 +- sdv/metadata/single_table.py | 44 ++++++++++--------- .../integration/metadata/test_single_table.py | 6 +++ tests/unit/metadata/test_single_table.py | 16 ++++--- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/sdv/constraints/base.py b/sdv/constraints/base.py index f16facc12..459bad4cc 100644 --- a/sdv/constraints/base.py +++ b/sdv/constraints/base.py @@ -126,7 +126,8 @@ def _validate_inputs(cls, **kwargs): f'Invalid values {invalid_vals} are present in {article} {constraint} constraint.' )) - raise MultipleConstraintsErrors(errors) + if errors: + raise MultipleConstraintsErrors(errors) @classmethod def _validate_metadata_columns(cls, metadata, **kwargs): diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 635f5bdd6..2f90dac60 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -353,14 +353,35 @@ def _validate_sequence_index_not_in_sequence_key(self): ' These columns must be different.' ) + def _validate_constraint(self, constraint_name, **kwargs): + """Validate a constraint against the single table metadata. + + Args: + constraint_name (string): + Name of the constraint class. + **kwargs: + Any other arguments the constraint requires. + """ + try: + constraint_class = Constraint._get_class_from_dict(constraint_name) + except KeyError: + raise InvalidMetadataError(f"Invalid constraint ('{constraint_name}').") + + constraint_class._validate_metadata(self, **kwargs) + def validate(self): """Validate the metadata. Raises: - ``InvalidMetadataError`` if the metadata is invalid. """ - # Validate keys + # Validate constraints errors = [] + for tuple in self._constraints: + constraint_name, kwargs = tuple + self._validate_constraint(constraint_name, **kwargs) + + # Validate keys try: self._validate_key(self._primary_key, 'primary') except ValueError as e: @@ -399,17 +420,6 @@ def validate(self): + '\n'.join([str(e) for e in errors]) ) - def _metadata_dict(self): - return { - 'columns': self._columns, - 'constraints': self._constraints, - 'primary_key': self._primary_key, - 'alternate_keys': self._alternate_keys, - 'sequence_key': self._sequence_key, - 'sequence_index': self._sequence_index, - 'SCHEMA_VERSION': self._version - } - def to_dict(self): """Return a python ``dict`` representation of the ``SingleTableMetadata``.""" metadata = {} @@ -455,14 +465,8 @@ def add_constraint(self, constraint_name, **kwargs): **kwargs: Any other arguments the constraint requires. """ - try: - constraint_class = Constraint._get_class_from_dict(constraint_name) - except KeyError: - raise InvalidMetadataError(f"Invalid constraint ('{constraint_name}').") - - constraint_class._validate_metadata(self, **kwargs) - constraint = constraint_class(**kwargs) - self._constraints.append(constraint) + self._validate_constraint(constraint_name, **kwargs) + self._constraints.append((constraint_name, kwargs)) @classmethod def _load_from_dict(cls, metadata): diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index 291cee176..a4277b0f8 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -44,6 +44,12 @@ def test_validate(): low_column_name='col1', high_column_name='col2' ) + instance.add_constraint( + constraint_name='ScalarInequality', + column_name='col1', + relation='<', + value=10 + ) instance.set_primary_key('col1') instance.set_alternate_keys([('col1', 'col2')]) instance.set_sequence_index('col1') diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 2b6df1126..cb08fbeb1 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -1495,8 +1495,8 @@ def test___repr__(self, mock_json): def test_add_constraint(self, constraint_mock): """Test the ``add_constraint`` method. - The method should create an instance of the specified constraint, run validation on the - constraint against the rest of the metadata and add the instance to the + The method should create an instance of the specified constraint, validate it + against the rest of the metadata and add ``{constraint_name: kwargs}`` to the ``self._constraints`` list. Setup: @@ -1522,17 +1522,19 @@ def test_add_constraint(self, constraint_mock): # Assert constraint_mock._get_class_from_dict.assert_called_once_with('Inequality') - assert metadata._constraints == [dummy_constraint_class.return_value] - dummy_constraint_class.assert_called_once_with( - low_column_name='child_age', - high_column_name='start_date' - ) dummy_constraint_class._validate_metadata.assert_called_once_with( metadata, low_column_name='child_age', high_column_name='start_date' ) + assert metadata._constraints == [( + 'Inequality', { + 'low_column_name': 'child_age', + 'high_column_name': 'start_date' + } + )] + def test_add_constraint_bad_constraint(self): """Test the ``add_constraint`` method with a non-existent constraint. From 82bbff0d6548b4353bb8b8b3ec61c7979fef0e46 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 19:28:38 -0700 Subject: [PATCH 11/18] Fix error message for validate + add error integration test --- sdv/metadata/single_table.py | 7 ++++++- tests/integration/metadata/test_single_table.py | 13 ++++++++++++- tests/unit/constraints/test_tabular.py | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 2f90dac60..9915f5116 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -10,6 +10,7 @@ import pandas as pd from sdv.constraints import Constraint +from sdv.constraints.errors import MultipleConstraintsErrors from sdv.metadata.errors import InvalidMetadataError @@ -379,7 +380,11 @@ def validate(self): errors = [] for tuple in self._constraints: constraint_name, kwargs = tuple - self._validate_constraint(constraint_name, **kwargs) + try: + self._validate_constraint(constraint_name, **kwargs) + except MultipleConstraintsErrors as e: + reformated_errors = '\n'.join(map(str, e.errors)) + errors.append(reformated_errors) # Validate keys try: diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index a4277b0f8..931663d14 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -74,6 +74,10 @@ def test_validate_errors(): 'col9': {'sdtype': 'datetime', 'datetime_format': '%1-%Y-%m-%d-%'}, 'col10': {'sdtype': 'text', 'regex_format': '[A-{6}'}, } + instance._constraints = [ + ('Inequality', {'low_column_name': 'col1', 'wrong_arg': 'col2'}), + ('ScalarInequality', {'column_name': 'col1', 'relation': '<', 'value': 'string'}) + ] instance._primary_key = 10 instance._alternate_keys = 'col1' instance._sequence_key = ('col3', 'col1') @@ -81,7 +85,14 @@ def test_validate_errors(): err_msg = re.escape( 'The following errors were found in the metadata:' - "\n\n'primary_key' must be a string or tuple of strings." + "\n\nMissing required values {'high_column_name'} in an Inequality constraint." + "\nInvalid values {'wrong_arg'} are present in an Inequality constraint." + "\nA Inequality constraint is being applied to invalid column names {None}." + ' The columns must exist in the table.' + "\nAn Inequality constraint is being applied to mismatched sdtypes [None, 'col1']." + ' Both columns must be either numerical or datetime.' + "\n'value' must be an int or float" + "\n'primary_key' must be a string or tuple of strings." "\nUnknown sequence key values {'col3'}. Keys should be columns that exist in the table." "\n'alternate_keys' must be a list of strings or a list of tuples of strings." "\nUnknown sequence key value {'col3'}. Keys should be columns that exist in the table." diff --git a/tests/unit/constraints/test_tabular.py b/tests/unit/constraints/test_tabular.py index a26e5734a..4cff6f574 100644 --- a/tests/unit/constraints/test_tabular.py +++ b/tests/unit/constraints/test_tabular.py @@ -1616,7 +1616,7 @@ def test__validate_metadata_specific_to_constraint_numerical_error(self): metadata._columns = {'a': {'sdtype': 'numerical'}} # Run - error_message = "'value' must be an int or float" + error_message = "'value' must be an int or float." with pytest.raises(ConstraintMetadataError, match=error_message): ScalarInequality._validate_metadata_specific_to_constraint( metadata, From 562ffd31e22dbd80d659a3f2144c3e1970a46b31 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Tue, 2 Aug 2022 20:37:02 -0700 Subject: [PATCH 12/18] Fix all constraint related error messages --- sdv/constraints/base.py | 10 ++-- sdv/constraints/tabular.py | 46 +++++++++---------- sdv/metadata/single_table.py | 4 +- .../integration/metadata/test_single_table.py | 8 ++-- tests/unit/constraints/test_tabular.py | 10 ++-- tests/unit/metadata/test_single_table.py | 2 +- 6 files changed, 39 insertions(+), 41 deletions(-) diff --git a/sdv/constraints/base.py b/sdv/constraints/base.py index 459bad4cc..602a76f00 100644 --- a/sdv/constraints/base.py +++ b/sdv/constraints/base.py @@ -136,16 +136,16 @@ def _validate_metadata_columns(cls, metadata, **kwargs): else: column_names = kwargs.get('column_names') - missing_columns = set(column_names) - set(metadata._columns) - + missing_columns = set(column_names) - set(metadata._columns) - {None} if missing_columns: + article = 'An' if cls.__name__ == 'Inequality' else 'A' raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to invalid column names ' + f'{article} {cls.__name__} constraint is being applied to invalid column names ' f'{missing_columns}. The columns must exist in the table.' ) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): pass @classmethod diff --git a/sdv/constraints/tabular.py b/sdv/constraints/tabular.py index b34ef3dfd..7379d9426 100644 --- a/sdv/constraints/tabular.py +++ b/sdv/constraints/tabular.py @@ -345,8 +345,8 @@ def _validate_metadata_columns(cls, metadata, **kwargs): kwargs['column_names'] = [kwargs.get('high_column_name'), kwargs.get('low_column_name')] super()._validate_metadata_columns(metadata, **kwargs) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): high = kwargs.get('high_column_name') low = kwargs.get('low_column_name') high_sdtype = metadata._columns.get(high, {}).get('sdtype') @@ -355,8 +355,8 @@ def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): both_numerical = high_sdtype == low_sdtype == 'numerical' if not (both_datetime or both_numerical): raise ConstraintMetadataError( - f'An {cls.__name__} constraint is being applied to mismatched sdtypes ' - f'{[high, low]}. Both columns must be either numerical or datetime.' + f'An Inequality constraint is being applied to mismatched sdtype columns' + f' {[high, low]}. Both columns must be either numerical or datetime.' ) def __init__(self, low_column_name, high_column_name, strict_boundaries=False): @@ -504,14 +504,14 @@ def _validate_inputs(cls, **kwargs): if errors: raise MultipleConstraintsErrors(errors) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): column_name = kwargs.get('column_name') sdtype = metadata._columns.get(column_name, {}).get('sdtype') val = kwargs.get('value') if sdtype == 'numerical': if not isinstance(val, (int, float)): - raise ConstraintMetadataError("'value' must be an int or float") + raise ConstraintMetadataError("'value' must be an int or float.") elif sdtype == 'datetime': datetime_format = metadata._columns.get(column_name).get('datetime_format') @@ -523,7 +523,7 @@ def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): else: raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to mismatched sdtypes. ' + f'A ScalarInequality constraint is being applied to mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) @@ -672,13 +672,13 @@ class Positive(ScalarInequality): zero ``>`` or include it ``>=``. """ - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): column_name = kwargs.get('column_name') sdtype = metadata._columns.get(column_name, {}).get('sdtype') if sdtype != 'numerical': raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to an invalid column ' + f'A Positive constraint is being applied to an invalid column ' f"'{column_name}'. This constraint is only defined for numerical columns." ) @@ -700,13 +700,13 @@ class Negative(ScalarInequality): zero ``<`` or include it ``<=``. """ - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): column_name = kwargs.get('column_name') sdtype = metadata._columns.get(column_name, {}).get('sdtype') if sdtype != 'numerical': raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to an invalid column ' + f'A Negative constraint is being applied to an invalid column ' f"'{column_name}'. This constraint is only defined for numerical columns." ) @@ -742,8 +742,8 @@ def _validate_metadata_columns(cls, metadata, **kwargs): kwargs['column_names'] = [high, low, middle] super()._validate_metadata_columns(metadata, **kwargs) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): high = kwargs.get('high_column_name') low = kwargs.get('low_column_name') middle = kwargs.get('middle_column_name') @@ -754,7 +754,7 @@ def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): all_numerical = high_sdtype == low_sdtype == middle_sdtype == 'numerical' if not (all_datetime or all_numerical): raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to mismatched sdtypes ' + f'A Range constraint is being applied to mismatched sdtype columns ' f'{[high, middle, low]}. All columns must be either numerical or datetime.' ) @@ -926,12 +926,12 @@ def _validate_init_inputs(low_value, high_value): 'represents a datetime.' ) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): column_name = kwargs.get('column_name') if column_name not in metadata._columns: raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to invalid column names ' + f'A ScalarRange constraint is being applied to invalid column names ' f'({column_name}). The columns must exist in the table.' ) sdtype = metadata._columns.get(column_name).get('sdtype') @@ -955,7 +955,7 @@ def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): else: raise ConstraintMetadataError( - f'A {cls.__name__} constraint is being applied to mismatched sdtypes. ' + f'A ScalarRange constraint is being applied to mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) @@ -1263,8 +1263,8 @@ def __init__(self, column_names): self.column_names = column_names self.constraint_columns = tuple(self.column_names) - @classmethod - def _validate_metadata_specific_to_constraint(cls, metadata, **kwargs): + @staticmethod + def _validate_metadata_specific_to_constraint(metadata, **kwargs): column_names = kwargs.get('column_names') keys = set() if isinstance(metadata._primary_key, tuple): diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 9915f5116..7e43d00c3 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -10,8 +10,8 @@ import pandas as pd from sdv.constraints import Constraint -from sdv.constraints.errors import MultipleConstraintsErrors from sdv.metadata.errors import InvalidMetadataError +from sdv.constraints.errors import MultipleConstraintsErrors class SingleTableMetadata: @@ -329,7 +329,7 @@ def _validate_sequence_index(self, column_name): if column_name not in self._columns: column_name = {column_name} raise ValueError( - f'Unknown sequence key value {column_name}.' + f'Unknown sequence index value {column_name}.' ' Keys should be columns that exist in the table.' ) diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index 931663d14..4eeac88bd 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -87,15 +87,13 @@ def test_validate_errors(): 'The following errors were found in the metadata:' "\n\nMissing required values {'high_column_name'} in an Inequality constraint." "\nInvalid values {'wrong_arg'} are present in an Inequality constraint." - "\nA Inequality constraint is being applied to invalid column names {None}." - ' The columns must exist in the table.' - "\nAn Inequality constraint is being applied to mismatched sdtypes [None, 'col1']." + "\nAn Inequality constraint is being applied to mismatched sdtype columns [None, 'col1']." ' Both columns must be either numerical or datetime.' - "\n'value' must be an int or float" + "\n'value' must be an int or float." "\n'primary_key' must be a string or tuple of strings." "\nUnknown sequence key values {'col3'}. Keys should be columns that exist in the table." "\n'alternate_keys' must be a list of strings or a list of tuples of strings." - "\nUnknown sequence key value {'col3'}. Keys should be columns that exist in the table." + "\nUnknown sequence index value {'col3'}. Keys should be columns that exist in the table." "\n'sequence_index' and 'sequence_key' have the same value {'col3'}." ' These columns must be different.' "\nInvalid values '(invalid1)' for categorical column 'col4'." diff --git a/tests/unit/constraints/test_tabular.py b/tests/unit/constraints/test_tabular.py index 4cff6f574..ed00a215f 100644 --- a/tests/unit/constraints/test_tabular.py +++ b/tests/unit/constraints/test_tabular.py @@ -948,7 +948,7 @@ def test__validate_metadata_columns_raises_error(self): # Run error_message = re.escape( - 'A Inequality constraint is being applied to invalid column names ' + 'An Inequality constraint is being applied to invalid column names ' "{'c'}. The columns must exist in the table." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -990,7 +990,7 @@ def test__validate_metadata_specific_to_constraint_datetime_error(self): # Run error_message = re.escape( - 'An Inequality constraint is being applied to mismatched sdtypes ' + 'An Inequality constraint is being applied to mismatched sdtype columns ' "['a', 'b']. Both columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -1035,7 +1035,7 @@ def test__validate_metadata_specific_to_constraint_numerical_error(self): # Run error_message = re.escape( - 'An Inequality constraint is being applied to mismatched sdtypes ' + 'An Inequality constraint is being applied to mismatched sdtype columns ' "['a', 'b']. Both columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -2581,7 +2581,7 @@ def test__validate_metadata_specific_to_constraint_datetime_error(self): # Run error_message = re.escape( - 'A Range constraint is being applied to mismatched sdtypes ' + 'A Range constraint is being applied to mismatched sdtype columns ' "['a', 'c', 'b']. All columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -2637,7 +2637,7 @@ def test__validate_metadata_specific_to_constraint_numerical_error(self): # Run error_message = re.escape( - 'A Range constraint is being applied to mismatched sdtypes ' + 'A Range constraint is being applied to mismatched sdtype columns ' "['a', 'c', 'b']. All columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index cb08fbeb1..f4adc9d3f 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -1129,7 +1129,7 @@ def test_set_sequence_index_validation_columns(self): instance._columns = {'a', 'd'} err_msg = ( - "Unknown sequence key value {'column'}." + "Unknown sequence index value {'column'}." ' Keys should be columns that exist in the table.' ) # Run / Assert From 4cda5b45d27bfe188946fcaec65a713433fb6c0a Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 08:49:15 -0700 Subject: [PATCH 13/18] Add helper _append_error to validate + fix constraint related errors --- sdv/constraints/tabular.py | 11 ++--- sdv/metadata/single_table.py | 43 ++++++------------- .../integration/metadata/test_single_table.py | 2 - 3 files changed, 20 insertions(+), 36 deletions(-) diff --git a/sdv/constraints/tabular.py b/sdv/constraints/tabular.py index 7379d9426..b713c49b4 100644 --- a/sdv/constraints/tabular.py +++ b/sdv/constraints/tabular.py @@ -353,7 +353,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): low_sdtype = metadata._columns.get(low, {}).get('sdtype') both_datetime = high_sdtype == low_sdtype == 'datetime' both_numerical = high_sdtype == low_sdtype == 'numerical' - if not (both_datetime or both_numerical): + if not (both_datetime or both_numerical) and not (high is None or low is None): raise ConstraintMetadataError( f'An Inequality constraint is being applied to mismatched sdtype columns' f' {[high, low]}. Both columns must be either numerical or datetime.' @@ -518,12 +518,12 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): matches_format = matches_datetime_format(val, datetime_format) if not matches_format: raise ConstraintMetadataError( - "'value' must be a datetime string of the right format" + "'value' must be a datetime string of the right format." ) else: raise ConstraintMetadataError( - f'A ScalarInequality constraint is being applied to mismatched sdtypes. ' + 'A ScalarInequality constraint is being applied to mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) @@ -752,7 +752,8 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): middle_sdtype = metadata._columns.get(middle, {}).get('sdtype') all_datetime = high_sdtype == low_sdtype == middle_sdtype == 'datetime' all_numerical = high_sdtype == low_sdtype == middle_sdtype == 'numerical' - if not (all_datetime or all_numerical): + if not (all_datetime or all_numerical) and \ + not (high is None or low is None or middle is None): raise ConstraintMetadataError( f'A Range constraint is being applied to mismatched sdtype columns ' f'{[high, middle, low]}. All columns must be either numerical or datetime.' @@ -955,7 +956,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): else: raise ConstraintMetadataError( - f'A ScalarRange constraint is being applied to mismatched sdtypes. ' + 'A ScalarRange constraint is being applied to mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 7e43d00c3..5db66cfdf 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -10,8 +10,8 @@ import pandas as pd from sdv.constraints import Constraint -from sdv.metadata.errors import InvalidMetadataError from sdv.constraints.errors import MultipleConstraintsErrors +from sdv.metadata.errors import InvalidMetadataError class SingleTableMetadata: @@ -370,6 +370,13 @@ def _validate_constraint(self, constraint_name, **kwargs): constraint_class._validate_metadata(self, **kwargs) + def _append_error(self, errors, method, *args, **kwargs): + """Inplace, append the produced error to the passed ``errors`` list.""" + try: + method(*args, **kwargs) + except ValueError as e: + errors.append(e) + def validate(self): """Validate the metadata. @@ -387,37 +394,15 @@ def validate(self): errors.append(reformated_errors) # Validate keys - try: - self._validate_key(self._primary_key, 'primary') - except ValueError as e: - errors.append(e) - - try: - self._validate_key(self._sequence_key, 'sequence') - except ValueError as e: - errors.append(e) - - try: - self._validate_alternate_keys(self._alternate_keys) - except ValueError as e: - errors.append(e) - - try: - self._validate_sequence_index(self._sequence_index) - except ValueError as e: - errors.append(e) - - try: - self._validate_sequence_index_not_in_sequence_key() - except ValueError as e: - errors.append(e) + self._append_error(errors, self._validate_key, self._primary_key, 'primary') + self._append_error(errors, self._validate_key, self._sequence_key, 'sequence') + self._append_error(errors, self._validate_alternate_keys, self._alternate_keys) + self._append_error(errors, self._validate_sequence_index, self._sequence_index) + self._append_error(errors, self._validate_sequence_index_not_in_sequence_key) # Validate columns for column, kwargs in self._columns.items(): - try: - self._validate_column(column, **kwargs) - except ValueError as e: - errors.append(e) + self._append_error(errors, self._validate_column, column, **kwargs) if errors: raise InvalidMetadataError( diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index 4eeac88bd..8f615865b 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -87,8 +87,6 @@ def test_validate_errors(): 'The following errors were found in the metadata:' "\n\nMissing required values {'high_column_name'} in an Inequality constraint." "\nInvalid values {'wrong_arg'} are present in an Inequality constraint." - "\nAn Inequality constraint is being applied to mismatched sdtype columns [None, 'col1']." - ' Both columns must be either numerical or datetime.' "\n'value' must be an int or float." "\n'primary_key' must be a string or tuple of strings." "\nUnknown sequence key values {'col3'}. Keys should be columns that exist in the table." From 4a58faaf4f29617f7cc7998f6b66676549a369f6 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 09:12:30 -0700 Subject: [PATCH 14/18] Add constraint unit test --- tests/unit/metadata/test_single_table.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index f4adc9d3f..e956eae85 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -1178,11 +1178,15 @@ def test_validate(self): # Setup instance = SingleTableMetadata() instance._columns = {'col1': {'sdtype': 'numerical'}, 'col2': {'sdtype': 'numerical'}} - instance._constraints = [] + instance._constraints = [ + ('Inequality', {'low_column_name': 'col1', 'high_column_name': 'col2'}), + ('ScalarInequality', {'column_name': 'col1', 'relation': '<', 'value': 10}) + ] instance._primary_key = 'col1' instance._alternate_keys = ['col2'] instance._sequence_key = 'col1' instance._sequence_index = 'col2' + instance._validate_constraint = Mock() instance._validate_key = Mock() instance._validate_alternate_keys = Mock() instance._validate_sequence_index = Mock() @@ -1192,6 +1196,10 @@ def test_validate(self): instance.validate() # Assert + instance._validate_constraint.assert_has_calls([ + call('Inequality', low_column_name='col1', high_column_name='col2'), + call('ScalarInequality', column_name='col1', relation='<', value=10) + ]) instance._validate_key.assert_has_calls( [call(instance._primary_key, 'primary'), call(instance._sequence_key, 'sequence')] ) From c56355654048f9d032204518bec206a4eec26073 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 09:36:29 -0700 Subject: [PATCH 15/18] Fix lint --- tests/integration/metadata/test_single_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/metadata/test_single_table.py b/tests/integration/metadata/test_single_table.py index 8f615865b..e456e2886 100644 --- a/tests/integration/metadata/test_single_table.py +++ b/tests/integration/metadata/test_single_table.py @@ -60,7 +60,7 @@ def test_validate(): def test_validate_errors(): - """Test Test ``SingleTableMetadata.validate`` raises the correct errors.""" + """Test ``SingleTableMetadata.validate`` raises the correct errors.""" # Setup instance = SingleTableMetadata() instance._columns = { From adfd39168e1df49ff76e4a982fbef860b7ec50ce Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 10:14:57 -0700 Subject: [PATCH 16/18] change _metadata[pk] to pk --- sdv/metadata/multi_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdv/metadata/multi_table.py b/sdv/metadata/multi_table.py index 5c66b1465..b0cd157a0 100644 --- a/sdv/metadata/multi_table.py +++ b/sdv/metadata/multi_table.py @@ -62,7 +62,7 @@ def visualize(self, show_table_details=True, show_relationship_labels=True, columns = [f"{name} : {meta.get('sdtype')}" for name, meta in column_dict] nodes[table_name] = { 'columns': r'\l'.join(columns), - 'primary_key': f"Primary key: {table_meta._metadata['primary_key']}" + 'primary_key': f'Primary key: {table_meta._primary_key}' } else: @@ -72,7 +72,7 @@ def visualize(self, show_table_details=True, show_relationship_labels=True, parent = relationship.get('parent_table_name') child = relationship.get('child_table_name') foreign_key = relationship.get('child_foreign_key') - primary_key = self._tables.get(parent)._metadata.get('primary_key') + primary_key = self._tables.get(parent)._primary_key edge_label = f' {foreign_key} → {primary_key}' if show_relationship_labels else '' edges.append((parent, child, edge_label)) From df76a4b700feed73373c9f7e91f86474ea28e1b1 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 11:50:41 -0700 Subject: [PATCH 17/18] Add tests requested + addresses feedback --- sdv/constraints/tabular.py | 9 +++++---- tests/unit/constraints/test_base.py | 15 +++++++++++++++ tests/unit/constraints/test_tabular.py | 12 ++++++------ tests/unit/metadata/test_single_table.py | 19 ++++++++++++++++--- 4 files changed, 42 insertions(+), 13 deletions(-) diff --git a/sdv/constraints/tabular.py b/sdv/constraints/tabular.py index b713c49b4..51fd04f75 100644 --- a/sdv/constraints/tabular.py +++ b/sdv/constraints/tabular.py @@ -355,7 +355,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): both_numerical = high_sdtype == low_sdtype == 'numerical' if not (both_datetime or both_numerical) and not (high is None or low is None): raise ConstraintMetadataError( - f'An Inequality constraint is being applied to mismatched sdtype columns' + 'An Inequality constraint is being applied to columns with mismatched sdtypes' f' {[high, low]}. Both columns must be either numerical or datetime.' ) @@ -523,7 +523,8 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): else: raise ConstraintMetadataError( - 'A ScalarInequality constraint is being applied to mismatched sdtypes. ' + 'A ScalarInequality constraint is being applied ' + 'to columns with mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) @@ -755,7 +756,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): if not (all_datetime or all_numerical) and \ not (high is None or low is None or middle is None): raise ConstraintMetadataError( - f'A Range constraint is being applied to mismatched sdtype columns ' + 'A Range constraint is being applied to columns with mismatched sdtypes ' f'{[high, middle, low]}. All columns must be either numerical or datetime.' ) @@ -956,7 +957,7 @@ def _validate_metadata_specific_to_constraint(metadata, **kwargs): else: raise ConstraintMetadataError( - 'A ScalarRange constraint is being applied to mismatched sdtypes. ' + 'A ScalarRange constraint is being applied to columns with mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) diff --git a/tests/unit/constraints/test_base.py b/tests/unit/constraints/test_base.py index dbb1849af..5f8718604 100644 --- a/tests/unit/constraints/test_base.py +++ b/tests/unit/constraints/test_base.py @@ -159,6 +159,21 @@ def test_import_object_function(): class TestConstraint(): + def test__validate_inputs(self): + """Test the ``_validate_inputs`` method. + + The method should only raise errors if the input paramaters are invalid. + + Raise: + - ``MultipleConstraintsErrors`` if errors were found. + """ + # Run + Constraint._validate_inputs(args='value', kwargs='value') + + # Run / Assert + with pytest.raises(MultipleConstraintsErrors): + Constraint._validate_inputs(wrong_args='value') + @patch('sdv.constraints.base.Constraint._validate_inputs') @patch('sdv.constraints.base.Constraint._validate_metadata_columns') @patch('sdv.constraints.base.Constraint._validate_metadata_specific_to_constraint') diff --git a/tests/unit/constraints/test_tabular.py b/tests/unit/constraints/test_tabular.py index ed00a215f..91bb57e90 100644 --- a/tests/unit/constraints/test_tabular.py +++ b/tests/unit/constraints/test_tabular.py @@ -990,7 +990,7 @@ def test__validate_metadata_specific_to_constraint_datetime_error(self): # Run error_message = re.escape( - 'An Inequality constraint is being applied to mismatched sdtype columns ' + 'An Inequality constraint is being applied to columns with mismatched sdtypes ' "['a', 'b']. Both columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -1035,7 +1035,7 @@ def test__validate_metadata_specific_to_constraint_numerical_error(self): # Run error_message = re.escape( - 'An Inequality constraint is being applied to mismatched sdtype columns ' + 'An Inequality constraint is being applied to columns with mismatched sdtypes ' "['a', 'b']. Both columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -1705,7 +1705,7 @@ def test__validate_metadata_specific_to_constraint_bad_type(self): # Run error_message = ( - 'A ScalarInequality constraint is being applied to mismatched sdtypes. ' + 'A ScalarInequality constraint is being applied to columns with mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) @@ -2581,7 +2581,7 @@ def test__validate_metadata_specific_to_constraint_datetime_error(self): # Run error_message = re.escape( - 'A Range constraint is being applied to mismatched sdtype columns ' + 'A Range constraint is being applied to columns with mismatched sdtypes ' "['a', 'c', 'b']. All columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -2637,7 +2637,7 @@ def test__validate_metadata_specific_to_constraint_numerical_error(self): # Run error_message = re.escape( - 'A Range constraint is being applied to mismatched sdtype columns ' + 'A Range constraint is being applied to columns with mismatched sdtypes ' "['a', 'c', 'b']. All columns must be either numerical or datetime." ) with pytest.raises(ConstraintMetadataError, match=error_message): @@ -3326,7 +3326,7 @@ def test__validate_metadata_specific_to_constraint_bad_type(self): # Run error_message = ( - 'A ScalarRange constraint is being applied to mismatched sdtypes. ' + 'A ScalarRange constraint is being applied to columns with mismatched sdtypes. ' 'Numerical columns must be compared to integer or float values. ' 'Datetimes column must be compared to datetime strings.' ) diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index e956eae85..af691f468 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -10,6 +10,7 @@ import pandas as pd import pytest +from sdv.constraints.errors import MultipleConstraintsErrors from sdv.metadata.errors import InvalidMetadataError from sdv.metadata.single_table import SingleTableMetadata @@ -1186,14 +1187,23 @@ def test_validate(self): instance._alternate_keys = ['col2'] instance._sequence_key = 'col1' instance._sequence_index = 'col2' - instance._validate_constraint = Mock() + instance._validate_constraint = Mock(side_effect=MultipleConstraintsErrors(['cnt_error'])) instance._validate_key = Mock() instance._validate_alternate_keys = Mock() instance._validate_sequence_index = Mock() instance._validate_sequence_index_not_in_sequence_key = Mock() - + instance._validate_column = Mock(side_effect=ValueError('column_error')) + + err_msg = re.escape( + 'The following errors were found in the metadata:' + '\n\ncnt_error' + '\ncnt_error' + '\ncolumn_error' + '\ncolumn_error' + ) # Run - instance.validate() + with pytest.raises(InvalidMetadataError, match=err_msg): + instance.validate() # Assert instance._validate_constraint.assert_has_calls([ @@ -1203,6 +1213,9 @@ def test_validate(self): instance._validate_key.assert_has_calls( [call(instance._primary_key, 'primary'), call(instance._sequence_key, 'sequence')] ) + instance._validate_column.assert_has_calls( + [call('col1', sdtype='numerical'), call('col2', sdtype='numerical')] + ) instance._validate_alternate_keys.assert_called_once_with(instance._alternate_keys) instance._validate_sequence_index.assert_called_once_with(instance._sequence_index) instance._validate_sequence_index_not_in_sequence_key.assert_called_once() From af34779180105400c4984c0690f4f3c83f7dbdb9 Mon Sep 17 00:00:00 2001 From: Felipe Hofmann Date: Wed, 3 Aug 2022 13:29:12 -0700 Subject: [PATCH 18/18] Address minor feedback --- sdv/metadata/single_table.py | 6 +++--- tests/unit/constraints/test_base.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 5db66cfdf..b8c8bad38 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -345,9 +345,9 @@ def set_sequence_index(self, column_name): def _validate_sequence_index_not_in_sequence_key(self): """Check that ``_sequence_index`` and ``_sequence_key`` don't overlap.""" - sk = self._sequence_key - sequence_key = {sk} if isinstance(sk, str) else set(sk) - if self._sequence_index in sequence_key or sk is None: + seq_key = self._sequence_key + sequence_key = {seq_key} if isinstance(seq_key, str) else set(seq_key) + if self._sequence_index in sequence_key or seq_key is None: index = {self._sequence_index} raise ValueError( f"'sequence_index' and 'sequence_key' have the same value {index}." diff --git a/tests/unit/constraints/test_base.py b/tests/unit/constraints/test_base.py index 5f8718604..7fb2dd242 100644 --- a/tests/unit/constraints/test_base.py +++ b/tests/unit/constraints/test_base.py @@ -171,8 +171,9 @@ def test__validate_inputs(self): Constraint._validate_inputs(args='value', kwargs='value') # Run / Assert - with pytest.raises(MultipleConstraintsErrors): - Constraint._validate_inputs(wrong_args='value') + err_msg = "Invalid values {'wrong_args'} are present in a Constraint constraint." + with pytest.raises(MultipleConstraintsErrors, match=err_msg): + Constraint._validate_inputs(args='value', kwargs='value', wrong_args='value') @patch('sdv.constraints.base.Constraint._validate_inputs') @patch('sdv.constraints.base.Constraint._validate_metadata_columns')