Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metadata validation #527

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion sdmetrics/reports/base_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,24 @@ def _validate_data_format(self, real_data, synthetic_data):
return

error_message = (
f'Single table report {self.__class__.__name__} expects real and synthetic data to be'
f'Single table {self.__class__.__name__} expects real and synthetic data to be'
' pandas.DataFrame. If your real and synthetic data are dictionaries of tables, '
f'please use the multi-table {self.__class__.__name__} instead.'

)
raise ValueError(error_message)

def _validate_metadata_format(self, metadata):
    """Validate that the metadata has the structure a single table report expects.

    Args:
        metadata (dict):
            The metadata of the table. It must contain a ``'columns'`` key holding a
            dictionary that maps column names to the information of each column.

    Raises:
        TypeError:
            If ``metadata`` is not a dictionary, or if ``metadata['columns']`` is not a
            dictionary.
        ValueError:
            If ``metadata`` does not contain a ``'columns'`` key.
    """
    if not isinstance(metadata, dict):
        raise TypeError('The provided metadata is not a dictionary.')

    if 'columns' not in metadata:
        raise ValueError(
            'Single table reports expect metadata to contain a "columns" key with a mapping'
            ' from column names to column informations.'
        )

    # The message above promises a mapping, so reject anything else here with an explicit
    # error instead of letting a malformed value travel further into the report.
    if not isinstance(metadata['columns'], dict):
        raise TypeError(
            'The "columns" entry of the metadata must be a dictionary mapping column names'
            ' to column information.'
        )

def _validate(self, real_data, synthetic_data, metadata):
"""Validate the inputs.

Expand All @@ -80,6 +91,7 @@ def _validate(self, real_data, synthetic_data, metadata):
The metadata of the table.
"""
self._validate_data_format(real_data, synthetic_data)
self._validate_metadata_format(metadata)
self._validate_metadata_matches_data(real_data, synthetic_data, metadata)

@staticmethod
Expand Down
20 changes: 18 additions & 2 deletions sdmetrics/reports/multi_table/base_multi_table_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,29 @@ def _validate_data_format(self, real_data, synthetic_data):
return

error_message = (
f'Multi table report {self.__class__.__name__} expects real and synthetic data to be'
f'Multi table {self.__class__.__name__} expects real and synthetic data to be'
' dictionaries of pandas.DataFrame. If your real and synthetic data are pd.DataFrame,'
f' please use the single-table {self.__class__.__name__} instead.'
)

raise ValueError(error_message)

def _validate_metadata_format(self, metadata):
    """Validate that the metadata has the structure a multi table report expects.

    Args:
        metadata (dict):
            The metadata of the tables. It must contain a ``'tables'`` key holding a
            dictionary that maps each table name to the metadata of that table, and the
            metadata of every table must be a dictionary with a ``'columns'`` key.

    Raises:
        TypeError:
            If ``metadata``, ``metadata['tables']`` or the metadata of any table is not
            a dictionary.
        ValueError:
            If ``metadata`` does not contain a ``'tables'`` key, or if the metadata of
            any table does not contain a ``'columns'`` key.
    """
    if not isinstance(metadata, dict):
        raise TypeError('The provided metadata is not a dictionary.')

    if 'tables' not in metadata:
        raise ValueError(
            'Multi table reports expect metadata to contain a "tables" key with a mapping'
            ' from table names to metadata for each table.'
        )

    tables = metadata['tables']
    # Without this check a non-dict value (list, None, ...) would fail on ``.items()``
    # with an ``AttributeError`` that says nothing about the metadata being malformed.
    if not isinstance(tables, dict):
        raise TypeError(
            'The "tables" entry of the metadata must be a dictionary mapping table names'
            ' to the metadata of each table.'
        )

    for table_name, table_metadata in tables.items():
        # ``'columns' not in table_metadata`` raises a cryptic ``TypeError`` for values
        # such as ``None`` and performs a substring search on strings, so check the
        # type explicitly before looking for the key.
        if not isinstance(table_metadata, dict):
            raise TypeError(f'The metadata for table "{table_name}" is not a dictionary.')

        if 'columns' not in table_metadata:
            raise ValueError(
                f'The metadata for table "{table_name}" is missing a "columns" key.'
            )
Comment on lines +47 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@npatki should we error in the case where the metadata is malformed? More specifically, if a user gives metadata with an empty table or empty column, should we error?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amontanez24 yeah that's probably a good idea. Though I'm neutral on whether or not there's an explicit error message for this case vs. the report just naturally crashing because it cannot access those key names.


def _validate_relationships(self, real_data, synthetic_data, metadata):
"""Validate that the relationships are valid."""
for rel in metadata.get('relationships', []):
Expand Down Expand Up @@ -83,7 +99,7 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True):
verbose (bool):
Whether or not to print report summary and progress.
"""
self.table_names = list(metadata['tables'].keys())
self.table_names = list(metadata.get('tables', {}).keys())
return super().generate(real_data, synthetic_data, metadata, verbose)

def _check_table_names(self, table_name):
Expand Down
53 changes: 52 additions & 1 deletion tests/unit/reports/multi_table/test_base_multi_table_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,64 @@ def test__validate_data_format(self):

# Run and Assert
expected_message = (
'Multi table report BaseMultiTableReport expects real and synthetic data to be '
'Multi table BaseMultiTableReport expects real and synthetic data to be '
'dictionaries of pandas.DataFrame. If your real and synthetic data are '
'pd.DataFrame, please use the single-table BaseMultiTableReport instead.'
)
with pytest.raises(ValueError, match=expected_message):
base_report._validate_data_format(real_data, synthetic_data)

def test__validate_metadata_format(self):
    """Test that ``_validate_metadata_format`` rejects non-dictionary metadata.

    A list is passed as the metadata, so the method is expected to raise a ``TypeError``.
    """
    # Setup
    not_a_dictionary = []
    report = BaseMultiTableReport()

    # Run and Assert
    with pytest.raises(TypeError, match='The provided metadata is not a dictionary.'):
        report._validate_metadata_format(not_a_dictionary)

def test__validate_metadata_format_with_no_tables(self):
    """Test that ``_validate_metadata_format`` rejects metadata without a 'tables' key.

    An empty dictionary is passed as the metadata, so the method is expected to raise a
    ``ValueError``.
    """
    # Setup
    empty_metadata = {}
    report = BaseMultiTableReport()
    error_pattern = (
        'Multi table reports expect metadata to contain a "tables" key with a mapping from '
        'table names to metadata for each table.'
    )

    # Run and Assert
    with pytest.raises(ValueError, match=error_pattern):
        report._validate_metadata_format(empty_metadata)

def test__validate_metadata_format_with_no_columns(self):
    """Test that ``_validate_metadata_format`` rejects a table without a 'columns' key.

    The metadata declares a single table whose own metadata is empty, so the method is
    expected to raise a ``ValueError`` naming that table.
    """
    # Setup
    tables_without_columns = {'tables': {'Table_1': {}}}
    report = BaseMultiTableReport()

    # Run and Assert
    with pytest.raises(
        ValueError,
        match='The metadata for table "Table_1" is missing a "columns" key.',
    ):
        report._validate_metadata_format(tables_without_columns)

def test__validate_relationships(self):
"""Test the ``_validate_relationships`` method."""
# Setup
Expand Down
36 changes: 35 additions & 1 deletion tests/unit/reports/test_base_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,47 @@ def test__validate_data_format(self):

# Run and Assert
expected_message = (
'Single table report BaseReport expects real and synthetic data to be '
'Single table BaseReport expects real and synthetic data to be '
'pandas.DataFrame. If your real and synthetic data are dictionaries of '
'tables, please use the multi-table BaseReport instead.'
)
with pytest.raises(ValueError, match=expected_message):
base_report._validate_data_format(real_data, synthetic_data)

def test__validate_metadata_format(self):
    """Test that ``_validate_metadata_format`` rejects non-dictionary metadata.

    A string is passed as the metadata, so the method is expected to raise a ``TypeError``.
    """
    # Setup
    not_a_dictionary = 'metadata'
    report = BaseReport()

    # Run and Assert
    with pytest.raises(TypeError, match='The provided metadata is not a dictionary.'):
        report._validate_metadata_format(not_a_dictionary)

def test__validate_metadata_format_no_columns(self):
    """Test that ``_validate_metadata_format`` rejects metadata without a 'columns' key.

    An empty dictionary is passed as the metadata, so the method is expected to raise a
    ``ValueError``.
    """
    # Setup
    empty_metadata = {}
    report = BaseReport()
    error_pattern = (
        'Single table reports expect metadata to contain a "columns" key with a mapping'
        ' from column names to column informations.'
    )

    # Run and Assert
    with pytest.raises(ValueError, match=error_pattern):
        report._validate_metadata_format(empty_metadata)

def test__validate_metadata_matches_data(self):
"""Test the ``_validate_metadata_matches_data`` method.

Expand Down
Loading