From 7c9ee647162278f18074ac94e247ebfd7c40c9bd Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Mar 2024 11:16:36 +0000 Subject: [PATCH 1/6] def --- sdv/utils/poc.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/sdv/utils/poc.py b/sdv/utils/poc.py index 19a2cdff9..ac83b786a 100644 --- a/sdv/utils/poc.py +++ b/sdv/utils/poc.py @@ -1,10 +1,14 @@ """Utility functions.""" +import sys + +import pandas as pd + from sdv._utils import ( _get_relationship_for_child, _get_rows_to_drop, _validate_foreign_keys_not_null) from sdv.errors import InvalidDataError, SynthesizerInputError -def drop_unknown_references(metadata, data, drop_missing_values=True): +def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=True): """Drop rows with unknown foreign keys. Args: @@ -17,17 +21,25 @@ def drop_unknown_references(metadata, data, drop_missing_values=True): Boolean describing whether or not to also drop foreign keys with missing values If True, drop rows with missing values in the foreign keys. Defaults to True. + verbose (bool): + If True, print information about the rows that are dropped. + Defaults to True. Returns: dict: Dictionary with the dataframes ensuring referential integrity. """ + success_message = 'Success! All foreign keys have referential integrity.' metadata.validate() try: metadata.validate_data(data) if drop_missing_values: _validate_foreign_keys_not_null(metadata, data) + if verbose: + message = [success_message, 'No rows were dropped.'] + sys.stdout.write('\n'.join(message)) + return data except (InvalidDataError, SynthesizerInputError): result = data.copy() @@ -47,4 +59,18 @@ def drop_unknown_references(metadata, data, drop_missing_values=True): 'Try providing different data for this table.' ]) + if verbose: + table_names = sorted(metadata.tables) + summary_table = pd.DataFrame({ + 'Table Name': table_names, + '# Rows (Original)': [len(data[table]) for table in table_names], + '# Invalid Rows': [ + len(data[table]) - len(result[table]) for table in table_names + ], + '# Rows (New)': [len(result[table]) for table in table_names] + }) + message = [success_message, 'Summary of the number of rows dropped:'] + message.append(summary_table.to_string(index=False)) + sys.stdout.write('\n'.join(message)) + return result From 39b3613f359848b6d3bf140dc4d33756d8fe11ca Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Mar 2024 11:16:51 +0000 Subject: [PATCH 2/6] unit tests --- tests/unit/utils/test_poc.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/unit/utils/test_poc.py b/tests/unit/utils/test_poc.py index 6f158e99f..37429ab71 100644 --- a/tests/unit/utils/test_poc.py +++ b/tests/unit/utils/test_poc.py @@ -10,8 +10,9 @@ from sdv.utils.poc import drop_unknown_references +@patch('sys.stdout.write') @patch('sdv.utils.poc._get_rows_to_drop') -def test_drop_unknown_references(mock_get_rows_to_drop): +def test_drop_unknown_references(mock_get_rows_to_drop, mock_stdout_write): """Test ``drop_unknown_references``.""" # Setup relationships = [ @@ -65,6 +66,14 @@ def test_drop_unknown_references(mock_get_rows_to_drop): result = drop_unknown_references(metadata, data) # Assert + mock_stdout_write.assert_called_once_with( + 'Success! All foreign keys have referential integrity.\n' + 'Summary of the number of rows dropped:\n' + 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' + ' child 5 1 4\n' + 'grandchild 5 3 2\n' + ' parent 5 0 5' + ) metadata.validate.assert_called_once() metadata.validate_data.assert_called_once_with(data) mock_get_rows_to_drop.assert_called_once() @@ -88,7 +97,8 @@ def test_drop_unknown_references(mock_get_rows_to_drop): pd.testing.assert_frame_equal(table, expected_result[table_name]) -def test_drop_unknown_references_valid_data_mock(): +@patch('sys.stdout.write') +def test_drop_unknown_references_valid_data_mock(mock_stdout_write): """Test ``drop_unknown_references`` when data has referential integrity.""" # Setup metadata = Mock() @@ -116,6 +126,9 @@ def test_drop_unknown_references_valid_data_mock(): result = drop_unknown_references(metadata, data) # Assert + mock_stdout_write.assert_called_once_with( + 'Success! All foreign keys have referential integrity.\nNo rows were dropped.' + ) metadata.validate.assert_called_once() metadata.validate_data.assert_called_once_with(data) for table_name, table in result.items(): @@ -175,7 +188,7 @@ def test_drop_unknown_references_with_nan(mock_validate_foreign_keys, mock_get_r }) # Run - result = drop_unknown_references(metadata, data) + result = drop_unknown_references(metadata, data, verbose=False) # Assert metadata.validate.assert_called_once() @@ -255,7 +268,7 @@ def test_drop_unknown_references_drop_missing_values_false(mock_get_rows_to_drop }) # Run - result = drop_unknown_references(metadata, data, drop_missing_values=False) + result = drop_unknown_references(metadata, data, drop_missing_values=False, verbose=False) # Assert mock_get_rows_to_drop.assert_called_once() From 1bf697c1541791c2897ac739495f1982ae0e4fc6 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Mar 2024 11:17:13 +0000 Subject: [PATCH 3/6] integration tests --- tests/integration/utils/test_poc.py | 34 +++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/integration/utils/test_poc.py b/tests/integration/utils/test_poc.py index 664710e5d..bffa207c2 100644 --- a/tests/integration/utils/test_poc.py +++ b/tests/integration/utils/test_poc.py @@ -61,7 +61,7 @@ def data(): } -def test_drop_unknown_references(metadata, data): +def test_drop_unknown_references(metadata, data, capsys): """Test ``drop_unknown_references`` end to end.""" # Run expected_message = re.escape( @@ -75,14 +75,23 @@ def test_drop_unknown_references(metadata, data): cleaned_data = drop_unknown_references(metadata, data) metadata.validate_data(cleaned_data) + captured = capsys.readouterr() # Assert pd.testing.assert_frame_equal(cleaned_data['parent'], data['parent']) pd.testing.assert_frame_equal(cleaned_data['child'], data['child'].iloc[:4]) assert len(cleaned_data['child']) == 4 + expected_output = ( + 'Success! All foreign keys have referential integrity.\n' + 'Summary of the number of rows dropped:\n' + 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' + ' child 5 1 4\n' + ' parent 5 0 5' + ) + assert captured.out.strip() == expected_output -def test_drop_unknown_references_valid_data(metadata, data): +def test_drop_unknown_references_valid_data(metadata, data, capsys): """Test ``drop_unknown_references`` when data has referential integrity.""" # Setup data = deepcopy(data) @@ -90,13 +99,19 @@ def test_drop_unknown_references_valid_data(metadata, data): # Run result = drop_unknown_references(metadata, data) + captured = capsys.readouterr() # Assert pd.testing.assert_frame_equal(result['parent'], data['parent']) pd.testing.assert_frame_equal(result['child'], data['child']) + expected_message = ( + 'Success! All foreign keys have referential integrity.\n' + 'No rows were dropped.' + ) + assert captured.out.strip() == expected_message -def test_drop_unknown_references_drop_missing_values(metadata, data): +def test_drop_unknown_references_drop_missing_values(metadata, data, capsys): """Test ``drop_unknown_references`` when there is missing values in the foreign keys.""" # Setup data = deepcopy(data) @@ -105,11 +120,20 @@ def test_drop_unknown_references_drop_missing_values(metadata, data): # Run cleaned_data = drop_unknown_references(metadata, data) metadata.validate_data(cleaned_data) + captured = capsys.readouterr() # Assert pd.testing.assert_frame_equal(cleaned_data['parent'], data['parent']) pd.testing.assert_frame_equal(cleaned_data['child'], data['child'].iloc[:4]) assert len(cleaned_data['child']) == 4 + expected_output = ( + 'Success! All foreign keys have referential integrity.\n' + 'Summary of the number of rows dropped:\n' + 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' + ' child 5 1 4\n' + ' parent 5 0 5' + ) + assert captured.out.strip() == expected_output def test_drop_unknown_references_not_drop_missing_values(metadata, data): @@ -118,7 +142,9 @@ def test_drop_unknown_references_not_drop_missing_values(metadata, data): data['child'].loc[3, 'parent_id'] = np.nan # Run - cleaned_data = drop_unknown_references(metadata, data, drop_missing_values=False) + cleaned_data = drop_unknown_references( + metadata, data, drop_missing_values=False, verbose=False + ) # Assert pd.testing.assert_frame_equal(cleaned_data['parent'], data['parent']) From 8def6dfb905de8ab1cd8659d6beae13275354a94 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Mar 2024 12:33:56 +0000 Subject: [PATCH 4/6] fix minimum version --- tests/unit/utils/test_poc.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/unit/utils/test_poc.py b/tests/unit/utils/test_poc.py index 37429ab71..e06d075c1 100644 --- a/tests/unit/utils/test_poc.py +++ b/tests/unit/utils/test_poc.py @@ -66,14 +66,16 @@ def test_drop_unknown_references(mock_get_rows_to_drop, mock_stdout_write): result = drop_unknown_references(metadata, data) # Assert - mock_stdout_write.assert_called_once_with( - 'Success! All foreign keys have referential integrity.\n' - 'Summary of the number of rows dropped:\n' - 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' - ' child 5 1 4\n' - 'grandchild 5 3 2\n' - ' parent 5 0 5' + expected_pattern = re.compile( + r'Success! All foreign keys have referential integrity\.\s*' + r'Summary of the number of rows dropped:\s*' + r'Table Name\s*#\s*Rows \(Original\)\s*#\s*Invalid Rows\s*#\s*Rows \(New\)\s*' + r'child\s*5\s*1\s*4\s*' + r'grandchild\s*5\s*3\s*2\s*' + r'parent\s*5\s*0\s*5' ) + output = mock_stdout_write.call_args[0][0] + assert expected_pattern.match(output) metadata.validate.assert_called_once() metadata.validate_data.assert_called_once_with(data) mock_get_rows_to_drop.assert_called_once() From 7a80b0ae1b37e389a3a66fc914b7e124061e1331 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 19 Mar 2024 08:43:22 +0000 Subject: [PATCH 5/6] change message --- sdv/utils/poc.py | 29 ++++++++++++++--------------- tests/integration/utils/test_poc.py | 6 +++--- tests/unit/utils/test_poc.py | 12 +++++++++--- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sdv/utils/poc.py b/sdv/utils/poc.py index ac83b786a..45b6a5b98 100644 --- a/sdv/utils/poc.py +++ b/sdv/utils/poc.py @@ -30,6 +30,13 @@ def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=Tr Dictionary with the dataframes ensuring referential integrity. """ success_message = 'Success! All foreign keys have referential integrity.' + table_names = sorted(metadata.tables) + summary_table = pd.DataFrame({ + 'Table Name': table_names, + '# Rows (Original)': [len(data[table]) for table in table_names], + '# Invalid Rows': [0] * len(table_names), + '# Rows (New)': [len(data[table]) for table in table_names] + }) metadata.validate() try: metadata.validate_data(data) @@ -37,14 +44,13 @@ def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=Tr _validate_foreign_keys_not_null(metadata, data) if verbose: - message = [success_message, 'No rows were dropped.'] - sys.stdout.write('\n'.join(message)) + sys.stdout.write('\n'.join([success_message, summary_table.to_string(index=False)])) return data except (InvalidDataError, SynthesizerInputError): result = data.copy() table_to_idx_to_drop = _get_rows_to_drop(metadata, result) - for table in metadata.tables: + for table in table_names: idx_to_drop = table_to_idx_to_drop[table] result[table] = result[table].drop(idx_to_drop) if drop_missing_values: @@ -60,17 +66,10 @@ def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=Tr ]) if verbose: - table_names = sorted(metadata.tables) - summary_table = pd.DataFrame({ - 'Table Name': table_names, - '# Rows (Original)': [len(data[table]) for table in table_names], - '# Invalid Rows': [ - len(data[table]) - len(result[table]) for table in table_names - ], - '# Rows (New)': [len(result[table]) for table in table_names] - }) - message = [success_message, 'Summary of the number of rows dropped:'] - message.append(summary_table.to_string(index=False)) - sys.stdout.write('\n'.join(message)) + summary_table['# Invalid Rows'] = [ + len(data[table]) - len(result[table]) for table in table_names + ] + summary_table['# Rows (New)'] = [len(result[table]) for table in table_names] + sys.stdout.write('\n'.join([success_message, summary_table.to_string(index=False)])) return result diff --git a/tests/integration/utils/test_poc.py b/tests/integration/utils/test_poc.py index bffa207c2..c69c11975 100644 --- a/tests/integration/utils/test_poc.py +++ b/tests/integration/utils/test_poc.py @@ -83,7 +83,6 @@ def test_drop_unknown_references(metadata, data, capsys): assert len(cleaned_data['child']) == 4 expected_output = ( 'Success! All foreign keys have referential integrity.\n' - 'Summary of the number of rows dropped:\n' 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' ' child 5 1 4\n' ' parent 5 0 5' @@ -106,7 +105,9 @@ def test_drop_unknown_references_valid_data(metadata, data, capsys): pd.testing.assert_frame_equal(result['child'], data['child']) expected_message = ( 'Success! All foreign keys have referential integrity.\n' - 'No rows were dropped.' + 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' + ' child 5 0 5\n' + ' parent 5 0 5' ) assert captured.out.strip() == expected_message @@ -128,7 +129,6 @@ def test_drop_unknown_references_drop_missing_values(metadata, data, capsys): assert len(cleaned_data['child']) == 4 expected_output = ( 'Success! All foreign keys have referential integrity.\n' - 'Summary of the number of rows dropped:\n' 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' ' child 5 1 4\n' ' parent 5 0 5' diff --git a/tests/unit/utils/test_poc.py b/tests/unit/utils/test_poc.py index e06d075c1..e1b29b974 100644 --- a/tests/unit/utils/test_poc.py +++ b/tests/unit/utils/test_poc.py @@ -68,7 +68,6 @@ def test_drop_unknown_references(mock_get_rows_to_drop, mock_stdout_write): # Assert expected_pattern = re.compile( r'Success! All foreign keys have referential integrity\.\s*' - r'Summary of the number of rows dropped:\s*' r'Table Name\s*#\s*Rows \(Original\)\s*#\s*Invalid Rows\s*#\s*Rows \(New\)\s*' r'child\s*5\s*1\s*4\s*' r'grandchild\s*5\s*3\s*2\s*' @@ -107,6 +106,7 @@ def test_drop_unknown_references_valid_data_mock(mock_stdout_write): metadata._get_all_foreign_keys.side_effect = [ [], ['parent_foreign_key'], ['child_foreign_key', 'parent_foreign_key'] ] + metadata.tables = {'parent', 'child', 'grandchild'} data = { 'parent': pd.DataFrame({ 'id_parent': [0, 1, 2, 3, 4], @@ -128,9 +128,15 @@ def test_drop_unknown_references_valid_data_mock(mock_stdout_write): result = drop_unknown_references(metadata, data) # Assert - mock_stdout_write.assert_called_once_with( - 'Success! All foreign keys have referential integrity.\nNo rows were dropped.' + expected_pattern = re.compile( + r'Success! All foreign keys have referential integrity\.\s*' + r'Table Name\s*#\s*Rows \(Original\)\s*#\s*Invalid Rows\s*#\s*Rows \(New\)\s*' + r'child\s*5\s*0\s*5\s*' + r'grandchild\s*5\s*0\s*5\s*' + r'parent\s*5\s*0\s*5' ) + output = mock_stdout_write.call_args[0][0] + assert expected_pattern.match(output) metadata.validate.assert_called_once() metadata.validate_data.assert_called_once_with(data) for table_name, table in result.items(): From 6067e02072619dafdc0575c9380174654d94d196 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 20 Mar 2024 12:43:32 +0000 Subject: [PATCH 6/6] add blank line --- sdv/utils/poc.py | 8 ++++++-- tests/integration/utils/test_poc.py | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sdv/utils/poc.py b/sdv/utils/poc.py index 45b6a5b98..a7a9f780c 100644 --- a/sdv/utils/poc.py +++ b/sdv/utils/poc.py @@ -44,7 +44,9 @@ def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=Tr _validate_foreign_keys_not_null(metadata, data) if verbose: - sys.stdout.write('\n'.join([success_message, summary_table.to_string(index=False)])) + sys.stdout.write( + '\n'.join([success_message, '', summary_table.to_string(index=False)]) + ) return data except (InvalidDataError, SynthesizerInputError): @@ -70,6 +72,8 @@ def drop_unknown_references(metadata, data, drop_missing_values=True, verbose=Tr len(data[table]) - len(result[table]) for table in table_names ] summary_table['# Rows (New)'] = [len(result[table]) for table in table_names] - sys.stdout.write('\n'.join([success_message, summary_table.to_string(index=False)])) + sys.stdout.write('\n'.join([ + success_message, '', summary_table.to_string(index=False) + ])) return result diff --git a/tests/integration/utils/test_poc.py b/tests/integration/utils/test_poc.py index c69c11975..0ab6ac072 100644 --- a/tests/integration/utils/test_poc.py +++ b/tests/integration/utils/test_poc.py @@ -82,7 +82,7 @@ def test_drop_unknown_references(metadata, data, capsys): pd.testing.assert_frame_equal(cleaned_data['child'], data['child'].iloc[:4]) assert len(cleaned_data['child']) == 4 expected_output = ( - 'Success! All foreign keys have referential integrity.\n' + 'Success! All foreign keys have referential integrity.\n\n' 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' ' child 5 1 4\n' ' parent 5 0 5' @@ -104,7 +104,7 @@ def test_drop_unknown_references_valid_data(metadata, data, capsys): pd.testing.assert_frame_equal(result['parent'], data['parent']) pd.testing.assert_frame_equal(result['child'], data['child']) expected_message = ( - 'Success! All foreign keys have referential integrity.\n' + 'Success! All foreign keys have referential integrity.\n\n' 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' ' child 5 0 5\n' ' parent 5 0 5' @@ -128,7 +128,7 @@ def test_drop_unknown_references_drop_missing_values(metadata, data, capsys): pd.testing.assert_frame_equal(cleaned_data['child'], data['child'].iloc[:4]) assert len(cleaned_data['child']) == 4 expected_output = ( - 'Success! All foreign keys have referential integrity.\n' + 'Success! All foreign keys have referential integrity.\n\n' 'Table Name # Rows (Original) # Invalid Rows # Rows (New)\n' ' child 5 1 4\n' ' parent 5 0 5'