From 891e4044d2d2a50cf5c1917e28e13f3f21ee0e21 Mon Sep 17 00:00:00 2001
From: Andrew Montanez
Date: Mon, 12 Apr 2021 15:48:35 -0500
Subject: [PATCH 1/2] sdv-issue-331: Fixing Duplicate IDs when using reject-sampling

Add Table.make_ids_unique, which regenerates the values of every field
whose metadata type is 'id', and call it on the sampled rows at the end
of _sample_batch, just before they are truncated to num_rows.

make_ids_unique assigns into the DataFrame it receives and returns that
same object, so the unit test passes a copy; otherwise its assertions
would compare the result with itself.

---
Note: the test_table.py blob ids on the index line below predate the
data.copy() amendment to the test.

 sdv/metadata/table.py             | 17 +++++++++++++++++
 sdv/tabular/base.py               |  1 +
 tests/unit/metadata/test_table.py | 20 ++++++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/sdv/metadata/table.py b/sdv/metadata/table.py
index 1fe97024a..a616cbb10 100644
--- a/sdv/metadata/table.py
+++ b/sdv/metadata/table.py
@@ -584,6 +584,23 @@ def filter_valid(self, data):
 
         return data
 
+    def make_ids_unique(self, data):
+        """Repopulate any id fields in provided data to guarantee uniqueness.
+
+        Args:
+            data (pandas.DataFrame):
+                Table data.
+
+        Returns:
+            pandas.DataFrame:
+                Table where all id fields are unique.
+        """
+        for name, field_metadata in self._fields_metadata.items():
+            if field_metadata['type'] == 'id':
+                data[name] = self._make_ids(field_metadata, len(data))
+
+        return data
+
     # ###################### #
     # Metadata Serialization #
     # ###################### #
diff --git a/sdv/tabular/base.py b/sdv/tabular/base.py
index 94d0f0f7f..9852d09a1 100644
--- a/sdv/tabular/base.py
+++ b/sdv/tabular/base.py
@@ -296,6 +296,7 @@ def _sample_batch(self, num_rows=None, max_retries=100, max_rows_multiplier=10,
 
             counter += 1
 
+        sampled = self._metadata.make_ids_unique(sampled)
         return sampled.head(min(len(sampled), num_rows))
 
     def _make_conditions_df(self, conditions, num_rows):
diff --git a/tests/unit/metadata/test_table.py b/tests/unit/metadata/test_table.py
index a667c0157..60cdefdd8 100644
--- a/tests/unit/metadata/test_table.py
+++ b/tests/unit/metadata/test_table.py
@@ -17,3 +17,23 @@ def test__make_ids_fail(self):
         metadata = {'subtype': 'string', 'regex': '[a-d]'}
         with pytest.raises(ValueError):
             Table._make_ids(metadata, 20)
+
+    def test_make_ids_unique(self):
+        """Test that id columns contain all unique values"""
+        metadata_dict = {
+            'fields': {
+                'item 0': {'type': 'id', 'subtype': 'integer'},
+                'item 1': {'type': 'boolean'}
+            },
+            'primary_key': 'item 0'
+        }
+        metadata = Table.from_dict(metadata_dict)
+        data = pd.DataFrame({
+            'item 0': [0, 1, 1, 2, 3, 5, 5, 6],
+            'item 1': [True, True, False, False, True, False, False, True]
+        })
+
+        new_data = metadata.make_ids_unique(data.copy())
+
+        assert new_data['item 1'].equals(data['item 1'])
+        assert new_data['item 0'].is_unique

From 6aa42effd363f467256a0f231d7f6c72e400fe10 Mon Sep 17 00:00:00 2001
From: Andrew Montanez
Date: Mon, 12 Apr 2021 17:00:23 -0500
Subject: [PATCH 2/2] fixing test

Only regenerate an id column when it contains duplicate values, and
split the unit test into a "not unique" case and an "already unique"
case. The new test passes a copy to make_ids_unique so that comparing
the result with the original frame is meaningful.

---
Note: the test_table.py blob ids on the index line below predate the
data.copy() amendment to the tests.

 sdv/metadata/table.py             |  2 +-
 tests/unit/metadata/test_table.py | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/sdv/metadata/table.py b/sdv/metadata/table.py
index a616cbb10..39c89dadf 100644
--- a/sdv/metadata/table.py
+++ b/sdv/metadata/table.py
@@ -596,7 +596,7 @@ def make_ids_unique(self, data):
                 Table where all id fields are unique.
         """
         for name, field_metadata in self._fields_metadata.items():
-            if field_metadata['type'] == 'id':
+            if field_metadata['type'] == 'id' and not data[name].is_unique:
                 data[name] = self._make_ids(field_metadata, len(data))
 
         return data
diff --git a/tests/unit/metadata/test_table.py b/tests/unit/metadata/test_table.py
index 60cdefdd8..c34b4a0e5 100644
--- a/tests/unit/metadata/test_table.py
+++ b/tests/unit/metadata/test_table.py
@@ -18,8 +18,8 @@ def test__make_ids_fail(self):
         with pytest.raises(ValueError):
             Table._make_ids(metadata, 20)
 
-    def test_make_ids_unique(self):
-        """Test that id columns contain all unique values"""
+    def test_make_ids_unique_field_not_unique(self):
+        """Test that id column is replaced with all unique values if not already unique."""
         metadata_dict = {
             'fields': {
                 'item 0': {'type': 'id', 'subtype': 'integer'},
@@ -37,3 +37,23 @@ def test_make_ids_unique(self):
 
         assert new_data['item 1'].equals(data['item 1'])
         assert new_data['item 0'].is_unique
+
+    def test_make_ids_unique_field_already_unique(self):
+        """Test that id column is kept if already unique."""
+        metadata_dict = {
+            'fields': {
+                'item 0': {'type': 'id', 'subtype': 'integer'},
+                'item 1': {'type': 'boolean'}
+            },
+            'primary_key': 'item 0'
+        }
+        metadata = Table.from_dict(metadata_dict)
+        data = pd.DataFrame({
+            'item 0': [9, 1, 8, 2, 3, 7, 5, 6],
+            'item 1': [True, True, False, False, True, False, False, True]
+        })
+
+        new_data = metadata.make_ids_unique(data.copy())
+
+        assert new_data['item 1'].equals(data['item 1'])
+        assert new_data['item 0'].equals(data['item 0'])