Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix HMASynthesizer not able to fit when only primary keys and foreign keys are within a table. #1266

4 changes: 2 additions & 2 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,14 +739,14 @@ def reverse_transform(self, data, reset_keys=False):
for column in self.metadata.columns.keys() - set(sampled_columns + self._keys)
if self._hyper_transformer.field_transformers.get(column)
]
if missing_columns:
if missing_columns and num_rows:
anonymized_data = self._hyper_transformer.create_anonymized_columns(
num_rows=num_rows,
column_names=missing_columns
)
sampled_columns.extend(missing_columns)

if self._keys:
if self._keys and num_rows:
generated_keys = self.generate_keys(num_rows, reset_keys)
sampled_columns.extend(self._keys)

Expand Down
3 changes: 2 additions & 1 deletion sdv/multi_table/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def _set_temp_numpy_seed(self):
initial_state = np.random.get_state()
if isinstance(self._numpy_seed, int):
np.random.seed(self._numpy_seed)
np.random.default_rng(self._numpy_seed)
else:
np.random.set_state(self._numpy_seed)

np.random.default_rng(self._numpy_seed[1])
try:
yield
finally:
Expand Down
71 changes: 45 additions & 26 deletions sdv/multi_table/hma.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ class HMASynthesizer(BaseMultiTableSynthesizer):
"""Hierarchical Modeling Algorithm One.

Args:
metadata (dict, str or Metadata):
Metadata dict, path to the metadata JSON file or Metadata instance itself.
metadata (sdv.metadata.multi_table.MultiTableMetadata):
Multi table metadata representing the data tables that this synthesizer will be used
for.
"""

DEFAULT_SYNTHESIZER_KWARGS = {
'default_distribution': 'beta',
'default_distribution': 'beta'
}

def __init__(self, metadata, synthesizer_kwargs=None):
Expand Down Expand Up @@ -54,29 +55,36 @@ def _get_extension(self, child_name, child_table, foreign_key):
table_meta = self._table_synthesizers[child_name].get_metadata()

extension_rows = []
foreign_key_columns = self._get_all_foreign_keys(child_name)
foreign_key_values = child_table[foreign_key].unique()
child_table = child_table.set_index(foreign_key)

index = []
scale_columns = None
for foreign_key_value in foreign_key_values:
child_rows = child_table.loc[[foreign_key_value]]
child_rows = child_rows[child_rows.columns.difference(foreign_key_columns)]

try:
synthesizer = self._synthesizer(table_meta, **self._synthesizer_kwargs)
synthesizer.fit_processed_data(child_rows.reset_index(drop=True))
row = synthesizer._get_parameters()
row = pd.Series(row)
row.index = f'__{child_name}__{foreign_key}__' + row.index

if scale_columns is None:
scale_columns = [
column
for column in row.index
if column.endswith('scale')
]

if len(child_rows) == 1:
row.loc[scale_columns] = None
if child_rows.empty:
row = pd.Series({'num_rows': len(child_rows)})
row.index = f'__{child_name}__{foreign_key}__' + row.index
else:
synthesizer = self._synthesizer(table_meta, **self._synthesizer_kwargs)
synthesizer.fit_processed_data(child_rows.reset_index(drop=True))
row = synthesizer._get_parameters()
row = pd.Series(row)
row.index = f'__{child_name}__{foreign_key}__' + row.index

if scale_columns is None:
scale_columns = [
column
for column in row.index
if column.endswith('scale')
]

if len(child_rows) == 1:
row.loc[scale_columns] = None

extension_rows.append(row)
index.append(foreign_key_value)
Expand Down Expand Up @@ -140,7 +148,7 @@ def _pop_foreign_keys(self, table_data, table_name):
The name representing the table.

Returns:
keyes (dict):
pvk-developer marked this conversation as resolved.
Show resolved Hide resolved
keys (dict):
A dictionary mapping each foreign key to its values within the table.
"""
foreign_keys = self._get_all_foreign_keys(table_name)
Expand Down Expand Up @@ -185,7 +193,8 @@ def _model_table(self, table_name, tables):
LOGGER.info('Fitting %s for table %s; shape: %s', self._synthesizer.__name__,
table_name, table.shape)

self._table_synthesizers[table_name].fit_processed_data(table)
if not table.empty:
self._table_synthesizers[table_name].fit_processed_data(table)

for name, values in keys.items():
table[name] = values
Expand All @@ -196,7 +205,7 @@ def _model_table(self, table_name, tables):
return table

def _fit(self, processed_data):
"""Fit this HMA1 instance to the dataset data.
"""Fit this ``HMASynthesizer`` instance to the dataset data.

Args:
processed_data (dict):
Expand Down Expand Up @@ -239,7 +248,7 @@ def _finalize(self, sampled_data):
foreign_key,
sampled_data
)
table_rows[foreign_key] = parent_ids
table_rows[foreign_key] = parent_ids.to_numpy()

synthesizer = self._table_synthesizers.get(table_name)
dtypes = synthesizer._data_processor._dtypes
Expand Down Expand Up @@ -309,7 +318,11 @@ def _sample_rows(self, synthesizer, table_name, num_rows=None):
Sampled rows, shape (, num_rows)
"""
num_rows = num_rows or synthesizer._num_rows
sampled_rows = synthesizer._sample(num_rows)
if synthesizer._model:
sampled_rows = synthesizer._sample(num_rows)
else:
sampled_rows = pd.DataFrame(index=range(num_rows))

return self._process_samples(table_name, sampled_rows)

def _get_child_synthesizer(self, parent_row, table_name, foreign_key):
Expand Down Expand Up @@ -411,7 +424,7 @@ def _find_parent_id(likelihoods, num_rows):
else:
weights = likelihoods.to_numpy() / total

return np.random.choice(likelihoods.index, p=weights)
return np.random.choice(likelihoods.index.to_list(), p=weights)

def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
"""Calculate the likelihood of each parent id value appearing in the data.
Expand All @@ -431,13 +444,19 @@ def _get_likelihoods(self, table_rows, parent_rows, table_name, foreign_key):
A DataFrame of the likelihood of each parent id.
"""
likelihoods = {}

data_processor = self._table_synthesizers[table_name]._data_processor
table_rows = data_processor.transform(table_rows)

for parent_id, row in parent_rows.iterrows():
parameters = self._extract_parameters(row, table_name, foreign_key)
table_meta = self._table_synthesizers[table_name].get_metadata()
synthesizer = self._synthesizer(table_meta, **self._synthesizer_kwargs)
synthesizer.set_parameters(parameters)
synthesizer._set_parameters(parameters)
try:
likelihoods[parent_id] = synthesizer.get_likelihood(table_rows)
with np.random.default_rng(np.random.get_state()[1]):
likelihoods[parent_id] = synthesizer._get_likelihood(table_rows)

except (AttributeError, np.linalg.LinAlgError):
likelihoods[parent_id] = None

Expand Down
10 changes: 7 additions & 3 deletions sdv/single_table/copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,9 @@ def _rebuild_gaussian_copula(self, model_parameters):

return model_parameters

def _get_likelihood(self, table_rows):
    """Return the probability density of ``table_rows`` under the fitted model.

    Args:
        table_rows:
            Rows to evaluate (presumably already transformed; verify against callers).

    Returns:
        The value of ``self._model.probability_density`` for the given rows.
    """
    density = self._model.probability_density(table_rows)
    return density

def _set_parameters(self, parameters):
"""Set copula model parameters.

Expand All @@ -366,7 +369,8 @@ def _set_parameters(self, parameters):
parameters = unflatten_dict(parameters)
if 'num_rows' in parameters:
num_rows = parameters.pop('num_rows')
self._num_rows = 0 if pd.isna(num_rows) else max(0, int(round(num_rows)))

parameters = self._rebuild_gaussian_copula(parameters)
self._model = multivariate.GaussianMultivariate.from_dict(parameters)
self._num_rows = 0 if pd.isna(num_rows) else max(0, int(round(num_rows)))
if parameters:
parameters = self._rebuild_gaussian_copula(parameters)
self._model = multivariate.GaussianMultivariate.from_dict(parameters)
49 changes: 49 additions & 0 deletions tests/integration/multi_table/test_hma.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,3 +334,52 @@ def test_save_and_load():
assert isinstance(synthesizer, HMASynthesizer)
assert loaded_synthesizer.get_info() == synthesizer.get_info()
assert loaded_synthesizer.metadata.to_dict() == metadata.to_dict()


def test_hma_primary_key_and_foreign_key_only():
    """Test that ``HMASynthesizer`` can handle tables with primary and foreign keys only."""
    # Setup: three tables where ``games`` holds only keys plus its own primary key.
    data = {
        'users': pd.DataFrame({
            'user_id': [1, 2, 3],
            'user_name': ['John', 'Doe', 'Johanna'],
        }),
        'sessions': pd.DataFrame({
            'session_id': ['a', 'b', 'c'],
            'clicks': [10, 20, 30],
        }),
        'games': pd.DataFrame({
            'game_id': ['a1', 'b2', 'c3'],
            'session_id': ['a', 'b', 'c'],
            'user_id': [1, 2, 3],
        }),
    }

    metadata = MultiTableMetadata()
    for name, frame in data.items():
        metadata.detect_table_from_dataframe(name, frame)

    # Mark the string id columns as text and declare keys/relationships.
    for table, column in (
        ('sessions', 'session_id'),
        ('games', 'session_id'),
        ('games', 'game_id'),
    ):
        metadata.update_column(table, column, sdtype='text')

    metadata.set_primary_key('users', 'user_id')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.set_primary_key('games', 'game_id')
    metadata.add_relationship('users', 'games', 'user_id', 'user_id')
    metadata.add_relationship('sessions', 'games', 'session_id', 'session_id')

    hmasynthesizer = HMASynthesizer(metadata)

    # Fit
    hmasynthesizer.fit(data)

    # Sample
    sample = hmasynthesizer.sample()

    # Assert: every sampled foreign key refers to an existing parent row.
    assert sample['games']['user_id'].isin(sample['users']['user_id']).all()
    assert sample['games']['session_id'].isin(sample['sessions']['session_id']).all()
5 changes: 0 additions & 5 deletions tests/unit/multi_table/test_hma.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,10 @@ def test__extend_table(self):
'id_nesreca': [0, 1, 2, 3],
'upravna_enota': [0, 1, 2, 3],
'value': [0, 1, 2, 3],
'__oseba__id_nesreca__covariance__0__0': [0.] * 4,
'__oseba__id_nesreca__univariates__oseba_value__a': [1.] * 4,
'__oseba__id_nesreca__univariates__oseba_value__b': [1.] * 4,
'__oseba__id_nesreca__univariates__oseba_value__loc': [0., 1., 2., 3.],
'__oseba__id_nesreca__univariates__oseba_value__scale': [np.nan] * 4,
'__oseba__id_nesreca__univariates__upravna_enota__a': [1.] * 4,
'__oseba__id_nesreca__univariates__upravna_enota__b': [1.] * 4,
'__oseba__id_nesreca__univariates__upravna_enota__loc': [0., 1., 2., 3.],
'__oseba__id_nesreca__univariates__upravna_enota__scale': [np.nan] * 4,
'__oseba__id_nesreca__num_rows': [1.] * 4,
})

Expand Down
13 changes: 13 additions & 0 deletions tests/unit/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,3 +522,16 @@ def test_get_learned_distributions_raises_an_error(self):
)
with pytest.raises(ValueError, match=error_msg):
gcs.get_learned_distributions()

def test__get_likelihood(self):
    """Test that ``_get_likelihood`` returns the ``model.probability_density`` of the input."""
    # Setup: a mocked instance so the underlying copula model is never fitted.
    instance = Mock()
    rows = pd.Series([1, 2, 3])

    # Run
    likelihood = GaussianCopulaSynthesizer._get_likelihood(instance, rows)

    # Assert: the call is forwarded verbatim and its result returned unchanged.
    instance._model.probability_density.assert_called_once_with(rows)
    assert likelihood == instance._model.probability_density.return_value