
Fix issues with auto_entityset #21

Open · wants to merge 10 commits into main
12 changes: 8 additions & 4 deletions autonormalize/dfd.py
@@ -1,7 +1,7 @@
 from functools import partial
 from itertools import combinations
 
-import numpy
+import pandas as pd
 from tqdm import tqdm
 
 from .classes import DfdDependencies, LHSs, Masks, Node
@@ -359,7 +359,6 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
     acc = 0
 
     for index, row in indicator.iterrows():
-
         mask = None
         for attr in lhs_set:
 
@@ -368,14 +367,19 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
             if df[attr].dtypes.name == 'datetime64[ns]':
                 m = df[attr] == row[attr]
             else:
-                m = df[attr].values == row[attr]
+                if pd.isna(row[attr]):
+                    m = df[attr].isnull()
+                else:
+                    m = df[attr].values == row[attr]
             masks.add_mask(attr, row[attr], m)
             if mask is None:
                 mask = m
             else:
                 mask = mask & m
         options = df[mask]
-        _, unique_counts = numpy.unique(options[rhs].to_numpy(), return_counts=True)
+
+        # _, unique_counts = np.unique(options[rhs].to_numpy(), return_counts=True)
Contributor: should this line be removed?
Contributor (author): Yep, removed this and another temporary test that I added while trying to replicate the problem that was present in #19.

+        unique_counts = options[rhs].value_counts()
         acc += unique_counts.sum() - unique_counts.max()
         if acc > limit:
             return False
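Side note: the core of this dfd.py fix is that == never matches NaN (NaN != NaN), so rows with missing values silently dropped out of the equality mask. A minimal standalone sketch of the failure mode and the isnull() remedy (illustration only, not part of the diff):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])

# An equality mask never selects the NaN row, because NaN != NaN.
print((s.values == np.nan).any())  # False -- the missing row is lost

# isnull() is the fallback the new branch uses for missing values.
print(s.isnull().tolist())         # [False, True, False]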
6 changes: 4 additions & 2 deletions autonormalize/normalize.py
@@ -76,6 +76,7 @@ def make_indexes(depdf):
     Arguments:
         depdf (DepDF) : depDF to make indexes for
     """
+
     prim_key = depdf.deps.get_prim_key()
 
     if len(prim_key) > 1:
@@ -103,8 +104,9 @@ def make_indexes(depdf):

         for index in indices[name]:
             add[index] = new_val
-
-    depdf.parent.df.drop(columns=prim_key, inplace=True)
+    # Don't drop a column if it is needed in another parent relationship
+    to_drop = [key for key in prim_key if key not in depdf.parent.deps.serialize().keys()]
+    depdf.parent.df.drop(columns=to_drop, inplace=True)
     depdf.parent.df.insert(len(depdf.parent.df.columns), '_'.join(prim_key), add)
 
     for child in depdf.children:
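Side note: the normalize.py change only drops primary-key columns that no longer appear as attributes in the parent's dependency set. A hedged sketch of that filtering step, with a hand-built dict standing in for deps.serialize() (its exact shape is assumed here):

# Assumption: serialize() maps each remaining parent attribute to its LHS lists.
parent_attrs = {'Id': [['LotArea']],
                'LotShape': [['LotFrontage'], ['Id']]}

prim_key = ['MSSubClass', 'LotShape']

# Drop only the keys that no other parent relationship still needs.
to_drop = [key for key in prim_key if key not in parent_attrs.keys()]
print(to_drop)  # ['MSSubClass'] -- 'LotShape' survives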
44 changes: 26 additions & 18 deletions autonormalize/tests/test_dfd.py
@@ -1,5 +1,6 @@
 import os
 
+import numpy as np
 import pandas as pd
 
 from autonormalize import dfd
@@ -73,21 +74,28 @@ def test_compute_partitions():
     assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.96, mask)
 
 
-# def test_approximate_dependencies():
-#     mask = dfd.Masks(['a', 'b', 'c'])
-#     a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
-#     # b = [int(x%2 == 0) for x in a]
-#     b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
-#     # c = [(a[i] + b[i])<4 for i in range(40)]
-#     c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, 1.00, mask, 0.90)
-#     assert dfd.approximate_dependencies(set([0, 1]), 2, df, .90, mask, 0.90)
-#     c[0] = True
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .97, mask, 0.90)
-#     assert not dfd.approximate_dependencies(set([0, 1]), 2, df, .98, mask, 0.90)
-#     c[35] = False
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .95, mask, 0.90)
-#     assert not dfd.approximate_dependencies([0, 1], 2, df, .96, mask, 0.90)
+def test_approximate_dependencies():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 1.00, mask)
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .90, mask)
+    c[0] = True
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .97, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .98, mask)
+    c[35] = False
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .95, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .96, mask)
+
+
+def test_approximate_dependencies_with_nan():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [np.nan, 2, 3, 7, 8, 1, 0, 2, 0, 3, np.nan, 0, 4, np.nan, 8, 7, np.nan, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, np.nan, 4, np.nan, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [True, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 0.9, mask)
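Side note: the counting swap in dfd.py matters for the NaN test above because np.unique compares elements with ==, so on NumPy releases before 1.21 every NaN lands in its own group and inflates the violation count, while value_counts() excludes NaN by default. A small sketch of the difference (np.unique output is version-dependent):

import numpy as np
import pandas as pd

vals = pd.Series([1.0, 1.0, np.nan, np.nan])

# Older NumPy returns each NaN as its own "unique" value.
_, counts = np.unique(vals.to_numpy(), return_counts=True)
print(counts)  # e.g. [2 1 1] before NumPy 1.21

# value_counts() drops NaN by default, so group sizes stay stable.
print(vals.value_counts().tolist())  # [2]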
48 changes: 48 additions & 0 deletions autonormalize/tests/test_normalize.py
@@ -1,6 +1,8 @@
+import numpy as np
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
+import autonormalize as an
 from autonormalize import classes, normalize
 
 # from classes import Dependencies
@@ -178,3 +180,49 @@ def test_make_indexes():
     assert new_dfs[0][new_dfs[1].columns[0]][5] == val
     assert new_dfs[0][new_dfs[1].columns[0]][6] == val
     assert new_dfs[0][new_dfs[1].columns[0]][7] == val
+
+
+def test_make_indexes_improper_column_drop():
+    df_dict = {'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+               'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
+               'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
+               'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
+               'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
+               'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+               'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
+               'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
+               'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub']}
+    df = pd.DataFrame(df_dict)
+
+    deps = classes.Dependencies({'Id': [['LotArea']],
+                                 'MSSubClass': [['LotArea'], ['LotFrontage', 'Utilities'], ['Id']],
+                                 'MSZoning': [['LotFrontage'], ['LotArea'], ['MSSubClass'], ['Id']],
+                                 'LotFrontage': [['LotArea'], ['Id']], 'LotArea': [['Id']],
+                                 'Alley': [['LotFrontage'], ['LandContour'], ['Utilities'], ['MSSubClass'], ['Id'], ['MSZoning'], ['LotArea'], ['LotShape']],
+                                 'LotShape': [['LotFrontage'], ['MSSubClass', 'Utilities', 'LandContour'], ['LotArea'], ['Id']],
+                                 'LandContour': [['LotFrontage'], ['MSSubClass', 'LotShape'], ['LotArea'], ['Id']],
+                                 'Utilities': [['MSSubClass', 'LotShape'], ['LotArea'], ['MSSubClass', 'LotFrontage'], ['Id']]}, ['id'])
+
+    depdf = normalize.DepDF(deps, df, deps.get_prim_key())
+    normalize.normalize_dataframe(depdf)
+    normalize.make_indexes(depdf)
+    new_dfs = depdf.return_dfs()
+
+    assert 'MSSubClass' in new_dfs[0].columns
+
+
+def test_issue19():
+    df_dict = {'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+               'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
+               'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
+               'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
+               'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
+               'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+               'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
+               'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
+               'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub']}
+
+    df = pd.DataFrame(df_dict)
+
+    es = an.auto_entityset(df, accuracy=1.0, name="es")
+    print(es)