
Fix issues with auto_entityset #21

Open · wants to merge 10 commits into main
12 changes: 8 additions & 4 deletions autonormalize/dfd.py
@@ -1,7 +1,7 @@
 from functools import partial
 from itertools import combinations
 
-import numpy
+import pandas as pd
 from tqdm import tqdm
 
 from .classes import DfdDependencies, LHSs, Masks, Node
@@ -359,7 +359,6 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
     acc = 0
 
     for index, row in indicator.iterrows():
-
         mask = None
         for attr in lhs_set:
 
@@ -368,14 +367,19 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
             if df[attr].dtypes.name == 'datetime64[ns]':
                 m = df[attr] == row[attr]
             else:
-                m = df[attr].values == row[attr]
+                if pd.isna(row[attr]):
+                    m = df[attr].isnull()
+                else:
+                    m = df[attr].values == row[attr]
             masks.add_mask(attr, row[attr], m)
             if mask is None:
                 mask = m
             else:
                 mask = mask & m
         options = df[mask]
-        _, unique_counts = numpy.unique(options[rhs].to_numpy(), return_counts=True)
+
+        # _, unique_counts = np.unique(options[rhs].to_numpy(), return_counts=True)
Contributor: should this line be removed?
Contributor (author): Yep, removed this and another temporary test that I added while trying to replicate the problem that was present in #19.

+        unique_counts = options[rhs].value_counts()
         acc += unique_counts.sum() - unique_counts.max()
         if acc > limit:
             return False
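Side note: the core of this dfd.py fix is that == never matches NaN (NaN != NaN), so rows with missing values silently dropped out of the equality mask. A minimal standalone sketch of the failure mode and the isnull() remedy (illustration only, not part of the diff):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])

# An equality mask never selects the NaN row, because NaN != NaN.
print((s.values == np.nan).any())  # False -- the missing row is lost

# isnull() is the fallback the new branch uses for missing values.
print(s.isnull().tolist())         # [False, True, False]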
6 changes: 4 additions & 2 deletions autonormalize/normalize.py
@@ -76,6 +76,7 @@ def make_indexes(depdf):
     Arguments:
         depdf (DepDF) : depDF to make indexes for
     """
+
     prim_key = depdf.deps.get_prim_key()
 
     if len(prim_key) > 1:
@@ -103,8 +104,9 @@ def make_indexes(depdf):

         for index in indices[name]:
             add[index] = new_val
-
-    depdf.parent.df.drop(columns=prim_key, inplace=True)
+    # Don't drop a column if it is needed in another parent relationship
+    to_drop = [key for key in prim_key if key not in depdf.parent.deps.serialize().keys()]
+    depdf.parent.df.drop(columns=to_drop, inplace=True)
     depdf.parent.df.insert(len(depdf.parent.df.columns), '_'.join(prim_key), add)
 
     for child in depdf.children:
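Side note: the normalize.py change only drops primary-key columns that no longer appear as attributes in the parent's dependency set. A hedged sketch of that filtering step, with a hand-built dict standing in for deps.serialize() (its exact shape is assumed here):

# Assumption: serialize() maps each remaining parent attribute to its LHS lists.
parent_attrs = {'Id': [['LotArea']],
                'LotShape': [['LotFrontage'], ['Id']]}

prim_key = ['MSSubClass', 'LotShape']

# Drop only the keys that no other parent relationship still needs.
to_drop = [key for key in prim_key if key not in parent_attrs.keys()]
print(to_drop)  # ['MSSubClass'] -- 'LotShape' survives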
44 changes: 26 additions & 18 deletions autonormalize/tests/test_dfd.py
@@ -1,5 +1,6 @@
 import os
 
+import numpy as np
 import pandas as pd
 
 from autonormalize import dfd
@@ -73,21 +74,28 @@ def test_compute_partitions():
     assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.96, mask)
 
 
-# def test_approximate_dependencies():
-#     mask = dfd.Masks(['a', 'b', 'c'])
-#     a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
-#     # b = [int(x%2 == 0) for x in a]
-#     b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
-#     # c = [(a[i] + b[i])<4 for i in range(40)]
-#     c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, 1.00, mask, 0.90)
-#     assert dfd.approximate_dependencies(set([0, 1]), 2, df, .90, mask, 0.90)
-#     c[0] = True
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .97, mask, 0.90)
-#     assert not dfd.approximate_dependencies(set([0, 1]), 2, df, .98, mask, 0.90)
-#     c[35] = False
-#     df = pd.DataFrame({'a': a, 'b': b, 'c': c})
-#     assert dfd.approximate_dependencies([0, 1], 2, df, .95, mask, 0.90)
-#     assert not dfd.approximate_dependencies([0, 1], 2, df, .96, mask, 0.90)
+def test_approximate_dependencies():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 1.00, mask)
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .90, mask)
+    c[0] = True
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .97, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .98, mask)
+    c[35] = False
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, .95, mask)
+    assert not dfd.approximate_dependencies(['a', 'b'], 'c', df, .96, mask)
+
+
+def test_approximate_dependencies_with_nan():
+    mask = dfd.Masks(['a', 'b', 'c'])
+    a = [np.nan, 2, 3, 7, 8, 1, 0, 2, 0, 3, np.nan, 0, 4, np.nan, 8, 7, np.nan, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, np.nan, 4, np.nan, 8]
+    b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
+    c = [True, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
+    df = pd.DataFrame({'a': a, 'b': b, 'c': c})
+    assert dfd.approximate_dependencies(['a', 'b'], 'c', df, 0.9, mask)
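Side note: the counting swap in dfd.py matters for the NaN test above because np.unique compares elements with ==, so on NumPy releases before 1.21 every NaN lands in its own group and inflates the violation count, while value_counts() excludes NaN by default. A small sketch of the difference (np.unique output is version-dependent):

import numpy as np
import pandas as pd

vals = pd.Series([1.0, 1.0, np.nan, np.nan])

# Older NumPy returns each NaN as its own "unique" value.
_, counts = np.unique(vals.to_numpy(), return_counts=True)
print(counts)  # e.g. [2 1 1] before NumPy 1.21

# value_counts() drops NaN by default, so group sizes stay stable.
print(vals.value_counts().tolist())  # [2]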
48 changes: 48 additions & 0 deletions autonormalize/tests/test_normalize.py
@@ -1,6 +1,8 @@
+import numpy as np
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
+import autonormalize as an
 from autonormalize import classes, normalize
 
 # from classes import Dependencies
@@ -178,3 +180,49 @@ def test_make_indexes():
     assert new_dfs[0][new_dfs[1].columns[0]][5] == val
     assert new_dfs[0][new_dfs[1].columns[0]][6] == val
     assert new_dfs[0][new_dfs[1].columns[0]][7] == val
+
+
+def test_make_indexes_improper_column_drop():
+    df_dict = {'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+               'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
+               'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
+               'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
+               'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
+               'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+               'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
+               'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
+               'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub']}
+    df = pd.DataFrame(df_dict)
+
+    deps = classes.Dependencies({'Id': [['LotArea']],
+                                 'MSSubClass': [['LotArea'], ['LotFrontage', 'Utilities'], ['Id']],
+                                 'MSZoning': [['LotFrontage'], ['LotArea'], ['MSSubClass'], ['Id']],
+                                 'LotFrontage': [['LotArea'], ['Id']], 'LotArea': [['Id']],
+                                 'Alley': [['LotFrontage'], ['LandContour'], ['Utilities'], ['MSSubClass'], ['Id'], ['MSZoning'], ['LotArea'], ['LotShape']],
+                                 'LotShape': [['LotFrontage'], ['MSSubClass', 'Utilities', 'LandContour'], ['LotArea'], ['Id']],
+                                 'LandContour': [['LotFrontage'], ['MSSubClass', 'LotShape'], ['LotArea'], ['Id']],
+                                 'Utilities': [['MSSubClass', 'LotShape'], ['LotArea'], ['MSSubClass', 'LotFrontage'], ['Id']]}, ['id'])
+
+    depdf = normalize.DepDF(deps, df, deps.get_prim_key())
+    normalize.normalize_dataframe(depdf)
+    normalize.make_indexes(depdf)
+    new_dfs = depdf.return_dfs()
+
+    assert 'MSSubClass' in new_dfs[0].columns
+
+
+def test_issue19():
+    df_dict = {'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+               'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
+               'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
+               'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
+               'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
+               'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+               'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
+               'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
+               'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub']}
+
+    df = pd.DataFrame(df_dict)
+
+    es = an.auto_entityset(df, accuracy=1.0, name="es")
+    print(es)