Add workaround for nunique bug

alteryx · Feb 9, 2024 · f30393c · f30393c
1 parent 7e5dbfe
commit f30393c
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 0 deletions.
diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py
@@ -822,6 +822,21 @@ def last_n(df):
                     to_merge = base_frame.groupby(groupby_col).agg(to_agg)
 
                 else:
+                    # TODO: Remove when https://github.com/pandas-dev/pandas/issues/57317 is fixed
+                    cols_to_fix = []
+                    for col in base_frame.columns:
+                        dtype = base_frame[col].dtype
+                        if (
+                            isinstance(dtype, pd.CategoricalDtype)
+                            and str(dtype.categories.dtype) == "int64"
+                        ):
+                            cols_to_fix.append(col)
+
+                    if cols_to_fix:
+                        base_frame[cols_to_fix] = base_frame[cols_to_fix].astype(
+                            "int64",
+                        )
+
                     to_merge = base_frame.groupby(
                         base_frame[groupby_col],
                         observed=True,

diff --git a/featuretools/tests/computational_backend/test_feature_set_calculator.py b/featuretools/tests/computational_backend/test_feature_set_calculator.py
@@ -40,6 +40,7 @@
     Trend,
 )
 from featuretools.primitives.base import AggregationPrimitive
+from featuretools.primitives.standard.aggregation.num_unique import NumUnique
 from featuretools.tests.testing_utils import backward_path, to_pandas
 from featuretools.utils import Trie
 from featuretools.utils.gen_utils import Library, import_or_none, is_instance
@@ -1293,3 +1294,26 @@ def error(s):
     # Calculating without precalculated features should error.
     with pytest.raises(RuntimeError, match=error_msg):
         FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)
+
+
+def test_nunique_nested_with_agg_bug(pd_es):
+    """Regression test for the pandas 2.2.0 bug (pandas-dev/pandas#57317) where
+    pd.Series.nunique yields category dtype instead of int64, which causes an
+    error when a second aggregation (here Mean) is applied to the result."""
+    num_unique_feature = AggregationFeature(
+        Feature(pd_es["log"].ww["priority_level"]),
+        "sessions",
+        primitive=NumUnique,
+    )
+
+    mean_nunique_feature = AggregationFeature(
+        num_unique_feature,
+        "customers",
+        primitive=Mean,
+    )
+    feature_set = FeatureSet([mean_nunique_feature])
+    calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set)
+    df = calculator.run(np.array([0]))
+    df = to_pandas(df, index="id")
+
+    assert df.iloc[0, 0].round(4) == 1.6667