From f30393c8c789cd93d27e9d8a17c7c7c8d619c646 Mon Sep 17 00:00:00 2001 From: Tamar Grey Date: Fri, 9 Feb 2024 15:43:27 -0500 Subject: [PATCH] Add workaround for nunique bug --- .../feature_set_calculator.py | 15 ++++++++++++ .../test_feature_set_calculator.py | 24 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/featuretools/computational_backends/feature_set_calculator.py b/featuretools/computational_backends/feature_set_calculator.py index d67ce2f7d8..1731dc5f28 100644 --- a/featuretools/computational_backends/feature_set_calculator.py +++ b/featuretools/computational_backends/feature_set_calculator.py @@ -822,6 +822,21 @@ def last_n(df): to_merge = base_frame.groupby(groupby_col).agg(to_agg) else: + # TODO: Remove when https://github.com/pandas-dev/pandas/issues/57317 is fixed + cols_to_fix = [] + for col in base_frame.columns: + dtype = base_frame[col].dtype + if ( + isinstance(dtype, pd.CategoricalDtype) + and str(dtype.categories.dtype) == "int64" + ): + cols_to_fix.append(col) + + if cols_to_fix: + base_frame[cols_to_fix] = base_frame[cols_to_fix].astype( + "int64", + ) + to_merge = base_frame.groupby( base_frame[groupby_col], observed=True, diff --git a/featuretools/tests/computational_backend/test_feature_set_calculator.py b/featuretools/tests/computational_backend/test_feature_set_calculator.py index a0ad533425..68684d203b 100644 --- a/featuretools/tests/computational_backend/test_feature_set_calculator.py +++ b/featuretools/tests/computational_backend/test_feature_set_calculator.py @@ -40,6 +40,7 @@ Trend, ) from featuretools.primitives.base import AggregationPrimitive +from featuretools.primitives.standard.aggregation.num_unique import NumUnique from featuretools.tests.testing_utils import backward_path, to_pandas from featuretools.utils import Trie from featuretools.utils.gen_utils import Library, import_or_none, is_instance @@ -1293,3 +1294,26 @@ def error(s): # Calculating without precalculated features should error. with pytest.raises(RuntimeError, match=error_msg): FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids) + + +def test_nunique_nested_with_agg_bug(pd_es): + """Pandas 2.2.0 has a bug where pd.Series.nunique produces columns with + the category dtype instead of int64 dtype, causing an error when we attempt + another aggregation""" + num_unique_feature = AggregationFeature( + Feature(pd_es["log"].ww["priority_level"]), + "sessions", + primitive=NumUnique, + ) + + mean_nunique_feature = AggregationFeature( + num_unique_feature, + "customers", + primitive=Mean, + ) + feature_set = FeatureSet([mean_nunique_feature]) + calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set) + df = calculator.run(np.array([0])) + df = to_pandas(df, index="id") + + assert df.iloc[0, 0].round(4) == 1.6667