Skip to content

Commit

Permalink
Add workaround for nunique bug
Browse files Browse the repository at this point in the history
  • Loading branch information
Tamar Grey committed Feb 9, 2024
1 parent 7e5dbfe commit f30393c
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
15 changes: 15 additions & 0 deletions featuretools/computational_backends/feature_set_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,21 @@ def last_n(df):
to_merge = base_frame.groupby(groupby_col).agg(to_agg)

else:
# TODO: Remove when https://github.com/pandas-dev/pandas/issues/57317 is fixed
cols_to_fix = []
for col in base_frame.columns:
dtype = base_frame[col].dtype
if (
isinstance(dtype, pd.CategoricalDtype)
and str(dtype.categories.dtype) == "int64"
):
cols_to_fix.append(col)

if cols_to_fix:
base_frame[cols_to_fix] = base_frame[cols_to_fix].astype(
"int64",
)

to_merge = base_frame.groupby(
base_frame[groupby_col],
observed=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
Trend,
)
from featuretools.primitives.base import AggregationPrimitive
from featuretools.primitives.standard.aggregation.num_unique import NumUnique
from featuretools.tests.testing_utils import backward_path, to_pandas
from featuretools.utils import Trie
from featuretools.utils.gen_utils import Library, import_or_none, is_instance
Expand Down Expand Up @@ -1293,3 +1294,26 @@ def error(s):
# Calculating without precalculated features should error.
with pytest.raises(RuntimeError, match=error_msg):
FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)


def test_nunique_nested_with_agg_bug(pd_es):
    """Regression test: apply Mean on top of a NumUnique aggregation.

    Pandas 2.2.0 has a bug where pd.Series.nunique produces columns with the
    category dtype instead of the int64 dtype, which causes an error when a
    second aggregation is attempted on that result.
    """
    # Inner aggregation: distinct priority levels, aggregated to "sessions".
    priority_level = Feature(pd_es["log"].ww["priority_level"])
    nunique_per_session = AggregationFeature(
        priority_level,
        "sessions",
        primitive=NumUnique,
    )

    # Outer aggregation: mean of the NumUnique feature, aggregated to
    # "customers". This stacked aggregation is what the bug breaks.
    mean_per_customer = AggregationFeature(
        nunique_per_session,
        "customers",
        primitive=Mean,
    )

    calculator = FeatureSetCalculator(
        pd_es,
        time_last=None,
        feature_set=FeatureSet([mean_per_customer]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    assert result.iloc[0, 0].round(4) == 1.6667

0 comments on commit f30393c

Please sign in to comment.