Support pandas 2.2.0 #2657

Merged: 10 commits, Feb 12, 2024
4 changes: 2 additions & 2 deletions .github/workflows/tests_with_woodwork_main_branch.yaml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        python_version: ["3.8", "3.9", "3.10"]
+        python_version: ["3.9", "3.10", "3.11"]
         libraries: ["core", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"]

     steps:
@@ -62,7 +62,7 @@ jobs:

   slack_alert_failure:
     name: Send Slack alert if failure
-    needs: unit_tests_woodwork_main
+    needs: tests_woodwork_main
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -8,13 +8,15 @@ Future Release
     * Enhancements
     * Fixes
         * Fix dependency issues (:pr:`2644`, :pr:`2656`)
+        * Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`)
     * Changes
     * Documentation Changes
     * Testing Changes
         * Update tests for compatibility with new versions of ``holidays`` (:pr:`2636`)
         * Update ruff to 0.1.6 and use ruff linter/formatter (:pr:`2639`)
         * Update ``release.yaml`` to use trusted publisher for PyPI releases (:pr:`2646`, :pr:`2653`, :pr:`2654`)
         * Update dependency checkers and tests to include Dask (:pr:`2658`)
+        * Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`)


 Thanks to the following people for contributing to this release:
@@ -820,7 +820,6 @@ def last_n(df):
             # work)
             if is_instance(base_frame, (dd, ps), "DataFrame"):
                 to_merge = base_frame.groupby(groupby_col).agg(to_agg)
-
             else:
                 to_merge = base_frame.groupby(
                     base_frame[groupby_col],
14 changes: 11 additions & 3 deletions featuretools/primitives/standard/aggregation/num_unique.py
@@ -11,8 +11,14 @@
 class NumUnique(AggregationPrimitive):
     """Determines the number of distinct values, ignoring `NaN` values.

+    Args:
+        use_string_for_pd_calc (bool): Determines if the string 'nunique' or the function
+            pd.Series.nunique is used for making the primitive calculation. Put in place to
+            account for the bug https://github.com/pandas-dev/pandas/issues/57317.
+            Defaults to using the string.
+
     Examples:
-        >>> num_unique = NumUnique()
+        >>> num_unique = NumUnique(use_string_for_pd_calc=False)
         >>> num_unique(['red', 'blue', 'green', 'yellow'])
         4

@@ -29,6 +35,9 @@ class NumUnique(AggregationPrimitive):
     compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
     description_template = "the number of unique elements in {}"

+    def __init__(self, use_string_for_pd_calc=True):
+        self.use_string_for_pd_calc = use_string_for_pd_calc
+
     def get_function(self, agg_type=Library.PANDAS):
         if agg_type == Library.DASK:

@@ -51,7 +60,6 @@ def finalize(s):

             return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

-        elif agg_type == Library.SPARK:
+        if self.use_string_for_pd_calc:
             return "nunique"

         return pd.Series.nunique
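
For orientation, a minimal plain-Python sketch of what the new flag changes (our illustration, not part of the diff; it assumes NumUnique is importable from featuretools.primitives, as the standard primitives are):

    import pandas as pd

    from featuretools.primitives import NumUnique

    # Default: the primitive hands pandas the string alias, which pandas
    # resolves through its built-in aggregation path and returns int64.
    assert NumUnique().get_function() == "nunique"

    # Opting out returns the raw function, which pandas applies as a UDF;
    # on pandas 2.2.0 that path can produce category-dtype results
    # (pandas-dev/pandas#57317).
    assert NumUnique(use_string_for_pd_calc=False).get_function() is pd.Series.nunique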
@@ -40,6 +40,7 @@
     Trend,
 )
 from featuretools.primitives.base import AggregationPrimitive
+from featuretools.primitives.standard.aggregation.num_unique import NumUnique
 from featuretools.tests.testing_utils import backward_path, to_pandas
 from featuretools.utils import Trie
 from featuretools.utils.gen_utils import Library, import_or_none, is_instance
@@ -1293,3 +1294,26 @@ def error(s):
     # Calculating without precalculated features should error.
     with pytest.raises(RuntimeError, match=error_msg):
         FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)
+
+
+def test_nunique_nested_with_agg_bug(pd_es):
+    """Pandas 2.2.0 has a bug where pd.Series.nunique produces columns with
+    the category dtype instead of the int64 dtype, causing an error when we
+    attempt another aggregation."""
+    num_unique_feature = AggregationFeature(
+        Feature(pd_es["log"].ww["priority_level"]),
+        "sessions",
+        primitive=NumUnique,
+    )
+
+    mean_nunique_feature = AggregationFeature(
+        num_unique_feature,
+        "customers",
+        primitive=Mean,
+    )
+    feature_set = FeatureSet([mean_nunique_feature])
+    calculator = FeatureSetCalculator(pd_es, time_last=None, feature_set=feature_set)
+    df = calculator.run(np.array([0]))
+    df = to_pandas(df, index="id")
+
+    assert df.iloc[0, 0].round(4) == 1.6667
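
The test above exercises the failure through the full feature-calculation stack. Stripped down to plain pandas, the behavior the workaround guards against looks roughly like this (our sketch of the linked pandas issue, with made-up column names, not code from the PR):

    import pandas as pd

    df = pd.DataFrame({
        "session": [0, 0, 1, 1],
        "priority_level": pd.Series(
            ["high", "low", "high", "high"], dtype="category"
        ),
    })

    # String alias: pandas' built-in nunique, result dtype is int64.
    ok = df.groupby("session")["priority_level"].agg("nunique")

    # Function object: applied as a UDF; on pandas 2.2.0 the result can
    # keep the category dtype (pandas-dev/pandas#57317), so a downstream
    # aggregation such as Mean raises.
    buggy = df.groupby("session")["priority_level"].agg(pd.Series.nunique)

    print(ok.dtype, buggy.dtype)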
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "holidays >= 0.17",
     "numpy >= 1.21.0",
     "packaging >= 20.0",
-    "pandas >= 1.5.0,<2.2.0",
+    "pandas >= 1.5.0",
     "psutil >= 5.6.6",
     "scipy >= 1.10.0",
     "tqdm >= 4.32.0",