Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix remaining tests for pandas 2 compatibility #28524

Merged
merged 4 commits into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sdks/python/apache_beam/dataframe/frames_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1936,6 +1936,8 @@ def test_groupby_sum_min_count(self):

self._run_test(lambda df: df.groupby('group').sum(min_count=2), df)

@unittest.skipIf(
PD_VERSION >= (2, 0), "dtypes on groups is deprecated in Pandas 2.")
def test_groupby_dtypes(self):
self._run_test(
lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False)
Expand Down Expand Up @@ -2159,6 +2161,7 @@ def test_dataframe_agg_level(self):
level=1, numeric_only=True),
GROUPBY_DF)

@unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
def test_series_agg_multifunc_level(self):
# level= is ignored for multiple agg fns
self._run_test(
Expand All @@ -2181,6 +2184,7 @@ def test_series_mean_skipna(self):
self._run_test(lambda df: df.two.mean(skipna=True), df)
self._run_test(lambda df: df.three.mean(skipna=True), df)

@unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
def test_dataframe_agg_multifunc_level(self):
# level= is ignored for multiple agg fns
self._run_test(
Expand Down
61 changes: 45 additions & 16 deletions sdks/python/apache_beam/dataframe/pandas_doctests_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def test_ndframe_tests(self):
' key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
')'
],
# TODO(https://github.com/apache/beam/issues/28559): Re-enable when
# bug is fixed.
'pandas.core.generic.NDFrame.xs': ['*'],
**skip_writes
})
self.assertEqual(result.failed, 0)
Expand Down Expand Up @@ -296,13 +299,19 @@ def test_dataframe_tests(self):
'pandas.core.frame.DataFrame.value_counts': [
'df.value_counts(dropna=False)'
],

'pandas.core.frame.DataFrame.to_timestamp': ['*']
},
skip={
# DataFrame construction from a dictionary and
# Series requires using the len() function, which
# is a non-deferred operation that we do not allow
# DataFrame construction from a dictionary, Series, or other
# DataFrame requires using the len() function, which is a
# non-deferred operation that we do not allow
'pandas.core.frame.DataFrame': [
'pd.DataFrame(data=d, index=[0, 1, 2, 3])',
'df = pd.DataFrame(data=ser, index=["a", "c"])',
'df',
'df2 = pd.DataFrame(data=df1, index=["a", "c"])',
'df2',
],
# s2 created with reindex
'pandas.core.frame.DataFrame.dot': [
Expand Down Expand Up @@ -361,15 +370,17 @@ def test_dataframe_tests(self):
# actually raise NotImplementedError
'pandas.core.frame.DataFrame.pivot_table': ['*'],
# Expected to raise a ValueError, but we raise NotImplementedError
# pylint: disable=line-too-long
'pandas.core.frame.DataFrame.pivot': [
"df.pivot(index='foo', columns='bar', values='baz')",
"df.pivot(index='foo', columns='bar')['baz']",
"df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])",
# pylint: disable=line-too-long
'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")',
# pylint: disable=line-too-long
'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")'
'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")',
'df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")',
'df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")',
],
# pylint: enable=line-too-long
'pandas.core.frame.DataFrame.append': [
'df',
# pylint: disable=line-too-long
Expand Down Expand Up @@ -511,6 +522,8 @@ def test_series_tests(self):
'ser.groupby(["a", "b", "a", np.nan]).mean()',
'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
],
'pandas.core.series.Series.to_period': ['*'],
'pandas.core.series.Series.to_timestamp': ['*'],
},
skip={
# Relies on setting values with iloc
Expand All @@ -535,6 +548,8 @@ def test_series_tests(self):
'pandas.core.series.Series.idxmin': ['s.idxmin()'],
'pandas.core.series.Series.idxmax': ['s.idxmax()'],
'pandas.core.series.Series.duplicated': ['*'],
# Relies on setting index.
'pandas.core.series.Series.rename_axis': ['*'],
'pandas.core.series.Series.set_axis': ['*'],
'pandas.core.series.Series.nonzero': ['*'],
'pandas.core.series.Series.pop': ['ser'], # testing side effect
Expand Down Expand Up @@ -710,6 +725,7 @@ def test_groupby_tests(self):
'pandas.core.groupby.groupby.GroupBy.nth': ['*'],
'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'],
'pandas.core.groupby.groupby.GroupBy.resample': ['*'],
'pandas.core.groupby.groupby.GroupBy.rolling': ['*'],
},
not_implemented_ok={
'pandas.core.groupby.groupby.GroupBy.first': ['*'],
Expand Down Expand Up @@ -764,16 +780,21 @@ def test_groupby_tests(self):
'df.fillna(method=\'ffill\')',
'df.fillna(method="ffill")',
'df.fillna(value=values, limit=1)',
'df.groupby("key").fillna(method="ffill")',
'df.groupby("key").fillna(method="bfill")',
'df.groupby("key").fillna(method="ffill", limit=1)',
],
'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
'df.fillna(method=\'ffill\')',
'df.fillna(method="ffill")',
'df.fillna(value=values, limit=1)',
],
'pandas.core.groupby.groupby.GroupBy.tail': ['*'],
},
not_implemented_ok={
'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
Expand All @@ -794,14 +815,6 @@ def test_groupby_tests(self):
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.transform': [
# Dropping invalid columns during a transform is unsupported.
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
],
'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
# Dropping invalid columns during a transform is unsupported.
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
],
# Skipped idxmax/idxmin due to an issue with the test framework
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'],
Expand All @@ -811,7 +824,24 @@ def test_groupby_tests(self):
# pylint: disable=line-too-long
"df.groupby('gender', as_index=False).value_counts(normalize=True)",
],
})
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.fillna': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.fillna': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'],
# Named aggregation not supported yet.
'pandas.core.groupby.generic.NamedAgg': [
'df.groupby("key").agg(result_a=agg_a, result_1=agg_1)'
],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'],
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
},
)
self.assertEqual(result.failed, 0)

def test_top_level(self):
Expand Down Expand Up @@ -843,7 +873,6 @@ def test_top_level(self):
'pivot_table': ['*'],
'qcut': ['*'],
'reset_option': ['*'],
'set_eng_float_format': ['*'],
'set_option': ['*'],
'to_numeric': ['*'],
'to_timedelta': ['*'],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def concat(
period_range = _defer_to_pandas('period_range')
pivot = _call_on_first_arg('pivot')
pivot_table = _call_on_first_arg('pivot_table')
set_eng_float_format = _defer_to_pandas('set_eng_float_format')
show_versions = _defer_to_pandas('show_versions')
test = frame_base.wont_implement_method(
pd,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def test_dataframes_with_grouped_index(self):
Record('c', 18, 150)
]

aggregate = lambda df: df.groupby('height').mean()
aggregate = lambda df: df.groupby('height').mean(numeric_only=True)

deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
df_expected = aggregate(pd.DataFrame(data))
Expand Down