From 4b456e23278b2e92b13e5c2bd2a5e621a8057bd1 Mon Sep 17 00:00:00 2001 From: Francisco Alfaro Date: Sun, 3 Sep 2023 17:42:14 -0300 Subject: [PATCH 01/32] new pandas cheat sheet fomats (#54928) * delete README.txt add new README version add alternative Pandas Cheat Sheets learning * modify README.txt * modify README.md 1.1 --- doc/cheatsheet/README.md | 22 ++++++++++++++++++++++ doc/cheatsheet/README.txt | 8 -------- 2 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 doc/cheatsheet/README.md delete mode 100644 doc/cheatsheet/README.txt diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md new file mode 100644 index 0000000000000..6c33de104ed90 --- /dev/null +++ b/doc/cheatsheet/README.md @@ -0,0 +1,22 @@ +# Pandas Cheat Sheet + +The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. +To create the PDF version, within Powerpoint, simply do a "Save As" +and pick "PDF" as the format. + +This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). 
+ +| Topic | PDF | PPT | +|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | | | +| Pandas_Cheat_Sheet_JA | | | + + +**Alternative** + +Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets +developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats. + +| Topic | PDF | Streamlit | Google Colab | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas | | | Open In Colab | diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt deleted file mode 100644 index c57da38b31777..0000000000000 --- a/doc/cheatsheet/README.txt +++ /dev/null @@ -1,8 +0,0 @@ -The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. -To create the PDF version, within Powerpoint, simply do a "Save As" -and pick "PDF" as the format. 
- -This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2]. - -[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf -[2]: https://www.princetonoptimization.com/ From 4d3b536975a285b180b5fc5f6d1b77700ea5d256 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 4 Sep 2023 11:09:41 +0200 Subject: [PATCH 02/32] REGR: rountripping datetime through sqlite doesn't work (#54985) --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/io/sql.py | 2 -- pandas/tests/io/test_sql.py | 7 +++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index a6848dad6e3cd..11b19b1508a71 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 7669d5aa4cea5..2b139f8ca527c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str: adapt_date_iso = lambda val: val.isoformat() adapt_datetime_iso = lambda val: 
val.isoformat() - adapt_datetime_epoch = lambda val: int(val.timestamp()) sqlite3.register_adapter(time, _adapt_time) sqlite3.register_adapter(date, adapt_date_iso) sqlite3.register_adapter(datetime, adapt_datetime_iso) - sqlite3.register_adapter(datetime, adapt_datetime_epoch) convert_date = lambda val: date.fromisoformat(val.decode()) convert_datetime = lambda val: datetime.fromisoformat(val.decode()) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9ec0ba0b12a76..bfa93a4ff910e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2962,6 +2962,13 @@ def test_read_sql_string_inference(self): tm.assert_frame_equal(result, expected) + def test_roundtripping_datetimes(self): + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", self.conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", self.conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): From 31d4d8b547d1872de2fc10351c3f906d68a9c48a Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Mon, 4 Sep 2023 11:52:00 +0200 Subject: [PATCH 03/32] DOC: fix an example in whatsnew/v0.15.2.rst (#54986) fix example in whatsnew/v0.15.2.rst --- doc/source/whatsnew/v0.15.2.rst | 62 ++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index bb7beef449d93..acc5409b86d09 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -24,25 +24,61 @@ API changes - Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though a lexically sorted index will have a better performance. (:issue:`2646`) - .. ipython:: python - :okexcept: - :okwarning: + .. 
code-block:: ipython + + In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1], + ...: 'joe':['x', 'x', 'z', 'y'], + ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) + ...: - df = pd.DataFrame({'jim':[0, 0, 1, 1], - 'joe':['x', 'x', 'z', 'y'], - 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) - df - df.index.lexsort_depth + In [2]: df + Out[2]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 z 0.260476 + y 0.897237 + + [4 rows x 1 columns] + + In [3]: df.index.lexsort_depth + Out[3]: 1 # in prior versions this would raise a KeyError # will now show a PerformanceWarning - df.loc[(1, 'z')] + In [4]: df.loc[(1, 'z')] + Out[4]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] # lexically sorting - df2 = df.sort_index() - df2 - df2.index.lexsort_depth - df2.loc[(1,'z')] + In [5]: df2 = df.sort_index() + + In [6]: df2 + Out[6]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 y 0.897237 + z 0.260476 + + [4 rows x 1 columns] + + In [7]: df2.index.lexsort_depth + Out[7]: 2 + + In [8]: df2.loc[(1,'z')] + Out[8]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] - Bug in unique of Series with ``category`` dtype, which returned all categories regardless whether they were "used" or not (see :issue:`8559` for the discussion). From 982d619bddbf85a905b4ec1e719275e2ab4f833d Mon Sep 17 00:00:00 2001 From: caneff Date: Mon, 4 Sep 2023 10:31:58 -0400 Subject: [PATCH 04/32] TYP: Add typing.overload signatures to DataFrame/Series.interpolate (#54999) * Add inplace overloads for interpolate This will help our type checker work better and is a plain improvement to the type hints. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/core/generic.py | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b9407ebe6624a..671cfc11df597 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7938,6 +7938,51 @@ def replace( else: return result.__finalize__(self, method="replace") + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[False] = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[True], + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> None: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: bool_t = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self | None: + ... 
+ @final def interpolate( self, From e30e5f85c2e28aedb6273116f1e112c1ecc859f0 Mon Sep 17 00:00:00 2001 From: caneff Date: Mon, 4 Sep 2023 13:23:33 -0400 Subject: [PATCH 05/32] TYP: Add typing.overload signatures to DataFrame/Series.clip (#55002) * TYP: Add typing.overload signatures to DataFrame/Series.clip This adds overloads so that a type checker can determine whether clip returns a Series/DataFrame or None based on the value of the inplace argument. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/core/generic.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 671cfc11df597..e9b0c23b18373 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8652,6 +8652,42 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[False] = ..., + **kwargs, + ) -> Self: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: bool_t = ..., + **kwargs, + ) -> Self | None: + ... 
+ @final def clip( self, From f03482094a6f17cedf1e0db3bc474ff4944a518f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Sep 2023 19:52:19 -0400 Subject: [PATCH 06/32] [pre-commit.ci] pre-commit autoupdate (#55004) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.0.285 → v0.0.287](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.285...v0.0.287) - [github.com/jendrikseipp/vulture: v2.7 → v2.9.1](https://github.com/jendrikseipp/vulture/compare/v2.7...v2.9.1) - [github.com/pylint-dev/pylint: v3.0.0a6 → v3.0.0a7](https://github.com/pylint-dev/pylint/compare/v3.0.0a6...v3.0.0a7) - [github.com/sphinx-contrib/sphinx-lint: v0.6.7 → v0.6.8](https://github.com/sphinx-contrib/sphinx-lint/compare/v0.6.7...v0.6.8) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 8 ++++---- asv_bench/benchmarks/array.py | 2 +- asv_bench/benchmarks/join_merge.py | 4 ++-- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/indexes/api.py | 2 +- pandas/tests/frame/methods/test_copy.py | 2 +- pandas/tests/frame/methods/test_reset_index.py | 8 ++++---- pandas/tests/frame/methods/test_sort_index.py | 2 +- pandas/tests/frame/test_constructors.py | 6 +++--- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/indexes/multi/test_partial_indexing.py | 2 +- pandas/tests/indexing/multiindex/test_getitem.py | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/json/test_ujson.py | 2 +- pandas/tests/io/test_parquet.py | 4 ++-- pandas/tests/io/test_stata.py | 2 +- pandas/tests/reshape/test_cut.py | 2 +- pandas/tests/reshape/test_pivot.py | 4 ++-- 
pandas/tests/series/methods/test_reindex.py | 4 ++-- pandas/tests/window/test_rolling_functions.py | 6 +++--- 21 files changed, 35 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f9bcd78c07b0..c01bf65818167 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.285 + rev: v0.0.287 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: alias: ruff-selected-autofixes args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.7' + rev: 'v2.9.1' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -84,7 +84,7 @@ repos: '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a6 + rev: v3.0.0a7 hooks: - id: pylint stages: [manual] @@ -124,7 +124,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.6.7 + rev: v0.6.8 hooks: - id: sphinx-lint - repo: local diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 09c4acc0ab309..0229cf15fbfb8 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks): self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) + indexer = list(range(50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 54bcdb0fa2843..04ac47a892a22 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -360,14 +360,14 @@ class MergeCategoricals: def 
setup(self): self.left_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Y": np.random.choice(["one", "two", "three"], size=(10000,)), } ) self.right_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), } ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f76163cbbd0a1..0589dc5b717a4 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -70,7 +70,7 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: F811, TCH004 + import pyarrow as pa # noqa: TCH004 from pandas._typing import ( Dtype, diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 781dfae7fef64..a8ef0e034ba9b 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -377,5 +377,5 @@ def all_indexes_same(indexes) -> bool: def default_index(n: int) -> RangeIndex: - rng = range(0, n) + rng = range(n) return RangeIndex._simple_new(rng, name=None) diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index 95fcaaa473067..e7901ed363106 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -56,7 +56,7 @@ def test_copy_consolidates(self): } ) - for i in range(0, 10): + for i in range(10): df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55) assert len(df._mgr.blocks) == 11 diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index d99dd36f3a2e3..339e19254fd10 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -788,15 +788,15 @@ def test_errorreset_index_rename(float_frame): def test_reset_index_false_index_name(): - result_series = 
Series(data=range(5, 10), index=range(0, 5)) + result_series = Series(data=range(5, 10), index=range(5)) result_series.index.name = False result_series.reset_index() - expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_series_equal(result_series, expected_series) # GH 38147 - result_frame = DataFrame(data=range(5, 10), index=range(0, 5)) + result_frame = DataFrame(data=range(5, 10), index=range(5)) result_frame.index.name = False result_frame.reset_index() - expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 228b62a418813..985a9e3602410 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -911,7 +911,7 @@ def test_sort_index_multiindex_sparse_column(self): expected = DataFrame( { i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) - for i in range(0, 4) + for i in range(4) }, index=MultiIndex.from_product([[1, 2], [1, 2]]), ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3e2cde37c30eb..fd851ab244cb8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -692,12 +692,12 @@ def test_constructor_error_msgs(self): arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], 
columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): @@ -2391,7 +2391,7 @@ def test_construct_with_two_categoricalindex_series(self): def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index be226b4466f98..1e6d220199e22 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1928,7 +1928,7 @@ def test_pivot_table_values_key_error(): df = DataFrame( { "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(), - "thename": range(0, 20), + "thename": range(20), } ) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c9fe011f7063b..55f96bd1443de 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -842,7 +842,7 @@ def test_grouper_period_index(self): result = period_series.groupby(period_series.index.month).sum() expected = Series( - range(0, periods), index=Index(range(1, periods + 1), name=index.name) + range(periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 47efc43d5eae0..66163dad3deae 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -31,7 +31,7 @@ def df(): dr = date_range("2016-01-01", "2016-01-03", freq="12H") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) - frame = DataFrame({"c1": range(0, 15)}, index=mi) + frame = DataFrame({"c1": 
range(15)}, index=mi) return frame diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 9d11827e2923e..b86e233110e88 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -148,7 +148,7 @@ def test_frame_getitem_simple_key_error( def test_tuple_string_column_names(): # GH#50372 mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]) - df = DataFrame([range(0, 4), range(1, 5), range(2, 6)], columns=mi) + df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi) df["single_index"] = 0 df_flat = df.copy() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ca3ce6ba34515..b3c2e67f7c318 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2044,7 +2044,7 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): ) if orient == "values": - expected.columns = list(range(0, 8)) + expected.columns = list(range(8)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 5bb7097770820..d5f8c5200c4a3 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1033,7 +1033,7 @@ def test_decode_floating_point(self, sign, float_number): def test_encode_big_set(self): s = set() - for x in range(0, 100000): + for x in range(100000): s.add(x) # Make sure no Exception is raised. 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index db3909c147ad3..55445e44b9366 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1012,7 +1012,7 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") - df = pd.DataFrame({"a": list(range(0, 3))}) + df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) result = read_parquet( @@ -1219,7 +1219,7 @@ def test_categorical(self, fp): check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {"a": list(range(0, 3))} + d = {"a": list(range(3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 7459aa1df8f3e..cd504616b6c5d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -798,7 +798,7 @@ def test_missing_value_generator(self): expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] - for i in range(0, 27): + for i in range(27): val = StataMissingValue(offset + 1 + i) assert val.string == expected_values[i] diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index b2a6ac49fdff2..81b466b059702 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -700,7 +700,7 @@ def test_cut_with_duplicated_index_lowest_included(): def test_cut_with_nonexact_categorical_indices(): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46da18445e135..c43fd05fd5501 100644 --- 
a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -33,7 +33,7 @@ def dropna(request): return request.param -@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) +@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))]) def interval_values(request, closed): left, right = request.param return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) @@ -215,7 +215,7 @@ def test_pivot_table_dropna_categoricals(self, dropna): { "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], - "C": range(0, 9), + "C": range(9), } ) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index bce7d2d554004..016208f2d2026 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -159,9 +159,9 @@ def test_reindex_inference(): def test_reindex_downcasting(): # GH4618 shifted series downcasting - s = Series(False, index=range(0, 5)) + s = Series(False, index=range(5)) result = s.shift(1).bfill() - expected = Series(False, index=range(0, 5)) + expected = Series(False, index=range(5)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 940f0845befa2..51f801ab3761b 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -388,7 +388,7 @@ def test_rolling_max_resample(step): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -425,7 +425,7 @@ def test_rolling_min_resample(step): # So that we can have 3 datapoints on last 
day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -445,7 +445,7 @@ def test_rolling_median_resample(): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically From 4683e920434cbc2ee9e797e106d616c74f72afd0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 5 Sep 2023 20:07:21 +0200 Subject: [PATCH 07/32] BUG: ArrowDtype raising for fixed size list (#55000) * BUG: ArrowDtype raising for fixed size list * Update v2.1.1.rst * Update test_arrow.py --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/dtypes/dtypes.py | 2 ++ pandas/tests/extension/test_arrow.py | 9 +++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 11b19b1508a71..64d7481117e8e 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -29,6 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 0589dc5b717a4..12de63967c78f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2148,6 +2148,8 @@ def type(self): return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list + elif pa.types.is_fixed_size_list(pa_type): + return list elif pa.types.is_map(pa_type): return list elif pa.types.is_struct(pa_type): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5f1b16a44b8e9..fa6e85ba204d2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2992,6 +2992,15 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +def test_fixed_size_list(): + # GH#55000 + ser = pd.Series( + [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2)) + ) + result = ser.dtype.type + assert result == list + + def test_arrowextensiondtype_dataframe_repr(): # GH 54062 df = pd.DataFrame( From dac46b4652cd94bf6b873af372df3f1544796d5e Mon Sep 17 00:00:00 2001 From: mhb143 <139927657+mhb143@users.noreply.github.com> Date: Tue, 5 Sep 2023 12:09:45 -0600 Subject: [PATCH 08/32] DOC: Grammatically updated the tech docs (#54989) Grammatically updated the tech docs Co-authored-by: Molly Bowers --- .../getting_started/intro_tutorials/01_table_oriented.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index 2dcc8b0abe3b8..caaff3557ae40 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -106,9 +106,9 @@ between square brackets ``[]``. .. 
note:: - If you are familiar to Python + If you are familiar with Python :ref:`dictionaries `, the selection of a - single column is very similar to selection of dictionary values based on + single column is very similar to the selection of dictionary values based on the key. You can create a ``Series`` from scratch as well: From 5b02305db6647d030ec66788a8ca9f37fe9a2790 Mon Sep 17 00:00:00 2001 From: Paul Uhlenbruck <48606747+pauluhlenbruck@users.noreply.github.com> Date: Tue, 5 Sep 2023 20:12:12 +0200 Subject: [PATCH 09/32] DOC: expanded pandas.DataFrame.to_sql docstring (#54988) expanded pandas.DataFrame.to_sql docstring Co-authored-by: vboxuser --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e9b0c23b18373..06284b05ba1b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2847,7 +2847,7 @@ def to_sql( index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column - name in the table. + name in the table. Creates a table index for this column. index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. 
From 00bf889426ac40426cec0b4d71eb361a24ad1a8f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 5 Sep 2023 20:29:50 +0200 Subject: [PATCH 10/32] ENH: Use more arrow compute functions for string[pyarrow] dtype (#54957) --- pandas/core/arrays/string_arrow.py | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index aaa515ac459bd..60d7ae1b998f5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -417,7 +417,7 @@ def _str_isupper(self): def _str_len(self): result = pc.utf8_length(self._pa_array) - return Int64Dtype().__from_arrow__(result) + return self._convert_int_dtype(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -446,6 +446,29 @@ def _str_rstrip(self, to_strip=None): result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat) + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result) + + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" @@ -526,34 +549,11 @@ def _str_map( return 
lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result - def _str_count(self, pat: str, flags: int = 0): - if flags: - return super()._str_count(pat, flags) - result = pc.count_substring_regex(self._pa_array, pat).to_numpy() - return self._convert_int_dtype(result) - - def _str_len(self): - result = pc.utf8_length(self._pa_array).to_numpy() - return self._convert_int_dtype(result) - - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - return super()._str_find(sub, start, end) - return self._convert_int_dtype(result.to_numpy()) - def _cmp_method(self, other, op): result = super()._cmp_method(other, op) return result.to_numpy(np.bool_, na_value=False) From e1ec244688b80b52456264285033659bed01ffcd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 5 Sep 2023 20:43:25 +0200 Subject: [PATCH 11/32] REGR: interpolate raising if fill_value is given (#54927) * REGR: interpolate raising if fill_value is given * Update test and message * Update pandas/core/generic.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 5 +++-- pandas/tests/series/methods/test_interpolate.py | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 64d7481117e8e..6f431ca7eea5f 100644 --- 
a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) +- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 06284b05ba1b1..e6bf55a1cbadf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8225,10 +8225,11 @@ def interpolate( stacklevel=find_stack_level(), ) - if "fill_value" in kwargs: + if method in fillna_methods and "fill_value" in kwargs: raise ValueError( "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate" + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" ) if isinstance(obj.index, MultiIndex) and method != "linear": diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 619690f400d98..549f429f09d35 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self): with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg2): ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + 
expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) From e7a1f9ddc88a32fe1c9f4379f9344595137f8d20 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Tue, 5 Sep 2023 19:44:58 +0100 Subject: [PATCH 12/32] BUG: Fix Rolling where duplicate datetimelike indexes are treated as consecutive rather than equal with closed='left' and closed='neither' (#54917) * Add bugfix for rolling window with nonunique datetimelike index * Run black * Add entry to whatsnew * Fix VariableOffsetWindowIndexer * Simplify change in indexers.pyx * Add test --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/window/indexers.pyx | 2 + pandas/core/indexers/objects.py | 4 +- pandas/tests/window/test_groupby.py | 38 +++++++++------- pandas/tests/window/test_rolling.py | 70 +++++++++++++++++++++++++++++ 5 files changed, 98 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 89b4d102fcf04..bd15d5fa085e9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -168,6 +168,7 @@ Performance improvements Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) Categorical ^^^^^^^^^^^ diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 02934346130a5..7b306c5e681e0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,6 +138,8 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: diff --git 
a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..c13ec51ff3851 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -262,7 +262,9 @@ def get_window_bounds( # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ab00e18fc4812..46ab00c3e2284 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) @@ -503,10 +510,9 @@ def 
test_groupby_subset_rolling_subset_with_closed(self): .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f4d903dc19fb7..a02f132e540ac 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), + ], +) +def test_variable_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + result = df.rolling("2D", closed=closed).sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + 
"2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) From a317995665c5185a7a31316555d34fe5653941fc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 5 Sep 2023 20:45:42 +0200 Subject: [PATCH 13/32] REGR: concat raising for 2 different ea dtypes (#54914) * REGR: concat raising for 2 different ea dtypes * Update --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/internals/concat.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 6f431ca7eea5f..258f05d4277bd 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`concat` when :class:`DataFrame` 's have two different extension dtypes (:issue:`54848`) - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4d33f0137d3c4..b2d463a8c6c26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -177,7 +177,7 @@ def concatenate_managers( values = np.concatenate(vals, axis=1) # type: ignore[arg-type] elif is_1d_only_ea_dtype(blk.dtype): # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals, axis=1, ea_compat_axis=True) + values = concat_compat(vals, axis=0, ea_compat_axis=True) values = ensure_block_shape(values, ndim=2) else: values = concat_compat(vals, axis=1) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 3efcd930af581..5dde863f246d1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category(): ) expected = expected.set_index(["c1", "c2"]) tm.assert_frame_equal(result, expected) + + +def test_concat_ea_upcast(): + # GH#54848 + df1 = DataFrame(["a"], dtype="string") + df2 = DataFrame([1], dtype="Int64") + result = concat([df1, df2]) + expected = DataFrame(["a", 1], index=[0, 0]) + tm.assert_frame_equal(result, expected) From e6814133c3df0948d5fb10a01074cea569a852fe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Sep 2023 10:07:09 -1000 Subject: [PATCH 14/32] CI: Ignore hypothesis differing executors (#55013) --- pandas/conftest.py | 
7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a4f58e99d8bcc..ac0275bf695d4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -71,6 +71,7 @@ Index, MultiIndex, ) +from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None: item.add_marker(pytest.mark.arraymanager) +hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] +if Version(hypothesis.__version__) >= Version("6.83.2"): + hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) + # Hypothesis hypothesis.settings.register_profile( "ci", @@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None: # 2022-02-09: Changed deadline from 500 -> None. Deadline leads to # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969) deadline=None, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), + suppress_health_check=tuple(hypothesis_health_checks), ) hypothesis.settings.load_profile("ci") From 1b3ebe4656fca6607738851e46eaa53a9f970293 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 6 Sep 2023 00:32:36 +0200 Subject: [PATCH 15/32] Include pyarrow_numpy string in efficient merge implementation (#54974) Include pyarrow_numpy string in efficient merge implementation --- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/merge/test_merge.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ef3943ab0d8d..5b07a0010acdd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2421,7 +2421,8 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) and lk.dtype.storage == 
"pyarrow" + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 9cada6964c094..4659c16909ed7 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2949,13 +2949,13 @@ def test_merge_ea_int_and_float_numpy(): tm.assert_frame_equal(result, expected.astype("float64")) -def test_merge_arrow_string_index(): +def test_merge_arrow_string_index(any_string_dtype): # GH#54894 pytest.importorskip("pyarrow") - left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]") - right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]")) + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) result = left.merge(right, left_on="a", right_index=True, how="left") expected = DataFrame( - {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]} + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected) From da849a95646717013c93838cc798dcc31aee2290 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 6 Sep 2023 00:41:51 +0200 Subject: [PATCH 16/32] REG: filter not respecting the order of labels (#54982) --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 8 +++++--- pandas/tests/frame/methods/test_filter.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 258f05d4277bd..b9bdb36fe0ed3 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) - Fixed regression in 
:meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e6bf55a1cbadf..8c1406fc305e3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5718,10 +5718,12 @@ def filter( if items is not None: name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) # error: Keywords must be strings - return self.reindex( # type: ignore[misc] - **{name: labels.intersection(items)} - ) + return self.reindex(**{name: items}) # type: ignore[misc] elif like: def f(x) -> bool_t: diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 1a2fbf8a65a55..9d5e6876bb08c 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -137,3 +137,17 @@ def test_filter_regex_non_string(self): result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 
2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) From 876d7858721b4f364f431d38f849594aa0eadc7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:00:58 +0200 Subject: [PATCH 17/32] Enable Arrow implementation for removeprefix (#54972) --- pandas/core/arrays/arrow/array.py | 10 +++++----- pandas/core/arrays/string_arrow.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d887ecd1510f..83ed54c42a23c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2192,11 +2192,11 @@ def _str_rstrip(self, to_strip=None): return type(self)(result) def _str_removeprefix(self, prefix: str): - # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed - # starts_with = pc.starts_with(self._pa_array, pattern=prefix) - # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - # result = pc.if_else(starts_with, removed, self._pa_array) - # return type(self)(result) + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) predicate = lambda val: val.removeprefix(prefix) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 60d7ae1b998f5..338724d405ad8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -15,7 +15,10 @@ lib, missing as libmissing, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + pa_version_under7p0, + pa_version_under13p0, +) from pandas.util._exceptions import find_stack_level from 
pandas.core.dtypes.common import ( @@ -446,6 +449,20 @@ def _str_rstrip(self, to_strip=None): result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + return super()._str_removeprefix(prefix) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) From a7005e0343cdfa682593400ed96fc72ddad629a1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 6 Sep 2023 13:12:29 -0400 Subject: [PATCH 18/32] BUG: merge with left and/or right empty returning mis-ordered columns (#55028) --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/reshape/merge.py | 7 +--- pandas/tests/reshape/merge/test_merge.py | 47 +++++++++++++++++------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index bd15d5fa085e9..4f38d420a53b4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -246,7 +246,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Sparse diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5b07a0010acdd..6d1ff07e07c76 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1272,12 +1272,7 @@ def _get_merge_keys( # work-around for merge_asof(right_index=True) 
right_keys.append(right.index._values) if lk is not None and lk == rk: # FIXME: what about other NAs? - # avoid key upcast in corner case (length-0) - lk = cast(Hashable, lk) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) + right_drop.append(rk) else: rk = cast(ArrayLike, rk) right_keys.append(rk) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4659c16909ed7..37ccfddfc82cd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -582,11 +582,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): df_empty = df[:0] expected = DataFrame( { - "value_x": Series(dtype=df.dtypes["value"]), "key": Series(dtype=df.dtypes["key"]), + "value_x": Series(dtype=df.dtypes["value"]), "value_y": Series(dtype=df.dtypes["value"]), }, - columns=["value_x", "key", "value_y"], + columns=["key", "value_x", "value_y"], ) actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) @@ -889,13 +889,13 @@ def test_merge_on_datetime64tz_empty(self): result = left.merge(right, on="date") expected = DataFrame( { + "date": Series(dtype=dtz), "value_x": Series(dtype=float), "date2_x": Series(dtype=dtz), - "date": Series(dtype=dtz), "value_y": Series(dtype=float), "date2_y": Series(dtype=dtz), }, - columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + columns=["date", "value_x", "date2_x", "value_y", "date2_y"], ) tm.assert_frame_equal(result, expected) @@ -1827,11 +1827,9 @@ def test_merge_empty(self, left_empty, how, exp): if exp == "left": expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) elif exp == "right": - expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]}) elif exp == "empty": expected = DataFrame(columns=["A", "B", "C"], dtype="int64") - if left_empty: - expected = expected[["B", "A", "C"]] elif exp == "empty_cross": expected = 
DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") @@ -2481,14 +2479,12 @@ def test_merge_multiindex_columns(): result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) # Constructing the expected results - expected_labels = [letter + l_suf for letter in letters] + [ - letter + r_suf for letter in letters - ] - expected_index = MultiIndex.from_product( - [expected_labels, numbers], names=["outer", "inner"] - ) + tuples = [(letter + l_suf, num) for letter in letters for num in numbers] + tuples += [("id", "")] + tuples += [(letter + r_suf, num) for letter in letters for num in numbers] + + expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - expected["id"] = "" tm.assert_frame_equal(result, expected) @@ -2959,3 +2955,26 @@ def test_merge_arrow_string_index(any_string_dtype): {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("left_empty", [True, False]) +@pytest.mark.parametrize("right_empty", [True, False]) +def test_merge_empty_frames_column_order(left_empty, right_empty): + # GH 51929 + df1 = DataFrame(1, index=[0], columns=["A", "B"]) + df2 = DataFrame(1, index=[0], columns=["A", "C", "D"]) + + if left_empty: + df1 = df1.iloc[:0] + if right_empty: + df2 = df2.iloc[:0] + + result = merge(df1, df2, on=["A"], how="outer") + expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + if left_empty and right_empty: + expected = expected.iloc[:0] + elif left_empty: + expected.loc[:, "B"] = np.nan + elif right_empty: + expected.loc[:, ["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected) From e5f81ac8a2645316a42db6348d2e5dc699f10783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 6 Sep 2023 13:16:27 -0400 Subject: [PATCH 19/32] TYP: fix a few types (#54976) * TYP: fix a few types * namespace test * read_fwf overloads * Revert "namespace test" This 
reverts commit 0f72079f229db7e243784ee65c2e968db5f7e2ff. * revert util and move kwds * isort --- pandas/core/frame.py | 24 ++++++++++++------ pandas/core/generic.py | 7 ++++-- pandas/io/excel/_base.py | 31 +++++++++-------------- pandas/io/formats/excel.py | 4 +-- pandas/io/json/_json.py | 5 ++-- pandas/io/parsers/readers.py | 49 ++++++++++++++++++++++++++++++++++++ 6 files changed, 86 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4bfa8a4415785..a731cdbf99b0e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1926,11 +1926,17 @@ def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., into: type[dict] = ..., + index: bool = ..., ) -> dict: ... @overload - def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: + def to_dict( + self, + orient: Literal["records"], + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... @deprecate_nonkeyword_arguments( @@ -11297,7 +11303,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: def any( # type: ignore[override] self, *, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11312,7 +11318,7 @@ def any( # type: ignore[override] @doc(make_doc("all", ndim=2)) def all( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11711,6 +11717,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series: ... @@ -11721,6 +11728,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... 
@@ -11731,6 +11739,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11830,11 +11839,10 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - # error: List item 0 has incompatible type "Union[float, Union[Union[ - # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; - # expected "float" - res_df = self.quantile( # type: ignore[call-overload] - [q], + # error: List item 0 has incompatible type "float | ExtensionArray | + # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" + res_df = self.quantile( + [q], # type: ignore[list-item] axis=axis, numeric_only=numeric_only, interpolation=interpolation, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8c1406fc305e3..975fbaf59df5c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11827,7 +11827,7 @@ def _logical_func( self, name: str, func, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -11840,7 +11840,10 @@ def _logical_func( res = self._logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) - return res._logical_func(name, func, skipna=skipna, **kwargs) + # error: Item "bool" of "Series | bool" has no attribute "_logical_func" + return res._logical_func( # type: ignore[union-attr] + name, func, skipna=skipna, **kwargs + ) elif axis is None: axis = 0 diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9ffbfb9f1149f..b4b0f29019c31 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc from collections.abc import ( Hashable, Iterable, @@ -549,7 +548,7 @@ def read_excel( _WorkbookT = TypeVar("_WorkbookT") -class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class 
BaseExcelReader(Generic[_WorkbookT]): book: _WorkbookT def __init__( @@ -589,13 +588,11 @@ def __init__( ) @property - @abc.abstractmethod def _workbook_class(self) -> type[_WorkbookT]: - pass + raise NotImplementedError - @abc.abstractmethod def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT: - pass + raise NotImplementedError def close(self) -> None: if hasattr(self, "book"): @@ -611,21 +608,17 @@ def close(self) -> None: self.handles.close() @property - @abc.abstractmethod def sheet_names(self) -> list[str]: - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_name(self, name: str): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_index(self, index: int): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_data(self, sheet, rows: int | None = None): - pass + raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: n_sheets = len(self.sheet_names) @@ -940,7 +933,7 @@ def parse( @doc(storage_options=_shared_docs["storage_options"]) -class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class ExcelWriter(Generic[_WorkbookT]): """ Class for writing DataFrame objects into excel sheets. @@ -1178,20 +1171,19 @@ def engine(self) -> str: return self._engine @property - @abc.abstractmethod def sheets(self) -> dict[str, Any]: """Mapping of sheet names to sheet objects.""" + raise NotImplementedError @property - @abc.abstractmethod def book(self) -> _WorkbookT: """ Book instance. Class type will depend on the engine used. This attribute can be used to access engine-specific features. """ + raise NotImplementedError - @abc.abstractmethod def _write_cells( self, cells, @@ -1214,12 +1206,13 @@ def _write_cells( freeze_panes: int tuple of length 2 contains the bottom-most row and right-most column to freeze """ + raise NotImplementedError - @abc.abstractmethod def _save(self) -> None: """ Save workbook to disk. 
""" + raise NotImplementedError def __init__( self, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 9970d465ced9d..b344d9849f16c 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -941,9 +941,7 @@ def write( if isinstance(writer, ExcelWriter): need_save = False else: - # error: Cannot instantiate abstract class 'ExcelWriter' with abstract - # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' - writer = ExcelWriter( # type: ignore[abstract] + writer = ExcelWriter( writer, engine=engine, storage_options=storage_options, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 833f4986b6da6..52ea072d1483f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -82,6 +82,7 @@ JSONEngine, JSONSerializable, ReadBuffer, + Self, StorageOptions, WriteBuffer, ) @@ -1056,7 +1057,7 @@ def close(self) -> None: if self.handles is not None: self.handles.close() - def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]: + def __iter__(self) -> Self: return self @overload @@ -1099,7 +1100,7 @@ def __next__(self) -> DataFrame | Series: else: return obj - def __enter__(self) -> JsonReader[FrameSeriesStrT]: + def __enter__(self) -> Self: return self def __exit__( diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 10d3ab230cb9d..e0f171035e89e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1307,6 +1307,51 @@ def read_table( return _read(filepath_or_buffer, kwds) +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds, +) -> TextFileReader: + ... 
+ + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: bool = ..., + chunksize: int, + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds, +) -> DataFrame: + ... + + def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1314,6 +1359,8 @@ def read_fwf( widths: Sequence[int] | None = None, infer_nrows: int = 100, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + iterator: bool = False, + chunksize: int | None = None, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1412,6 +1459,8 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + kwds["iterator"] = iterator + kwds["chunksize"] = chunksize check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend From f87b7e309ff34541158fe06cc0c915c09fa37c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Lucas=20Mayer?= Date: Wed, 6 Sep 2023 14:17:32 -0300 Subject: [PATCH 20/32] TST: add test case of ngroup with NaN value (#54966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add test case of ngroup with nan value Co-authored-by: José Lucas Silva Mayer Co-authored-by: Willian Wang * fix linter issues Co-authored-by: José Lucas Silva Mayer Co-authored-by: Willian Wang * use Categorical object instead of pd.Categorical Co-authored-by: José Lucas Silva Mayer Co-authored-by: Willian Wang 
* use native assert function Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * test full result of ngroup method Signed-off-by: José Lucas Silva Mayer --------- Signed-off-by: José Lucas Silva Mayer Co-authored-by: Willian Wang Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/groupby/test_groupby.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1e6d220199e22..999a03d18644d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3189,6 +3189,14 @@ def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): tm.assert_equal(result, expected) +def test_groupby_ngroup_with_nan(): + # GH#50100 + df = DataFrame({"a": Categorical([np.nan]), "b": [1]}) + result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup() + expected = Series([0]) + tm.assert_series_equal(result, expected) + + def test_get_group_axis_1(): # GH#54858 df = DataFrame( From 6cb1da95fcd2df9689f5c957a85c299d67b9aec9 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Wed, 6 Sep 2023 18:19:30 +0100 Subject: [PATCH 21/32] TST: Use (unused) window parameter of test_freq_window_not_implemented (#54947) * Use window parameter of test_freq_window_not_implemented * Revert change in exception message --- pandas/tests/window/test_rolling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index a02f132e540ac..3fe922539780d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -100,9 +100,9 @@ def test_freq_window_not_implemented(window): index=date_range("2015-12-24", periods=10, freq="D"), ) with pytest.raises( - NotImplementedError, match="step is not supported with frequency windows" + NotImplementedError, match="^step (not 
implemented|is not supported)" ): - df.rolling("3D", step=3) + df.rolling(window, step=3).sum() @pytest.mark.parametrize("agg", ["cov", "corr"]) From 3e1dc77866d3313f85564e7c67f8e6f7339c2cc6 Mon Sep 17 00:00:00 2001 From: David Poznik Date: Wed, 6 Sep 2023 13:06:14 -0700 Subject: [PATCH 22/32] DOC: Add missing word to `IndexOpsMixin.array` docstring (#55034) Add missing word to `IndexOpsMixin.array` docstring --- pandas/core/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index d973f8f5fe35a..3026189e747bb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -485,8 +485,8 @@ def array(self) -> ExtensionArray: types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. - ``.array`` differs ``.values`` which may require converting the - data to a different form. + ``.array`` differs from ``.values``, which may require converting + the data to a different form. 
See Also -------- From 1aa885730ae3e01bb7123059d59220e67012343b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Sep 2023 10:07:06 -1000 Subject: [PATCH 23/32] TST: Use more explicit object names (#55033) --- pandas/tests/frame/methods/test_reindex.py | 24 +++++--- pandas/tests/indexes/ranges/test_range.py | 55 ++++++++++--------- pandas/tests/indexing/test_categorical.py | 11 ++-- .../indexing/test_chaining_and_caching.py | 6 +- pandas/tests/io/formats/test_info.py | 6 +- pandas/tests/io/formats/test_series_info.py | 6 +- pandas/tests/reshape/merge/test_merge.py | 18 +++--- pandas/tests/reshape/test_cut.py | 12 ++-- pandas/tests/reshape/test_pivot.py | 8 ++- pandas/tests/reshape/test_qcut.py | 10 ++-- pandas/tests/test_algos.py | 4 +- 11 files changed, 92 insertions(+), 68 deletions(-) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..56bdd2fc664cc 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -26,7 +26,7 @@ isna, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype class TestReindexSetIndex: @@ -1082,7 +1082,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(3, dtype="int64"), }, - index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("abc"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # reindexing @@ -1111,13 +1113,13 @@ def test_reindex_with_categoricalindex(self): result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], 
categories=cats)) expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1138,13 +1140,19 @@ def test_reindex_with_categoricalindex(self): # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + { + "A": [0, np.nan], + "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + { + "A": [0, np.nan], + "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1152,7 +1160,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 5f137df281fa3..132704434829e 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -10,9 +10,6 @@ ) import pandas._testing as tm -# aliases to make some tests easier to read -RI = RangeIndex - class TestRangeIndex: @pytest.fixture @@ -507,25 +504,31 @@ def test_len_specialised(self, step): @pytest.mark.parametrize( "indices, expected", [ - ([RI(1, 12, 5)], RI(1, 12, 5)), - ([RI(0, 6, 4)], RI(0, 6, 4)), 
- ([RI(1, 3), RI(3, 7)], RI(1, 7)), - ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), - ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), - ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), - ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), - ([RI(-4, -8), RI(3, -4)], RI(0, 0)), - ([RI(-4, -8), RI(3, 5)], RI(3, 5)), - ([RI(-4, -2), RI(3, 5)], Index([-4, -3, 3, 4])), - ([RI(-2), RI(3, 5)], RI(3, 5)), - ([RI(2), RI(2)], Index([0, 1, 0, 1])), - ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), - ([RI(2), RI(3, 5), RI(5, 8, 4)], Index([0, 1, 3, 4, 5])), - ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), - ([RI(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), - ([RI(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), - ([RI(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), - ([RI(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), + ([RangeIndex(1, 12, 5)], RangeIndex(1, 12, 5)), + ([RangeIndex(0, 6, 4)], RangeIndex(0, 6, 4)), + ([RangeIndex(1, 3), RangeIndex(3, 7)], RangeIndex(1, 7)), + ([RangeIndex(1, 5, 2), RangeIndex(5, 6)], RangeIndex(1, 6, 2)), + ([RangeIndex(1, 3, 2), RangeIndex(4, 7, 3)], RangeIndex(1, 7, 3)), + ([RangeIndex(-4, 3, 2), RangeIndex(4, 7, 2)], RangeIndex(-4, 7, 2)), + ([RangeIndex(-4, -8), RangeIndex(-8, -12)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, -4)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(-4, -2), RangeIndex(3, 5)], Index([-4, -3, 3, 4])), + ([RangeIndex(-2), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(2), RangeIndex(2)], Index([0, 1, 0, 1])), + ([RangeIndex(2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], RangeIndex(0, 6)), + ( + [RangeIndex(2), RangeIndex(3, 5), RangeIndex(5, 8, 4)], + Index([0, 1, 3, 4, 5]), + ), + ( + [RangeIndex(-2, 2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], + RangeIndex(-2, 6), + ), + ([RangeIndex(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), + ([RangeIndex(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), + 
([RangeIndex(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), + ([RangeIndex(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), ], ) def test_append(self, indices, expected): @@ -567,7 +570,7 @@ def test_format_empty(self): assert empty_idx.format(name=True) == [""] @pytest.mark.parametrize( - "RI", + "ri", [ RangeIndex(0, -1, -1), RangeIndex(0, 1, 1), @@ -576,10 +579,10 @@ def test_format_empty(self): RangeIndex(-3, -5, -2), ], ) - def test_append_len_one(self, RI): + def test_append_len_one(self, ri): # GH39401 - result = RI.append([]) - tm.assert_index_equal(result, RI, exact=True) + result = ri.append([]) + tm.assert_index_equal(result, ri, exact=True) @pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])]) def test_isin_range(self, base): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index b45d197af332e..d3a6d4bf7cebf 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -16,7 +16,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT @pytest.fixture @@ -25,7 +24,9 @@ def df(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B" + ), ) @@ -35,13 +36,15 @@ def df2(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) class TestCategoricalIndex: def test_loc_scalar(self, df): - dtype = CDT(list("cab")) + dtype = CategoricalDtype(list("cab")) result = df.loc["a"] bidx = Series(list("aaa"), name="B").astype(dtype) assert bidx.dtype == dtype diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 
f36fdf0d36ea9..7353b5ef76ba3 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -1,4 +1,4 @@ -from string import ascii_letters as letters +from string import ascii_letters import numpy as np import pytest @@ -24,9 +24,9 @@ def random_text(nobs=100): # Construct a DataFrame where each row is a random slice from 'letters' - idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2)) + idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2)) idxs.sort(axis=1) - strings = [letters[x[0] : x[1]] for x in idxs] + strings = [ascii_letters[x[0] : x[1]] for x in idxs] return DataFrame(strings, columns=["letters"]) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 73de2b068b699..6c3bf01cb1857 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -1,6 +1,6 @@ from io import StringIO import re -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import sys import textwrap @@ -452,9 +452,9 @@ def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 - M = len(uppercase) + M = len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) df = DataFrame( diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py index 02827ee25042a..29dd704f6efa9 100644 --- a/pandas/tests/io/formats/test_series_info.py +++ b/pandas/tests/io/formats/test_series_info.py @@ -1,5 +1,5 @@ from io import StringIO -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import textwrap import numpy as np @@ -165,9 +165,9 @@ def test_info_memory_usage_bug_on_multiindex(): # GH 14308 # memory usage introspection should not materialize .values N = 100 - M = len(uppercase) + M 
= len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) s = Series(np.random.default_rng(2).standard_normal(N * M), index=index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 37ccfddfc82cd..d889ae2e4806b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -26,7 +26,6 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import ( MergeError, @@ -1842,7 +1841,7 @@ def left(): { "X": Series( np.random.default_rng(2).choice(["foo", "bar"], size=(10,)) - ).astype(CDT(["foo", "bar"])), + ).astype(CategoricalDtype(["foo", "bar"])), "Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)), } ) @@ -1851,7 +1850,10 @@ def left(): @pytest.fixture def right(): return DataFrame( - {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + { + "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } ) @@ -2002,8 +2004,8 @@ def test_other_columns(self, left, right): "change", [ lambda x: x, - lambda x: x.astype(CDT(["foo", "bar", "bah"])), - lambda x: x.astype(CDT(ordered=True)), + lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])), + lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) def test_dtype_on_merged_different(self, change, join_type, left, right): @@ -2110,11 +2112,13 @@ def test_merging_with_bool_or_int_cateorical_column( # GH 17187 # merging with a boolean/int categorical column df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) - df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered)) df2 = DataFrame({"id": [2, 4], "num": 
[1, 9]}) result = df1.merge(df2) expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]}) - expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) + expected["cat"] = expected["cat"].astype( + CategoricalDtype(categories, ordered=ordered) + ) tm.assert_frame_equal(expected, result) def test_merge_on_int_array(self): diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 81b466b059702..3a284f7732ac1 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -21,7 +21,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype import pandas.core.reshape.tile as tmod @@ -359,7 +359,7 @@ def test_cut_return_intervals(): IntervalIndex.from_breaks(exp_bins, closed="right").take( [0, 0, 0, 1, 1, 1, 2, 2, 2] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -370,7 +370,7 @@ def test_series_ret_bins(): expected = Series( IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -445,7 +445,7 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) @@ -491,7 +491,7 @@ def test_datetime_cut(data): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(Series(result), expected) @@ -534,7 +534,7 @@ def test_datetime_tz_cut(bins, box): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 
c43fd05fd5501..28ad133a0c8d6 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -23,7 +23,7 @@ date_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.core.reshape import reshape as reshape_lib from pandas.core.reshape.pivot import pivot_table @@ -219,10 +219,12 @@ def test_pivot_table_dropna_categoricals(self, dropna): } ) - df["A"] = df["A"].astype(CDT(categories, ordered=False)) + df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = Series(["a", "b", "c"], name="A") - expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_columns = expected_columns.astype( + CategoricalDtype(categories, ordered=False) + ) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 907eeca6e9b5e..bcfbe5ed1aa20 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -20,7 +20,7 @@ timedelta_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.tseries.offsets import ( Day, @@ -129,7 +129,9 @@ def test_qcut_return_intervals(): exp_levels = np.array( [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] ) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + CategoricalDtype(ordered=True) + ) tm.assert_series_equal(res, exp) @@ -199,7 +201,7 @@ def test_single_quantile(data, start, end, length, labels): if labels is None: intervals = IntervalIndex([Interval(start, end)] * length, closed="right") - expected = 
Series(intervals).astype(CDT(ordered=True)) + expected = Series(intervals).astype(CategoricalDtype(ordered=True)) else: expected = Series([0] * length, dtype=np.intp) @@ -249,7 +251,7 @@ def test_datetime_tz_qcut(bins): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb703d3439d44..661290fb00d13 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -17,7 +17,7 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype as CDT +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -1182,7 +1182,7 @@ def test_value_counts(self): with tm.assert_produces_warning(FutureWarning, match=msg): result = algos.value_counts(factor) breaks = [-1.606, -1.018, -0.431, 0.155, 0.741] - index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True)) expected = Series([1, 0, 2, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) From 88683e9dbc4ba2fe7b1185d88b538e2bbb2d3601 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 7 Sep 2023 00:40:23 +0200 Subject: [PATCH 24/32] BUG: pct_change showing unnecessary FutureWarning (#54983) * BUG: pct_change showing unnecessary FutureWarning * Fix df case * Fix --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/generic.py | 24 ++++++++++++------- pandas/tests/frame/methods/test_pct_change.py | 18 ++++++++++++++ .../tests/series/methods/test_pct_change.py | 8 +++++++ 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index b9bdb36fe0ed3..fe511b5cdec67 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -34,6 
+34,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) .. --------------------------------------------------------------------------- .. _whatsnew_211.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 975fbaf59df5c..5c303e2a73bd7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11793,15 +11793,21 @@ def pct_change( stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if self.isna().values.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will be " + "removed in a future version. 
Call ffill before calling " + "pct_change to retain current behavior and silence this " + "warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index d0153da038a75..ede212ae18ae9 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method): index=["a", "b"] * 3, ) tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54981 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 4dabf7b87e2cd..6740b8756853e 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method): expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) From faeedade7966d6f2a5b601c26205a71362913c47 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 7 Sep 2023 00:42:37 +0200 Subject: [PATCH 25/32] ENH: Implement more string accessors through PyArrow (#54960) --- pandas/core/arrays/string_arrow.py | 31 ++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8
deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 338724d405ad8..a6838fbc73be9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -50,6 +50,8 @@ if TYPE_CHECKING: + from collections.abc import Sequence + from pandas._typing import ( Dtype, Scalar, @@ -337,19 +339,13 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_replace( self, @@ -368,6 +364,12 @@ def _str_replace( result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): @@ -382,6 +384,19 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) return self._result_converter(result) From 
cf6100b2aa8f2210fc60c34865587c4c24d42582 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:59:58 -0400 Subject: [PATCH 26/32] DEPR: DataFrameGroupBy.apply operating on the group keys (#54950) * DEPR: DataFrameGroupBy.apply operating on the group keys * fixups * Improvements * Add DataFrameGroupBy.resample to the whatsnew; mypy fixup * Ignore wrong parameter order * Ignore groupby.resample in docstring validation * Fixup docstring --- doc/source/user_guide/cookbook.rst | 4 +- doc/source/user_guide/groupby.rst | 14 +- doc/source/whatsnew/v0.14.0.rst | 21 +- doc/source/whatsnew/v0.18.1.rst | 93 ++++- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/frame.py | 26 +- pandas/core/groupby/groupby.py | 134 ++++++-- pandas/core/resample.py | 48 ++- pandas/core/reshape/pivot.py | 4 +- pandas/tests/extension/base/groupby.py | 8 +- pandas/tests/frame/test_stack_unstack.py | 4 +- pandas/tests/groupby/aggregate/test_other.py | 8 +- pandas/tests/groupby/test_apply.py | 319 ++++++++++++------ pandas/tests/groupby/test_apply_mutate.py | 32 +- pandas/tests/groupby/test_categorical.py | 13 +- pandas/tests/groupby/test_counting.py | 4 +- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_groupby.py | 67 ++-- pandas/tests/groupby/test_groupby_dropna.py | 4 +- pandas/tests/groupby/test_groupby_subclass.py | 8 +- pandas/tests/groupby/test_grouping.py | 4 +- pandas/tests/groupby/test_timegrouper.py | 19 +- pandas/tests/groupby/test_value_counts.py | 9 +- .../tests/groupby/transform/test_transform.py | 12 +- pandas/tests/resample/test_datetime_index.py | 20 +- pandas/tests/resample/test_resample_api.py | 4 +- .../tests/resample/test_resampler_grouper.py | 71 +++- pandas/tests/resample/test_time_grouper.py | 14 +- pandas/tests/window/test_groupby.py | 88 +++-- scripts/validate_unwanted_patterns.py | 1 + 30 files changed, 767 insertions(+), 294 deletions(-) diff --git 
a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index c0d2a14507383..002e88533ab93 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp) + expected_df = gb.apply(GrowUp, include_groups=False) expected_df `Expanding apply diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index c28123cec4491..5dd14e243fbb3 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -420,6 +420,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose: Additionally, this method avoids recomputing the internal grouping information derived from the passed key. +You can also include the grouping columns if you want to operate on them. + +.. ipython:: python + + grouped[["A", "B"]].sum() + .. _groupby.iterating-label: Iterating through groups @@ -1053,7 +1059,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D").ffill() + df_re.groupby("group").resample("1D", include_groups=False).ffill() .. _groupby.filter: @@ -1219,13 +1225,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) with .. 
ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x) + df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) Numba Accelerated Routines @@ -1709,7 +1715,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics, include_groups=False) result diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 92c37243b7e81..9c537b3a48c74 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -328,13 +328,24 @@ More consistent behavior for some groupby methods: - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - .. ipython:: python + .. code-block:: ipython - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame + In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g.apply(lambda x: x.head(1)) # used to simply fall-through + In [2]: g = df.groupby('A') + + In [3]: g.head(1) # filters DataFrame + Out[3]: + A B + 0 1 2 + 2 5 6 + + In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through + Out[4]: + A B + A + 1 0 1 2 + 5 2 5 6 - groupby head and tail respect column selection: diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7d9008fdbdecd..ee6a60144bc35 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group: df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df -.. ipython:: python +.. 
code-block:: ipython - df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + Out[1]: + A + 1 0 NaN + 1 NaN + 2 NaN + 3 1.5 + 4 2.5 + 5 3.5 + 6 4.5 + 7 5.5 + 8 6.5 + 9 7.5 + 10 8.5 + 11 9.5 + 12 10.5 + 13 11.5 + 14 12.5 + 15 13.5 + 16 14.5 + 17 15.5 + 18 16.5 + 19 17.5 + 2 20 NaN + 21 NaN + 22 NaN + 23 21.5 + 24 22.5 + 25 23.5 + 26 24.5 + 27 25.5 + 28 26.5 + 29 27.5 + 30 28.5 + 31 29.5 + 3 32 NaN + 33 NaN + 34 NaN + 35 33.5 + 36 34.5 + 37 35.5 + 38 36.5 + 39 37.5 + Name: B, dtype: float64 Now you can do: @@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to: df -.. ipython:: python +.. code-block:: ipython - df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + In[1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 Now you can do: -.. ipython:: python +.. code-block:: ipython - df.groupby("group").resample("1D").ffill() + In[1]: df.groupby("group").resample("1D").ffill() + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 .. 
_whatsnew_0181.enhancements.method_chain: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4f38d420a53b4..7bb4aaec0dd7c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -146,12 +146,12 @@ Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`) +- Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) -- .. 
--------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a731cdbf99b0e..f1fc63bc4b1ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8869,20 +8869,20 @@ def update( >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed Animal - Falcon 0 Falcon 380.0 - 1 Falcon 370.0 - Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - - >>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 """ ) ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43d200027220b..e6dd6a990d285 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -180,6 +180,19 @@ class providing the base-class of operations. A callable that takes a {input} as its first argument, and returns a dataframe, a series or a scalar. In addition the callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. 
+ args, kwargs : tuple and dict Optional positional and keyword arguments to pass to ``func``. @@ -272,7 +285,7 @@ class providing the base-class of operations. each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min()) + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) A a 5 b 2 @@ -1748,7 +1761,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): input="dataframe", examples=_apply_docs["dataframe_examples"] ) ) - def apply(self, func, *args, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: orig_func = func func = com.is_builtin_func(func) if orig_func != func: @@ -1781,10 +1794,25 @@ def f(g): else: f = func + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=FutureWarning, + stacklevel=find_stack_level(), + ) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -3520,7 +3548,7 @@ def describe( return result @final - def resample(self, rule, *args, **kwargs) -> Resampler: + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3534,7 +3562,23 @@ def resample(self, rule, *args, **kwargs) -> Resampler: ---------- rule : str or DateOffset The offset string or object representing target grouper conversion. 
- *args, **kwargs + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded from the operation. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and `on`, and other arguments of `TimeGrouper`. @@ -3570,59 +3614,71 @@ def resample(self, rule, *args, **kwargs) -> Resampler: Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby('a').resample('3min').sum() - a b + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30s').sum() - a b + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. Values are assigned to the month of the period. 
- >>> df.groupby('a').resample('M').sum() - a b + >>> df.groupby('a').resample('M', include_groups=False).sum() + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> df.groupby('a').resample('3min', closed='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. - >>> df.groupby('a').resample('3min', closed='right', label='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 """ from pandas.core.resample import get_resampler_for_grouping - return get_resampler_for_grouping(self, rule, *args, **kwargs) + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) @final def rolling(self, *args, **kwargs) -> RollingGroupby: @@ -5728,3 +5784,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. 
Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." +) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5ff18d8a25e36..9605bf154a8b7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -32,7 +32,10 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -57,6 +60,7 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, + _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -163,6 +167,7 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, + include_groups: bool = True, ) -> None: self._timegrouper = timegrouper self.keys = None @@ -171,6 +176,7 @@ def __init__( self.kind = kind self.group_keys = group_keys self.as_index = True + self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -444,7 +450,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -456,15 +464,21 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) return self._wrap_result(result) - def _get_resampler_for_grouping(self, groupby: GroupBy, key): + def 
_get_resampler_for_grouping( + self, groupby: GroupBy, key, include_groups: bool = True + ): """ Return the correct class for resampling with groupby. """ - return self._resampler_for_grouping(groupby=groupby, key=key, parent=self) + return self._resampler_for_grouping( + groupby=groupby, key=key, parent=self, include_groups=include_groups + ) def _wrap_result(self, result): """ @@ -1590,6 +1604,7 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, + include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1612,6 +1627,7 @@ def __init__( self.ax = parent.ax self.obj = parent.obj + self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1628,7 +1644,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = self._groupby.apply(func) + result = _apply(self._groupby, func, include_groups=self.include_groups) return self._wrap_result(result) _upsample = _apply @@ -2003,6 +2019,7 @@ def get_resampler_for_grouping( limit: int | None = None, kind=None, on=None, + include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -2011,7 +2028,9 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) + return resampler._get_resampler_for_grouping( + groupby=groupby, include_groups=include_groups, key=tg.key + ) class TimeGrouper(Grouper): @@ -2789,3 +2808,18 @@ def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: category=FutureWarning, stacklevel=find_stack_level(), ) + + +def _apply( + grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs +) -> DataFrame: + # GH#7155 - rewrite warning to appear as if it came from `.resample` + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = 
_apply_groupings_depr.format("DataFrameGroupBy", "resample") + with rewrite_warning( + target_message=target_message, + target_category=FutureWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) + return result diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 924b56f7a14d5..e8ca520e7b420 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -449,7 +449,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -467,7 +467,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 489f43729a004..5c21c4f7137a5 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -108,9 +108,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, 
match=msg): + df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index dbd1f96fc17c9..b54a795af4fdc 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1767,7 +1767,9 @@ def test_unstack_bug(self, future_stack): } ) - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 9d3ebbd3672ae..7ea107f254104 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -499,13 +499,17 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 
d04ee7cec0db1..abcb9f68e0f5c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -28,7 +28,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - df.groupby("index").apply(store) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) ) @@ -71,9 +73,11 @@ def test_apply_issues(): ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) @@ -179,7 +183,9 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -197,9 +203,11 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, 
this test will break @@ -219,8 +227,11 @@ def slow(group): def fast(group): return group.copy() - fast_df = df.groupby("A", group_keys=False).apply(fast) - slow_df = df.groupby("A", group_keys=False).apply(slow) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + fast_df = df.groupby("A", group_keys=False).apply(fast) + with tm.assert_produces_warning(FutureWarning, match=msg): + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -242,7 +253,9 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -285,8 +298,11 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res_as_apply = g_as.apply(lambda x: x.head(2)).index + with tm.assert_produces_warning(FutureWarning, match=msg): + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -299,7 +315,9 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, 
match=msg): + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -328,13 +346,19 @@ def desc3(group): # weirdo return result - result = grouped.apply(desc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - result2 = grouped.apply(desc2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - result3 = grouped.apply(desc3) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -364,7 +388,9 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(["A", "B"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -375,7 +401,9 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -384,7 +412,9 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - result = 
grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -407,7 +437,9 @@ def trans2(group): } ) - result = df.groupby("A").apply(trans) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -436,7 +468,9 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( @@ -457,7 +491,9 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -474,7 +510,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -498,7 +536,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -536,8 +576,11 @@ def filt2(x): else: return x[x.category == "c"] - expected = data.groupby("id_field").apply(filt1) - result = data.groupby("id_field").apply(filt2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = data.groupby("id_field").apply(filt1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -556,7 +599,9 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Y", 
group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -601,7 +646,9 @@ def f(g): g["value3"] = g["value1"] * 2 return g - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert "value3" in result @@ -615,9 +662,13 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 @@ -628,7 +679,9 @@ def test_apply_numeric_coercion_when_datetime(): def get_B(g): return g.iloc[0][["B"]] - result = df.groupby("A").apply(get_B)["B"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -653,8 +706,11 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby("Key").apply(predictions).p1 - result = df2.groupby("Key").apply(predictions).p1 + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df1.groupby("Key").apply(predictions).p1 + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -669,11 +725,13 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + ) ) - ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -716,11 +774,15 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] ) @@ -764,7 +826,9 @@ def test_groupby_apply_all_none(): def test_func(x): pass - result = test_df.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -779,8 +843,11 @@ def 
test_func(x): return None return x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = test_df1.groupby("groups").apply(test_func) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) @@ -793,7 +860,9 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - result = groups.apply(lambda group: group[group.value != 1]["value"]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -820,7 +889,9 @@ def test_apply_with_mixed_types(): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - result = df.groupby("a").apply(lambda g: g.index) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -837,7 +908,9 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + msg = "DataFrameGroupBy.apply operated on 
the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame( ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] @@ -876,7 +949,9 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - result = tdf.groupby("day").apply(most_common_values)["userId"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -917,7 +992,9 @@ def test_groupby_apply_datetime_result_dtypes(): ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], @@ -937,7 +1014,9 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -960,7 +1039,9 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", 
"A", "B", "B"], columns=["groups"]) - result = df.groupby("groups").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -972,7 +1053,9 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - result = df.groupby("A").apply(fct) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -983,7 +1066,9 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - result = df.groupby("id").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1019,7 +1104,9 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1034,8 +1121,11 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - result = df.groupby("A", group_keys=False).apply(lambda x: x) - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", group_keys=False).apply(lambda x: x) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1047,8 +1137,15 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) - result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df1.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) tm.assert_frame_equal(result1, result2) @@ -1103,7 +1200,9 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = 
df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() @@ -1151,7 +1250,9 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1176,7 +1277,9 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1186,7 +1289,9 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1206,9 +1311,11 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1251,24 +1358,29 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } + ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) - ) expected = DataFrame( [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], columns=["a", "b", "c"], @@ -1293,9 +1405,11 @@ def test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = 
result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1343,7 +1457,9 @@ def test_apply_index_key_error_bug(index_values): def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1390,33 +1506,16 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "group_col", - [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])], -) -def test_apply_inconsistent_output(group_col): - # GH 34478 - df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]}) - - result = df.groupby("group_col").value_col.apply( - lambda x: x.value_counts().reindex(index=[1, 2, 3]) - ) - expected = Series( - [np.nan, 3.0, np.nan], - name="value_col", - index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]), - ) - - tm.assert_series_equal(result, expected) - - -def test_apply_array_output_multi_getitem(): - # GH 18930 - df = DataFrame( - {"A": {"a": 1, "b": 2}, "B": {"a": 1, "b": 2}, "C": {"a": 1, "b": 2}} - ) - result = df.groupby("A")[["B", "C"]].apply(lambda x: np.array([0])) - expected = Series( - [np.array([0])] * 2, index=Index([1, 2], name="A"), name=("B", "C") - ) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("include_groups", [True, False]) +def test_include_groups(include_groups): + # GH#7155 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a") + warn = FutureWarning if include_groups else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): 
+ result = gb.apply(lambda x: x.sum(), include_groups=include_groups) + expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) + if not include_groups: + expected = expected[["b"]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 9bc07b584e9d1..09d5e06bf6ddd 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,10 +13,16 @@ def test_group_by_copy(): } ).set_index("name") - grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_same_value = df.groupby(["age"], group_keys=False).apply( + lambda group: group + ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -47,8 +53,11 @@ def f_no_copy(x): x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_copy = df.groupby("cat1").apply(f_copy) + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -58,8 +67,11 @@ def test_no_mutate_but_looks_like(): # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", 
group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) @@ -73,7 +85,9 @@ def fn(x): x.loc[x.index[-1], "col2"] = 0 return x.col2 - result = df.groupby(["col1"], as_index=False).apply(fn) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f2d21c10f7a15..b11240c841420 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -124,7 +124,9 @@ def test_basic(): # TODO: split this test def f(x): return x.drop_duplicates("person_name").iloc[0] - result = g.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") @@ -329,7 +331,9 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - result = grouped.apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, 
expected) @@ -2013,7 +2017,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + warn = FutureWarning if method == "apply" and index_kind == "range" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 25a4fd2550df6..16d7fe61b90ad 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,7 +289,9 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..287310a18c7df 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -95,10 +95,12 @@ def test_builtins_apply(keys, f): assert result.shape == (ngroups, 3), assert_msg npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with 
tm.assert_produces_warning(FutureWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 999a03d18644d..fdd959f0e8754 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -150,7 +150,9 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): def max_value(group): return group.loc[group["value"].idxmax()] - applied = df.groupby("A").apply(max_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -171,7 +173,9 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - result = df.groupby("A").apply(f_0)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -179,9 +183,10 @@ def f_1(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_1)[["B"]] - # Cast to avoid upcast when setting nan below - e = expected.copy().astype("float64") + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_1)[["B"]] + e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -190,9 +195,10 @@ def f_2(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_2)[["B"]] - # Explicit cast to float to avoid implicit cast when setting nan - e = expected.copy().astype({"B": "float"}) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = 
df.groupby("A").apply(f_2)[["B"]] + e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -202,7 +208,9 @@ def f_3(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_3)[["C"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -213,7 +221,9 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - result = df.groupby("A").apply(f_4) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None @@ -392,8 +402,11 @@ def f3(x): depr_msg = "The behavior of array concatenation with empty entries is deprecated" # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1322,11 +1335,15 @@ def summarize_random_name(df): # inconsistent. 
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1619,7 +1636,9 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1693,7 +1712,9 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1714,7 +1735,9 @@ def f(group): names.append(group.name) return group.copy() - df.groupby("a", sort=False, group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1920,7 +1943,9 @@ def 
test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - g.apply(test_sort) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -2102,7 +2127,9 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - res = gb.apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 099e7bc3890d0..d82278c277d48 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -324,7 +324,9 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..601e67bbca5e3 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -63,7 +63,9 @@ def func(group): assert hasattr(group, "testattr") return group.testattr - result = custom_df.groupby("c").apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping 
columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) @@ -101,5 +103,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e0793ada679c2..d05b60fd56b5f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -224,7 +224,9 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 55f96bd1443de..1a26559ef4447 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -470,8 +470,12 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = 
df.groupby(Grouper(key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -487,8 +491,11 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -895,7 +902,9 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 # function that returns a Series - res = gb.apply(lambda x: x["Quantity"] * 2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x["Quantity"] * 2) expected = DataFrame( [[36, 6, 6, 10, 2]], diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 7c50124e57e29..944dda8977882 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -327,9 +327,12 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + warn = FutureWarning if groupby == "column" else None + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 062dfe3931423..acb4b93ba1af3 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -636,7 +636,9 @@ def f(group): return group[:1] grouped = df.groupby("c") - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -790,7 +792,13 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - expected = gb.apply(targop) + if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): + warn = None + else: + warn = FutureWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 66ecb93385a87..a955fa0b096f0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1077,8 +1077,12 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - result = df.groupby("ID").resample("5min").sum() - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("ID").resample("5min").sum() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1097,7 +1101,9 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1823,8 +1829,12 @@ def f(data, add_arg): # Testing dataframe df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 1b20a7b99d1d7..f331851596317 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -77,7 +77,9 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - result = df.groupby("group").apply(lambda x: 
x.resample("1D").ffill())[["val"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 6f4f1154907dc..d47a8132f26bb 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -68,8 +68,12 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - expected = df.groupby("id").apply(f_0) - result = df.set_index("date").groupby("id").resample("D").asfreq() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("id").apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -83,8 +87,12 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - expected = df.groupby("group").apply(f_1) - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("group").apply(f_1) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -99,7 +107,9 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - result = g.resample("2s").mean().B + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -230,8 +240,12 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -248,8 +262,12 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)(ddof=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -258,18 +276,24 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - expected = g.resample("2s").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - result = r.apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - result = g.apply(f_1) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -337,7 +361,9 @@ def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - result = df.groupby("col0").resample("1W", label="left").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -357,7 +383,9 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - result = df.groupby("A").resample("2s").mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -455,7 +483,9 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) @@ -478,7 +508,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - result = df.groupby(["key"]).resample("W", on="date").min() + with 
tm.assert_produces_warning(FutureWarning): + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -530,7 +561,9 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) expected = expected.set_index("date", append=True, drop=True) @@ -577,7 +610,9 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="H", periods=12), ) - result = df.groupby("A").resample("D").size() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").size() expected = Series( 3, index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index d7fdbc4fe5f08..8b1eab552c97d 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -323,12 +323,14 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") - ) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + 
.interpolate(method="linear") + ) expected_ind = pd.MultiIndex.from_tuples( [ diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 46ab00c3e2284..b8e0173ee131f 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -99,7 +99,9 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -113,7 +115,9 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -129,9 +133,11 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -174,7 +180,9 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - expected = g.apply(func) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -190,7 +198,9 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -235,7 +245,9 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -784,9 +796,13 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - expected = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - result = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -960,11 +976,13 @@ def test_groupby_monotonic(self): df["date"] 
= to_datetime(df["date"]) df = df.sort_values("date") - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -983,9 +1001,13 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("B") + .groupby("A") + .apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1015,7 +1037,9 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1029,7 +1053,9 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1045,9 +1071,11 @@ def 
test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1065,7 +1093,9 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - expected = g.apply(func_0) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1080,7 +1110,9 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1089,7 +1121,11 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 
47534226f972f..0931dd209ee05 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -33,6 +33,7 @@ "_agg_template_series", "_agg_template_frame", "_pipe_template", + "_apply_groupings_depr", "__main__", "_transform_template", "_use_inf_as_na", From c118953c380ed82b01e57caee4b8b58f993760f4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 7 Sep 2023 18:00:41 +0200 Subject: [PATCH 27/32] Improve error message for StringDtype with invalid storage (#55052) --- pandas/core/arrays/string_.py | 3 ++- pandas/tests/arrays/string_/test_string_arrow.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c90127c0e9812..693ebad0ca16f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -123,7 +123,8 @@ def __init__(self, storage=None) -> None: storage = get_option("mode.string_storage") if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 09f9f788dc3e4..fb6c338b8f8ea 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -255,3 +255,11 @@ def test_pickle_roundtrip(): result_sliced = pickle.loads(sliced_pickled) tm.assert_series_equal(result_sliced, expected_sliced) + + +@skip_if_no_pyarrow +def test_string_dtype_error_message(): + # GH#55051 + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." 
+ with pytest.raises(ValueError, match=msg): + StringDtype("bla") From ba1e73500756fab064b83e6421ae7f42dffe55f8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 7 Sep 2023 18:05:08 +0200 Subject: [PATCH 28/32] Fix pickle roundtrip for new arrow string dtype (#55051) --- pandas/core/arrays/string_arrow.py | 5 ++++- pandas/tests/arrays/string_/test_string_arrow.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a6838fbc73be9..6262055827428 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -523,7 +523,10 @@ def _result_converter(cls, values, na=None): def __getattribute__(self, item): # ArrowStringArray and we both inherit from ArrowExtensionArray, which # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index fb6c338b8f8ea..c1d424f12bfc4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,9 +241,10 @@ def test_setitem_invalid_indexer_raises(): @skip_if_no_pyarrow -def test_pickle_roundtrip(): +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): # GH 42600 - expected = pd.Series(range(10), dtype="string[pyarrow]") + expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) sliced_pickled = pickle.dumps(expected_sliced) From b1e9b58340110b63dc74f5c5b5de4aaa8db184fa Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 7 Sep 2023 18:07:11 +0200 Subject: [PATCH 29/32] Fix docstring of Index.join in base.py (#55050) --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a397862712de..cd55997ad5f69 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4557,7 +4557,7 @@ def join( ------- join_index, (left_indexer, right_indexer) - Examples + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) From a7cb22631dd607f8c36292118c69cbd7bed1485c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:28:45 -0400 Subject: [PATCH 30/32] BLD: Build wheels for Python 3.12 (#55010) * BLD: Build wheels for Python 3.12 * Update pyproject.toml * Update pyproject.toml * also circle * fix windows? * typo? * try single quotes * tyr to fix again * just use the base shared tag, no need to append windowsservercore * typo * update the other too * Update wheels.yml * try something * try something * debug * escape string? 
* go for green --- .circleci/config.yml | 5 ++--- .github/workflows/wheels.yml | 16 +++++++++------- pyproject.toml | 7 +++++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 50f6a116a6630..ba124533e953a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.14.1 + pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: CIBW_BUILD: << parameters.cibw-build >> @@ -92,5 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 5f541f1bae1fd..97d78a1a9afe3 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -97,8 +97,7 @@ jobs: - [macos-12, macosx_*] - [windows-2022, win_amd64] # TODO: support PyPy? 
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -150,8 +149,10 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge create-args: >- - python=${{ matrix.python[1] }} + python=3.11 anaconda-client wheel cache-downloads: true @@ -167,12 +168,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; - python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ - docker pull python:${{ matrix.python[1] }}-windowsservercore - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + # add rc to the end of the image name if the Python version is unreleased + docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v3 
with: diff --git a/pyproject.toml b/pyproject.toml index 845c2a63e84f0..74d6aaee286a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,8 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.22.4; python_version>='3.12'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "versioneer[toml]" ] @@ -30,7 +31,9 @@ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version>='3.11'", + "numpy>=1.23.2; python_version=='3.11'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" From cea0cc0a54725ed234e2f51cc21a1182674a6032 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 7 Sep 2023 12:30:50 -0400 Subject: [PATCH 31/32] Convert test_sql to pytest idiom (#54936) * Convert test_sql to pytest idiom * Try KeyError catch * Added drop_view to existing test method * xfail MySQL issue --- pandas/io/sql.py | 2 +- pandas/tests/io/test_sql.py | 1008 ++++++++++++++++++++++------------- 2 files changed, 630 insertions(+), 380 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2b139f8ca527c..0788d9da06eb9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -138,7 +138,7 @@ def _parse_date_columns(data_frame, parse_dates): if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates: try: fmt = parse_dates[col_name] - except TypeError: + except (KeyError, TypeError): fmt = None data_frame.isetitem(i, _handle_date_column(df_col, format=fmt)) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bfa93a4ff910e..bbdb22955297e 100644 --- 
a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -413,6 +413,8 @@ def mysql_pymysql_engine(iris_path, types_data): for entry in types_data: entry.pop("DateColWithTz") create_and_load_types(engine, types_data, "mysql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine with engine.connect() as conn: with conn.begin(): @@ -422,7 +424,7 @@ def mysql_pymysql_engine(iris_path, types_data): @pytest.fixture -def mysql_pymysql_conn(mysql_pymysql_engine): +def mysql_pymysql_conn(iris_path, mysql_pymysql_engine): with mysql_pymysql_engine.connect() as conn: yield conn @@ -440,6 +442,8 @@ def postgresql_psycopg2_engine(iris_path, types_data): create_and_load_iris(engine, iris_path, "postgresql") if not insp.has_table("types"): create_and_load_types(engine, types_data, "postgresql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine with engine.connect() as conn: with conn.begin(): @@ -462,9 +466,20 @@ def sqlite_str(): @pytest.fixture -def sqlite_engine(sqlite_str): +def sqlite_engine(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool) + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") + yield engine engine.dispose() @@ -476,17 +491,25 @@ def sqlite_conn(sqlite_engine): @pytest.fixture -def sqlite_iris_str(sqlite_str, iris_path): +def sqlite_iris_str(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str) - create_and_load_iris(engine, iris_path, "sqlite") + + insp = sqlalchemy.inspect(engine) + if not 
insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") engine.dispose() return sqlite_str @pytest.fixture def sqlite_iris_engine(sqlite_engine, iris_path): - create_and_load_iris(sqlite_engine, iris_path, "sqlite") return sqlite_engine @@ -499,6 +522,7 @@ def sqlite_iris_conn(sqlite_iris_engine): @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: + create_and_load_iris_view(closing_conn) with closing_conn as conn: yield conn @@ -1097,6 +1121,7 @@ class PandasSQLTest: """ def load_iris_data(self, iris_path): + self.drop_view("iris_view", self.conn) self.drop_table("iris", self.conn) if isinstance(self.conn, sqlite3.Connection): create_and_load_iris_sqlite3(self.conn, iris_path) @@ -1221,470 +1246,695 @@ class DummyException(Exception): # -- Testing the public API -class _TestSQLApi(PandasSQLTest): - """ - Base class to test the public API. +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_view(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn) + check_iris_frame(iris_frame) - From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode - (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific - tests for the different sql flavours are included in `_TestSQLAlchemy`. - Notes: - flavor can always be passed even in SQLAlchemy mode, - should be correctly ignored. 
+@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_with_chunksize_no_result(conn, request): + conn = request.getfixturevalue(conn) + query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' + with_batch = sql.read_sql_query(query, conn, chunksize=5) + without_batch = sql.read_sql_query(query, conn) + tm.assert_frame_equal(concat(with_batch), without_batch) - we don't use drop_table because that isn't part of the public api - """ +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame1", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame1") - flavor = "sqlite" - mode: str + sql.to_sql(test_frame1, "test_frame1", conn) + assert sql.has_table("test_frame1", conn) - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.load_test_data_and_sql() - def load_test_data_and_sql(self): - create_and_load_iris_view(self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_fail(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame2", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame2") - def test_read_sql_view(self): - iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) - check_iris_frame(iris_frame) + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") + assert sql.has_table("test_frame2", conn) - def test_read_sql_with_chunksize_no_result(self): - query = "SELECT * FROM iris_view WHERE SepalLength < 0.0" - with_batch = sql.read_sql_query(query, self.conn, chunksize=5) - without_batch = sql.read_sql_query(query, self.conn) - 
tm.assert_frame_equal(concat(with_batch), without_batch) + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") - def test_to_sql(self, test_frame1): - sql.to_sql(test_frame1, "test_frame1", self.conn) - assert sql.has_table("test_frame1", self.conn) - def test_to_sql_fail(self, test_frame1): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - assert sql.has_table("test_frame2", self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_replace(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame3", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame3") - msg = "Table 'test_frame2' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail") + # Add to table again + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace") + assert sql.has_table("test_frame3", conn) - def test_to_sql_replace(self, test_frame1): - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail") - # Add to table again - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace") - assert sql.has_table("test_frame3", self.conn) + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame3") - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame3") + assert num_rows == num_entries - assert num_rows == num_entries - def test_to_sql_append(self, test_frame1): - assert sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") == 4 +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_append(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame4", 
conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame4") - # Add to table again - assert ( - sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") == 4 - ) - assert sql.has_table("test_frame4", self.conn) + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4 - num_entries = 2 * len(test_frame1) - num_rows = count_rows(self.conn, "test_frame4") + # Add to table again + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4 + assert sql.has_table("test_frame4", conn) - assert num_rows == num_entries + num_entries = 2 * len(test_frame1) + num_rows = count_rows(conn, "test_frame4") - def test_to_sql_type_mapping(self, test_frame3): - sql.to_sql(test_frame3, "test_frame5", self.conn, index=False) - result = sql.read_sql("SELECT * FROM test_frame5", self.conn) + assert num_rows == num_entries - tm.assert_frame_equal(test_frame3, result) - def test_to_sql_series(self): - s = Series(np.arange(5, dtype="int64"), name="series") - sql.to_sql(s, "test_series", self.conn, index=False) - s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) - tm.assert_frame_equal(s.to_frame(), s2) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_type_mapping(conn, request, test_frame3): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame5", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame5") + + sql.to_sql(test_frame3, "test_frame5", conn, index=False) + result = sql.read_sql("SELECT * FROM test_frame5", conn) - def test_roundtrip(self, test_frame1): - sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) + tm.assert_frame_equal(test_frame3, result) - # HACK! 
- result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None - tm.assert_frame_equal(result, test_frame1) - def test_roundtrip_chunksize(self, test_frame1): - sql.to_sql( - test_frame1, - "test_frame_roundtrip", - con=self.conn, - index=False, - chunksize=2, - ) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - tm.assert_frame_equal(result, test_frame1) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_series(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_series", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_series") - def test_execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - with sql.pandasSQL_builder(self.conn) as pandas_sql: - iris_results = pandas_sql.execute("SELECT * FROM iris") + s = Series(np.arange(5, dtype="int64"), name="series") + sql.to_sql(s, "test_series", conn, index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", conn) + tm.assert_frame_equal(s.to_frame(), s2) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + + # HACK! 
+ result.index = test_frame1.index + result.set_index("level_0", inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, test_frame1) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip_chunksize(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql( + test_frame1, + "test_frame_roundtrip", + con=conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + tm.assert_frame_equal(result, test_frame1) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_execute_sql(conn, request): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + conn = request.getfixturevalue(conn) + with sql.pandasSQL_builder(conn) as pandas_sql: + iris_results = pandas_sql.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def test_date_parsing(self): - # Test date parsing in read_sql - # No Parsing - df = sql.read_sql_query("SELECT * FROM types", self.conn) + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_date_parsing(conn, request): + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") + + conn = request.getfixturevalue(conn) + # Test date parsing in read_sql + # No Parsing + df = sql.read_sql_query("SELECT * FROM types", conn) + if not ("mysql" in conn_name or "postgres" in conn_name): assert not issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["DateCol"] - ) - assert 
issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["IntDateCol"] - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + Timestamp("2010-10-10"), + 
Timestamp("2010-12-12"), + ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates={"IntDateCol": "s"} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] - df = sql.read_sql_query( +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) +@pytest.mark.parametrize( + "read_sql, text, mode", + [ + (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), + (sql.read_sql, "types", ("sqlalchemy")), + ( + sql.read_sql_query, "SELECT * FROM types", - self.conn, - parse_dates={"IntDateOnlyCol": "%Y%m%d"}, - ) - assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) - assert df.IntDateOnlyCol.tolist() == [ - Timestamp("2010-10-10"), - Timestamp("2010-12-12"), - ] + ("sqlalchemy", "fallback"), + ), + (sql.read_sql_table, "types", ("sqlalchemy")), + ], +) +def test_api_custom_dateparsing_error( + conn, request, read_sql, text, mode, error, types_data_frame +): + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) - @pytest.mark.parametrize( - "read_sql, text, mode", - [ - (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), - (sql.read_sql, "types", ("sqlalchemy")), - ( - sql.read_sql_query, - "SELECT * FROM types", - ("sqlalchemy", "fallback"), - ), - (sql.read_sql_table, "types", ("sqlalchemy")), - ], + conn = request.getfixturevalue(conn) + + expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + + result = read_sql( + text, + con=conn, + parse_dates={ + "DateCol": {"errors": error}, + }, ) - def test_custom_dateparsing_error( - self, read_sql, text, mode, error, types_data_frame - ): - if self.mode in mode: - expected = 
types_data_frame.astype({"DateCol": "datetime64[ns]"}) + if "postgres" in conn_name: + # TODO: clean up types_data_frame fixture + result = result.drop(columns=["DateColWithTz"]) + result["BoolCol"] = result["BoolCol"].astype(int) + result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) - result = read_sql( - text, - con=self.conn, - parse_dates={ - "DateCol": {"errors": error}, - }, - ) + tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, expected) - def test_date_and_index(self): - # Test case where same column appears in parse_date and index_col +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_date_and_index(conn, request): + # Test case where same column appears in parse_date and index_col + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - index_col="DateCol", - parse_dates=["DateCol", "IntDateCol"], - ) + conn = request.getfixturevalue(conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) - assert issubclass(df.index.dtype.type, np.datetime64) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert issubclass(df.index.dtype.type, np.datetime64) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - def test_timedelta(self): - # see #6921 - df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): - result_count = df.to_sql(name="test_timedelta", con=self.conn) - assert result_count == 2 - result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) - - def test_complex_raises(self): - df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" - with 
pytest.raises(ValueError, match=msg): - assert df.to_sql("test_complex", con=self.conn) is None - @pytest.mark.parametrize( - "index_name,index_label,expected", - [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ], - ) - def test_to_sql_index_label(self, index_name, index_label, expected): - temp_frame = DataFrame({"col1": range(4)}) - temp_frame.index.name = index_name - query = "SELECT * FROM test_index_label" - sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) - frame = sql.read_sql_query(query, self.conn) - assert frame.columns[0] == expected - - def test_to_sql_index_label_multiindex(self): - expected_row_count = 4 - temp_frame = DataFrame( - {"col1": range(4)}, - index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), - ) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_timedelta(conn, request): + # see #6921 + conn = request.getfixturevalue(conn) + if sql.has_table("test_timedelta", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_timedelta") - # no index name, defaults to 'level_0' and 'level_1' - result = sql.to_sql(temp_frame, "test_index_label", self.conn) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[0] == "level_0" - assert frame.columns[1] == "level_1" + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + with tm.assert_produces_warning(UserWarning): + result_count = df.to_sql(name="test_timedelta", con=conn) + assert result_count == 2 + result = sql.read_sql_query("SELECT 
* FROM test_timedelta", conn) + tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_complex_raises(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame({"a": [1 + 1j, 2j]}) + msg = "Complex datatypes not supported" + with pytest.raises(ValueError, match=msg): + assert df.to_sql("test_complex", con=conn) is None - # specifying index_label - result = sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label=["A", "B"], - ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), # using the index name - temp_frame.index.names = ["A", "B"] - result = sql.to_sql( - temp_frame, "test_index_label", self.conn, if_exists="replace" + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], +) +def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + temp_frame = DataFrame({"col1": range(4)}) + temp_frame.index.name = index_name + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label) + frame = sql.read_sql_query(query, conn) + assert frame.columns[0] == expected + + +@pytest.mark.db 
+@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_index_label_multiindex(conn, request): + conn_name = conn + if "mysql" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="MySQL can fail using TEXT without length as key") ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] - # has index name, but specifying index_label - result = sql.to_sql( + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + expected_row_count = 4 + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) + + # no index name, defaults to 'level_0' and 'level_1' + result = sql.to_sql(temp_frame, "test_index_label", conn) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" + + # specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["A", "B"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # using the index name + temp_frame.index.names = ["A", "B"] + result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace") + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # has index name, but specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["C", "D"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM 
test_index_label", conn) + assert frame.columns[:2].tolist() == ["C", "D"] + + msg = "Length of 'index_label' should match number of levels, which is 2" + with pytest.raises(ValueError, match=msg): + sql.to_sql( temp_frame, "test_index_label", - self.conn, + conn, if_exists="replace", - index_label=["C", "D"], + index_label="C", ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["C", "D"] - - msg = "Length of 'index_label' should match number of levels, which is 2" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label="C", - ) - def test_multiindex_roundtrip(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], - columns=["A", "B", "C"], - index=["A", "B"], - ) - df.to_sql(name="test_multiindex_roundtrip", con=self.conn) - result = sql.read_sql_query( - "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] - ) - tm.assert_frame_equal(df, result, check_index_type=True) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_multiindex_roundtrip(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_multiindex_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_multiindex_roundtrip") + + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) - @pytest.mark.parametrize( - "dtype", - [ - None, - int, - float, - {"A": int, "B": float}, - ], + df.to_sql(name="test_multiindex_roundtrip", con=conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"] ) - def test_dtype_argument(self, dtype): - # GH10285 Add dtype argument to read_sql_query - df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) - assert 
df.to_sql(name="test_dtype_argument", con=self.conn) == 2 - - expected = df.astype(dtype) - result = sql.read_sql_query( - "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype - ) + tm.assert_frame_equal(df, result, check_index_type=True) - tm.assert_frame_equal(result, expected) - def test_integer_col_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "dtype", + [ + None, + int, + float, + {"A": int, "B": float}, + ], +) +def test_api_dtype_argument(conn, request, dtype): + # GH10285 Add dtype argument to read_sql_query + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_dtype_argument", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_dtype_argument") - def test_get_schema(self, test_frame1): - create_sql = sql.get_schema(test_frame1, "test", con=self.conn) - assert "CREATE" in create_sql + df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) + assert df.to_sql(name="test_dtype_argument", con=conn) == 2 - def test_get_schema_with_schema(self, test_frame1): - # GH28486 - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi") - assert "CREATE TABLE pypi." 
in create_sql + expected = df.astype(dtype) - def test_get_schema_dtypes(self): - if self.mode == "sqlalchemy": - from sqlalchemy import Integer + if "postgres" in conn_name: + query = 'SELECT "A", "B" FROM test_dtype_argument' + else: + query = "SELECT A, B FROM test_dtype_argument" + result = sql.read_sql_query(query, con=conn, dtype=dtype) - dtype = Integer - else: - dtype = "INTEGER" + tm.assert_frame_equal(result, expected) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_integer_col_names(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn) + assert "CREATE" in create_sql + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_with_schema(conn, request, test_frame1): + # GH28486 + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") + assert "CREATE TABLE pypi." 
in create_sql + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_dtypes(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) + + if conn_name == "sqlite_buildin": + dtype = "INTEGER" + else: + from sqlalchemy import Integer + + dtype = Integer + create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype}) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql - float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - create_sql = sql.get_schema( - float_frame, "test", con=self.conn, dtype={"b": dtype} - ) - assert "CREATE" in create_sql - assert "INTEGER" in create_sql - def test_get_schema_keys(self, test_frame1): - frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) - create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_keys(conn, request, test_frame1): + conn_name = conn + conn = request.getfixturevalue(conn) + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1") + + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - # multiple columns as key (GH10385) - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"]) + # multiple columns as key (GH10385) + create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"]) + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - def 
test_chunksize_read(self): - df = DataFrame( - np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") - ) - df.to_sql(name="test_chunksize", con=self.conn, index=False) - # reading the query in one time - res1 = sql.read_sql_query("select * from test_chunksize", self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_chunksize_read(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_chunksize", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_chunksize") + + df = DataFrame( + np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") + ) + df.to_sql(name="test_chunksize", con=conn, index=False) + + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", conn) + + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5): + res2 = concat([res2, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 - # reading the query in chunks with read_sql_query - res2 = DataFrame() + tm.assert_frame_equal(res1, res2) + + # reading the query in chunks with read_sql_query + if conn_name == "sqlite_buildin": + with pytest.raises(NotImplementedError, match=""): + sql.read_sql_table("test_chunksize", conn, chunksize=5) + else: + res3 = DataFrame() i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_query( - "select * from test_chunksize", self.conn, chunksize=5 - ): - res2 = concat([res2, chunk], ignore_index=True) + for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 - tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res1, res3) - # reading the query in chunks with read_sql_query - if self.mode == "sqlalchemy": - res3 
= DataFrame() - i = 0 - sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): - res3 = concat([res3, chunk], ignore_index=True) - assert len(chunk) == sizes[i] - i += 1 +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_categorical(conn, request): + # GH8624 + # test that categorical gets written correctly as dense column + conn = request.getfixturevalue(conn) + if sql.has_table("test_categorical", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_categorical") - tm.assert_frame_equal(res1, res3) + df = DataFrame( + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"], + } + ) + df2 = df.copy() + df2["person_name"] = df2["person_name"].astype("category") - def test_categorical(self): - # GH8624 - # test that categorical gets written correctly as dense column - df = DataFrame( - { - "person_id": [1, 2, 3], - "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], - } - ) - df2 = df.copy() - df2["person_name"] = df2["person_name"].astype("category") + df2.to_sql(name="test_categorical", con=conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", conn) - df2.to_sql(name="test_categorical", con=self.conn, index=False) - res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) + tm.assert_frame_equal(res, df) - tm.assert_frame_equal(res, df) - def test_unicode_column_name(self): - # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) - df.to_sql(name="test_unicode", con=self.conn, index=False) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_unicode_column_name(conn, request): + # GH 11431 + conn = request.getfixturevalue(conn) + if sql.has_table("test_unicode", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_unicode") - def test_escaped_table_name(self): - # GH 13206 - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=self.conn, index=False) + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql(name="test_unicode", con=conn, index=False) - res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) - tm.assert_frame_equal(res, df) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_escaped_table_name(conn, request): + # GH 13206 + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("d1187b08-4943-4c8d-a7f6", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6") - def test_read_sql_duplicate_columns(self): - # GH#53117 - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) - df.to_sql(name="test_table", con=self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False) - result = 
pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn) - expected = DataFrame( - [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], - columns=["a", "b", "a", "c"], - ) - tm.assert_frame_equal(result, expected) + if "postgres" in conn_name: + query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"' + else: + query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`" + res = sql.read_sql_query(query, conn) + + tm.assert_frame_equal(res, df) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_read_sql_duplicate_columns(conn, request): + # GH#53117 + conn = request.getfixturevalue(conn) + if sql.has_table("test_table", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_table") + + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) + df.to_sql(name="test_table", con=conn, index=False) + + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn) + expected = DataFrame( + [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], + columns=["a", "b", "a", "c"], + ) + tm.assert_frame_equal(result, expected) + + +class _TestSQLApi(PandasSQLTest): + """ + Base class to test the public API. + + From this two classes are derived to run these tests for both the + sqlalchemy mode (`TestSQLApi`) and the fallback mode + (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific + tests for the different sql flavours are included in `_TestSQLAlchemy`. + + Notes: + flavor can always be passed even in SQLAlchemy mode, + should be correctly ignored. 
+ + we don't use drop_table because that isn't part of the public api + + """ + + flavor = "sqlite" + mode: str + + @pytest.fixture(autouse=True) + def setup_method(self, iris_path, types_data): + self.conn = self.connect() + self.load_iris_data(iris_path) + self.load_types_data(types_data) + self.load_test_data_and_sql() + + def load_test_data_and_sql(self): + create_and_load_iris_view(self.conn) @pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") From d04747c367d00ee03c5d008ce5670892d450e801 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:32:52 -0400 Subject: [PATCH 32/32] REGR: DataFrameGroupBy.agg with duplicate column names and a dict (#55042) --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/apply.py | 8 +++++++- pandas/tests/groupby/aggregate/test_aggregate.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index fe511b5cdec67..42af61be26355 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) - Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`) - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.interpolate` raising 
when ``fill_value`` was given (:issue:`54920`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4d6dd8f4fd577..26467a4a982fa 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -436,7 +436,13 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) func = cast(AggFuncTypeDict, self.func) func = self.normalize_dictlike_arg(op_name, selected_obj, func) @@ -450,7 +456,7 @@ def compute_dict_like( colg = obj._gotitem(selection, ndim=1) results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] keys = list(func.keys()) - elif is_non_unique_col: + elif not is_groupby and is_non_unique_col: # key used for column selection and output # GH#51099 results = [] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c01ca4922a84b..882f42ff18bdd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -515,6 +515,18 @@ def test_groupby_agg_dict_with_getitem(): tm.assert_frame_equal(result, expected) +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "op", [