diff --git a/.circleci/config.yml b/.circleci/config.yml
index 50f6a116a6630..ba124533e953a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -48,7 +48,7 @@ jobs:
name: Build aarch64 wheels
no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that
command: |
- pip3 install cibuildwheel==2.14.1
+ pip3 install cibuildwheel==2.15.0
cibuildwheel --prerelease-pythons --output-dir wheelhouse
environment:
CIBW_BUILD: << parameters.cibw-build >>
@@ -92,5 +92,4 @@ workflows:
only: /^v.*/
matrix:
parameters:
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12
- cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"]
+ cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"]
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 5f541f1bae1fd..97d78a1a9afe3 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -97,8 +97,7 @@ jobs:
- [macos-12, macosx_*]
- [windows-2022, win_amd64]
# TODO: support PyPy?
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12
- python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]]
+ python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -150,8 +149,10 @@ jobs:
uses: mamba-org/setup-micromamba@v1
with:
environment-name: wheel-env
+ # Use a fixed Python, since we might have an unreleased Python not
+ # yet present on conda-forge
create-args: >-
- python=${{ matrix.python[1] }}
+ python=3.11
anaconda-client
wheel
cache-downloads: true
@@ -167,12 +168,13 @@ jobs:
shell: pwsh
run: |
$TST_CMD = @"
- python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17;
- python -m pip install --find-links=pandas\wheelhouse --no-index pandas;
+ python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17;
+ python -m pip install `$(Get-Item pandas\wheelhouse\*.whl);
python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`';
"@
- docker pull python:${{ matrix.python[1] }}-windowsservercore
- docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD
+ # add rc to the end of the image name if the Python version is unreleased
+ docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }}
+ docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD
- uses: actions/upload-artifact@v3
with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9f9bcd78c07b0..c01bf65818167 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.0.285
+ rev: v0.0.287
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -34,7 +34,7 @@ repos:
alias: ruff-selected-autofixes
args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix]
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.7'
+ rev: 'v2.9.1'
hooks:
- id: vulture
entry: python scripts/run_vulture.py
@@ -84,7 +84,7 @@ repos:
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
]
- repo: https://github.com/pylint-dev/pylint
- rev: v3.0.0a6
+ rev: v3.0.0a7
hooks:
- id: pylint
stages: [manual]
@@ -124,7 +124,7 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
- rev: v0.6.7
+ rev: v0.6.8
hooks:
- id: sphinx-lint
- repo: local
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 09c4acc0ab309..0229cf15fbfb8 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks):
self.array[i] = "foo"
def time_setitem_list(self, multiple_chunks):
- indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
+ indexer = list(range(50)) + list(range(-1000, 0, 50))
self.array[indexer] = ["foo"] * len(indexer)
def time_setitem_slice(self, multiple_chunks):
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 54bcdb0fa2843..04ac47a892a22 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -360,14 +360,14 @@ class MergeCategoricals:
def setup(self):
self.left_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Y": np.random.choice(["one", "two", "three"], size=(10000,)),
}
)
self.right_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
}
)
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
new file mode 100644
index 0000000000000..6c33de104ed90
--- /dev/null
+++ b/doc/cheatsheet/README.md
@@ -0,0 +1,22 @@
+# Pandas Cheat Sheet
+
+The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013.
+To create the PDF version, within Powerpoint, simply do a "Save As"
+and pick "PDF" as the format.
+
+This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).
+
+| Topic                 | PDF | PPT |
+|-----------------------|-----|-----|
+| Pandas_Cheat_Sheet    |     |     |
+| Pandas_Cheat_Sheet_JA |     |     |
+
+
+**Alternative**
+
+Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets
+developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats.
+
+| Topic  | PDF | Streamlit | Google Colab |
+|--------|-----|-----------|--------------|
+| Pandas |     |           |              |
diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt
deleted file mode 100644
index c57da38b31777..0000000000000
--- a/doc/cheatsheet/README.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013.
-To create the PDF version, within Powerpoint, simply do a "Save As"
-and pick "PDF" as the format.
-
-This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2].
-
-[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
-[2]: https://www.princetonoptimization.com/
diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
index 2dcc8b0abe3b8..caaff3557ae40 100644
--- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
+++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -106,9 +106,9 @@ between square brackets ``[]``.
.. note::
- If you are familiar to Python
+ If you are familiar with Python
:ref:`dictionaries `, the selection of a
- single column is very similar to selection of dictionary values based on
+ single column is very similar to the selection of dictionary values based on
the key.
You can create a ``Series`` from scratch as well:
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index c0d2a14507383..002e88533ab93 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
df
# List the size of the animals with the highest weight.
- df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])
+ df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False)
`Using get_group
`__
@@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"])
- expected_df = gb.apply(GrowUp)
+ expected_df = gb.apply(GrowUp, include_groups=False)
expected_df
`Expanding apply
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index c28123cec4491..5dd14e243fbb3 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -420,6 +420,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose:
Additionally, this method avoids recomputing the internal grouping information
derived from the passed key.
+You can also include the grouping columns if you want to operate on them.
+
+.. ipython:: python
+
+ grouped[["A", "B"]].sum()
+
.. _groupby.iterating-label:
Iterating through groups
@@ -1053,7 +1059,7 @@ missing values with the ``ffill()`` method.
).set_index("date")
df_re
- df_re.groupby("group").resample("1D").ffill()
+ df_re.groupby("group").resample("1D", include_groups=False).ffill()
.. _groupby.filter:
@@ -1219,13 +1225,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare
.. ipython:: python
- df.groupby("A", group_keys=True).apply(lambda x: x)
+ df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False)
with
.. ipython:: python
- df.groupby("A", group_keys=False).apply(lambda x: x)
+ df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)
Numba Accelerated Routines
@@ -1709,7 +1715,7 @@ column index name will be used as the name of the inserted column:
result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
return pd.Series(result, name="metrics")
- result = df.groupby("a").apply(compute_metrics)
+ result = df.groupby("a").apply(compute_metrics, include_groups=False)
result
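
Reviewer aside, not part of the patch: a minimal sketch of the deprecation these doc updates track, assuming a pandas build that includes this change. Applying a function that touches the grouping column now warns; passing ``include_groups=False`` excludes it.

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2], "B": [10, 20, 30]})

# FutureWarning: apply operated on the grouping columns (GH 7155)
df.groupby("A").apply(lambda g: g.sum())

# No warning: the grouping column "A" is excluded before func is called
df.groupby("A").apply(lambda g: g.sum(), include_groups=False)
```
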
diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index 92c37243b7e81..9c537b3a48c74 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -328,13 +328,24 @@ More consistent behavior for some groupby methods:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
- .. ipython:: python
+ .. code-block:: ipython
- df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- g = df.groupby('A')
- g.head(1) # filters DataFrame
+ In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
- g.apply(lambda x: x.head(1)) # used to simply fall-through
+ In [2]: g = df.groupby('A')
+
+ In [3]: g.head(1) # filters DataFrame
+ Out[3]:
+ A B
+ 0 1 2
+ 2 5 6
+
+ In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through
+ Out[4]:
+ A B
+ A
+ 1 0 1 2
+ 5 2 5 6
- groupby head and tail respect column selection:
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index bb7beef449d93..acc5409b86d09 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -24,25 +24,61 @@ API changes
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
a lexically sorted index will have a better performance. (:issue:`2646`)
- .. ipython:: python
- :okexcept:
- :okwarning:
+ .. code-block:: ipython
+
+ In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1],
+ ...: 'joe':['x', 'x', 'z', 'y'],
+ ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+ ...:
- df = pd.DataFrame({'jim':[0, 0, 1, 1],
- 'joe':['x', 'x', 'z', 'y'],
- 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
- df
- df.index.lexsort_depth
+ In [2]: df
+ Out[2]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 z 0.260476
+ y 0.897237
+
+ [4 rows x 1 columns]
+
+ In [3]: df.index.lexsort_depth
+ Out[3]: 1
# in prior versions this would raise a KeyError
# will now show a PerformanceWarning
- df.loc[(1, 'z')]
+ In [4]: df.loc[(1, 'z')]
+ Out[4]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
# lexically sorting
- df2 = df.sort_index()
- df2
- df2.index.lexsort_depth
- df2.loc[(1,'z')]
+ In [5]: df2 = df.sort_index()
+
+ In [6]: df2
+ Out[6]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 y 0.897237
+ z 0.260476
+
+ [4 rows x 1 columns]
+
+ In [7]: df2.index.lexsort_depth
+ Out[7]: 2
+
+ In [8]: df2.loc[(1,'z')]
+ Out[8]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
- Bug in unique of Series with ``category`` dtype, which returned all categories regardless
whether they were "used" or not (see :issue:`8559` for the discussion).
diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst
index 7d9008fdbdecd..ee6a60144bc35 100644
--- a/doc/source/whatsnew/v0.18.1.rst
+++ b/doc/source/whatsnew/v0.18.1.rst
@@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group:
df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
df
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
+ In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean())
+ Out[1]:
+ A
+ 1 0 NaN
+ 1 NaN
+ 2 NaN
+ 3 1.5
+ 4 2.5
+ 5 3.5
+ 6 4.5
+ 7 5.5
+ 8 6.5
+ 9 7.5
+ 10 8.5
+ 11 9.5
+ 12 10.5
+ 13 11.5
+ 14 12.5
+ 15 13.5
+ 16 14.5
+ 17 15.5
+ 18 16.5
+ 19 17.5
+ 2 20 NaN
+ 21 NaN
+ 22 NaN
+ 23 21.5
+ 24 22.5
+ 25 23.5
+ 26 24.5
+ 27 25.5
+ 28 26.5
+ 29 27.5
+ 30 28.5
+ 31 29.5
+ 3 32 NaN
+ 33 NaN
+ 34 NaN
+ 35 33.5
+ 36 34.5
+ 37 35.5
+ 38 36.5
+ 39 37.5
+ Name: B, dtype: float64
Now you can do:
@@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to:
df
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("group").apply(lambda x: x.resample("1D").ffill())
+ In [1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill())
+ Out[1]:
+ group val
+ group date
+ 1 2016-01-03 1 5
+ 2016-01-04 1 5
+ 2016-01-05 1 5
+ 2016-01-06 1 5
+ 2016-01-07 1 5
+ 2016-01-08 1 5
+ 2016-01-09 1 5
+ 2016-01-10 1 6
+ 2 2016-01-17 2 7
+ 2016-01-18 2 7
+ 2016-01-19 2 7
+ 2016-01-20 2 7
+ 2016-01-21 2 7
+ 2016-01-22 2 7
+ 2016-01-23 2 7
+ 2016-01-24 2 8
Now you can do:
-.. ipython:: python
+.. code-block:: ipython
- df.groupby("group").resample("1D").ffill()
+ In [1]: df.groupby("group").resample("1D").ffill()
+ Out[1]:
+ group val
+ group date
+ 1 2016-01-03 1 5
+ 2016-01-04 1 5
+ 2016-01-05 1 5
+ 2016-01-06 1 5
+ 2016-01-07 1 5
+ 2016-01-08 1 5
+ 2016-01-09 1 5
+ 2016-01-10 1 6
+ 2 2016-01-17 2 7
+ 2016-01-18 2 7
+ 2016-01-19 2 7
+ 2016-01-20 2 7
+ 2016-01-21 2 7
+ 2016-01-22 2 7
+ 2016-01-23 2 7
+ 2016-01-24 2 8
.. _whatsnew_0181.enhancements.method_chain:
diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
index a6848dad6e3cd..42af61be26355 100644
--- a/doc/source/whatsnew/v2.1.1.rst
+++ b/doc/source/whatsnew/v2.1.1.rst
@@ -13,13 +13,18 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`concat` when :class:`DataFrame` 's have two different extension dtypes (:issue:`54848`)
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
+- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
+- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`)
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
+- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)
@@ -28,7 +33,9 @@ Fixed regressions
Bug fixes
~~~~~~~~~
+- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)
+- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`)
.. ---------------------------------------------------------------------------
.. _whatsnew_211.other:
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 89b4d102fcf04..7bb4aaec0dd7c 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -146,12 +146,12 @@ Deprecations
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`)
- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`)
+- Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
- Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
- Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
--
.. ---------------------------------------------------------------------------
.. _whatsnew_220.performance:
@@ -168,6 +168,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
+- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
Categorical
^^^^^^^^^^^
@@ -245,7 +246,7 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^
--
+- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
-
Sparse
diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx
index 02934346130a5..7b306c5e681e0 100644
--- a/pandas/_libs/window/indexers.pyx
+++ b/pandas/_libs/window/indexers.pyx
@@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
break
# end bound is previous end
# or current index
+ elif index[end[i - 1]] == end_bound and not right_closed:
+ end[i] = end[i - 1] + 1
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
end[i] = i + 1
else:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index a4f58e99d8bcc..ac0275bf695d4 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -71,6 +71,7 @@
Index,
MultiIndex,
)
+from pandas.util.version import Version
if TYPE_CHECKING:
from collections.abc import (
@@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
item.add_marker(pytest.mark.arraymanager)
+hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
+if Version(hypothesis.__version__) >= Version("6.83.2"):
+ hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)
+
# Hypothesis
hypothesis.settings.register_profile(
"ci",
@@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
deadline=None,
- suppress_health_check=(hypothesis.HealthCheck.too_slow,),
+ suppress_health_check=tuple(hypothesis_health_checks),
)
hypothesis.settings.load_profile("ci")
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 4d6dd8f4fd577..26467a4a982fa 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -436,7 +436,13 @@ def compute_dict_like(
Data for result. When aggregating with a Series, this can contain any
Python object.
"""
+ from pandas.core.groupby.generic import (
+ DataFrameGroupBy,
+ SeriesGroupBy,
+ )
+
obj = self.obj
+ is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
func = cast(AggFuncTypeDict, self.func)
func = self.normalize_dictlike_arg(op_name, selected_obj, func)
@@ -450,7 +456,7 @@ def compute_dict_like(
colg = obj._gotitem(selection, ndim=1)
results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()]
keys = list(func.keys())
- elif is_non_unique_col:
+ elif not is_groupby and is_non_unique_col:
# key used for column selection and output
# GH#51099
results = []
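
A hedged repro sketch (data invented) for the regression this guard targets, matching the v2.1.1 note about DataFrameGroupBy.agg with duplicate column names and a dictionary (GH 55006): groupby objects no longer go down the non-unique-column branch.

```python
import pandas as pd

df = pd.DataFrame([[1, 2, 3], [1, 4, 5]], columns=["a", "b", "b"])

# Dict aggregation over the duplicated "b" columns of a grouped frame
df.groupby("a").agg({"b": "sum"})
```
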
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 4d887ecd1510f..83ed54c42a23c 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2192,11 +2192,11 @@ def _str_rstrip(self, to_strip=None):
return type(self)(result)
def _str_removeprefix(self, prefix: str):
- # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
- # starts_with = pc.starts_with(self._pa_array, pattern=prefix)
- # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
- # result = pc.if_else(starts_with, removed, self._pa_array)
- # return type(self)(result)
+ if not pa_version_under13p0:
+ starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+ result = pc.if_else(starts_with, removed, self._pa_array)
+ return type(self)(result)
predicate = lambda val: val.removeprefix(prefix)
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
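
Small usage sketch, assuming pyarrow >= 13 is installed, of the fast path restored above; on older pyarrow the elementwise fallback that follows still applies.

```python
import pyarrow as pa
import pandas as pd

s = pd.Series(["arrow_one", "two"], dtype=pd.ArrowDtype(pa.string()))
s.str.removeprefix("arrow_")
# 0    one
# 1    two
```
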
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c90127c0e9812..693ebad0ca16f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -123,7 +123,8 @@ def __init__(self, storage=None) -> None:
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
raise ValueError(
- f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
+ f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
+ f"Got {storage} instead."
)
if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0:
raise ImportError(
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index aaa515ac459bd..6262055827428 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -15,7 +15,10 @@
lib,
missing as libmissing,
)
-from pandas.compat import pa_version_under7p0
+from pandas.compat import (
+ pa_version_under7p0,
+ pa_version_under13p0,
+)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
@@ -47,6 +50,8 @@
if TYPE_CHECKING:
+ from collections.abc import Sequence
+
from pandas._typing import (
Dtype,
Scalar,
@@ -334,19 +339,13 @@ def _str_startswith(self, pat: str, na=None):
result = pc.starts_with(self._pa_array, pattern=pat)
if not isna(na):
result = result.fill_null(na)
- result = self._result_converter(result)
- if not isna(na):
- result[isna(result)] = bool(na)
- return result
+ return self._result_converter(result)
def _str_endswith(self, pat: str, na=None):
result = pc.ends_with(self._pa_array, pattern=pat)
if not isna(na):
result = result.fill_null(na)
- result = self._result_converter(result)
- if not isna(na):
- result[isna(result)] = bool(na)
- return result
+ return self._result_converter(result)
def _str_replace(
self,
@@ -365,6 +364,12 @@ def _str_replace(
result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
return type(self)(result)
+ def _str_repeat(self, repeats: int | Sequence[int]):
+ if not isinstance(repeats, int):
+ return super()._str_repeat(repeats)
+ else:
+ return type(self)(pc.binary_repeat(self._pa_array, repeats))
+
def _str_match(
self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
):
@@ -379,6 +384,19 @@ def _str_fullmatch(
pat = f"{pat}$"
return self._str_match(pat, case, flags, na)
+ def _str_slice(
+ self, start: int | None = None, stop: int | None = None, step: int | None = None
+ ):
+ if stop is None:
+ return super()._str_slice(start, stop, step)
+ if start is None:
+ start = 0
+ if step is None:
+ step = 1
+ return type(self)(
+ pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+ )
+
def _str_isalnum(self):
result = pc.utf8_is_alnum(self._pa_array)
return self._result_converter(result)
@@ -417,7 +435,7 @@ def _str_isupper(self):
def _str_len(self):
result = pc.utf8_length(self._pa_array)
- return Int64Dtype().__from_arrow__(result)
+ return self._convert_int_dtype(result)
def _str_lower(self):
return type(self)(pc.utf8_lower(self._pa_array))
@@ -446,6 +464,43 @@ def _str_rstrip(self, to_strip=None):
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)
+ def _str_removeprefix(self, prefix: str):
+ if not pa_version_under13p0:
+ starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+ result = pc.if_else(starts_with, removed, self._pa_array)
+ return type(self)(result)
+ return super()._str_removeprefix(prefix)
+
+ def _str_removesuffix(self, suffix: str):
+ ends_with = pc.ends_with(self._pa_array, pattern=suffix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
+ result = pc.if_else(ends_with, removed, self._pa_array)
+ return type(self)(result)
+
+ def _str_count(self, pat: str, flags: int = 0):
+ if flags:
+ return super()._str_count(pat, flags)
+ result = pc.count_substring_regex(self._pa_array, pat)
+ return self._convert_int_dtype(result)
+
+ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+ if start != 0 and end is not None:
+ slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+ result = pc.find_substring(slices, sub)
+ not_found = pc.equal(result, -1)
+ offset_result = pc.add(result, end - start)
+ result = pc.if_else(not_found, result, offset_result)
+ elif start == 0 and end is None:
+ slices = self._pa_array
+ result = pc.find_substring(slices, sub)
+ else:
+ return super()._str_find(sub, start, end)
+ return self._convert_int_dtype(result)
+
+ def _convert_int_dtype(self, result):
+ return Int64Dtype().__from_arrow__(result)
+
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
@@ -468,7 +523,10 @@ def _result_converter(cls, values, na=None):
def __getattribute__(self, item):
# ArrowStringArray and we both inherit from ArrowExtensionArray, which
# creates inheritance problems (Diamond inheritance)
- if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array":
+ if item in ArrowStringArrayMixin.__dict__ and item not in (
+ "_pa_array",
+ "__dict__",
+ ):
return partial(getattr(ArrowStringArrayMixin, item), self)
return super().__getattribute__(item)
@@ -526,34 +584,11 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))
def _convert_int_dtype(self, result):
+ result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result
- def _str_count(self, pat: str, flags: int = 0):
- if flags:
- return super()._str_count(pat, flags)
- result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
- return self._convert_int_dtype(result)
-
- def _str_len(self):
- result = pc.utf8_length(self._pa_array).to_numpy()
- return self._convert_int_dtype(result)
-
- def _str_find(self, sub: str, start: int = 0, end: int | None = None):
- if start != 0 and end is not None:
- slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
- result = pc.find_substring(slices, sub)
- not_found = pc.equal(result, -1)
- offset_result = pc.add(result, end - start)
- result = pc.if_else(not_found, result, offset_result)
- elif start == 0 and end is None:
- slices = self._pa_array
- result = pc.find_substring(slices, sub)
- else:
- return super()._str_find(sub, start, end)
- return self._convert_int_dtype(result.to_numpy())
-
def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
return result.to_numpy(np.bool_, na_value=False)
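
Not part of the patch: a quick sketch of the accessor calls that the new ArrowStringArray methods above keep inside pyarrow.compute instead of elementwise Python (exact dispatch depends on the installed pyarrow version).

```python
import pandas as pd

s = pd.Series(["spam", "eggs", None], dtype="string[pyarrow]")

s.str.slice(0, 2)        # pc.utf8_slice_codeunits
s.str.repeat(2)          # pc.binary_repeat
s.str.find("g")          # pc.find_substring, Int64 result
s.str.count("s")         # pc.count_substring_regex
s.str.removesuffix("s")  # pc.ends_with + pc.utf8_slice_codeunits
```
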
diff --git a/pandas/core/base.py b/pandas/core/base.py
index d973f8f5fe35a..3026189e747bb 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -485,8 +485,8 @@ def array(self) -> ExtensionArray:
types, this is the actual array. For NumPy native types, this
is a thin (no copy) wrapper around :class:`numpy.ndarray`.
- ``.array`` differs ``.values`` which may require converting the
- data to a different form.
+ ``.array`` differs from ``.values``, which may require converting
+ the data to a different form.
See Also
--------
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index f76163cbbd0a1..12de63967c78f 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -70,7 +70,7 @@
from collections.abc import MutableMapping
from datetime import tzinfo
- import pyarrow as pa # noqa: F811, TCH004
+ import pyarrow as pa # noqa: TCH004
from pandas._typing import (
Dtype,
@@ -2148,6 +2148,8 @@ def type(self):
return CategoricalDtypeType
elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
return list
+ elif pa.types.is_fixed_size_list(pa_type):
+ return list
elif pa.types.is_map(pa_type):
return list
elif pa.types.is_struct(pa_type):
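
Hedged example of what the added branch enables (GH 55000 in the v2.1.1 notes): an ArrowDtype over a pyarrow fixed-size list now maps to ``list`` instead of raising NotImplementedError.

```python
import pyarrow as pa
import pandas as pd

dtype = pd.ArrowDtype(pa.list_(pa.int64(), 2))  # fixed_size_list<item: int64>[2]
dtype.type  # <class 'list'>
```
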
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4bfa8a4415785..f1fc63bc4b1ea 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1926,11 +1926,17 @@ def to_dict(
self,
orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
into: type[dict] = ...,
+ index: bool = ...,
) -> dict:
...
@overload
- def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
+ def to_dict(
+ self,
+ orient: Literal["records"],
+ into: type[dict] = ...,
+ index: bool = ...,
+ ) -> list[dict]:
...
@deprecate_nonkeyword_arguments(
@@ -8863,20 +8869,20 @@ def update(
>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
... 'Parrot', 'Parrot'],
... 'Max Speed': [380., 370., 24., 26.]})
- >>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
- Animal Max Speed
+ >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x)
+ Max Speed
Animal
- Falcon 0 Falcon 380.0
- 1 Falcon 370.0
- Parrot 2 Parrot 24.0
- 3 Parrot 26.0
-
- >>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
- Animal Max Speed
- 0 Falcon 380.0
- 1 Falcon 370.0
- 2 Parrot 24.0
- 3 Parrot 26.0
+ Falcon 0 380.0
+ 1 370.0
+ Parrot 2 24.0
+ 3 26.0
+
+ >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x)
+ Max Speed
+ 0 380.0
+ 1 370.0
+ 2 24.0
+ 3 26.0
"""
)
)
@@ -11297,7 +11303,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
def any( # type: ignore[override]
self,
*,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool = False,
skipna: bool = True,
**kwargs,
@@ -11312,7 +11318,7 @@ def any( # type: ignore[override]
@doc(make_doc("all", ndim=2))
def all(
self,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool = False,
skipna: bool = True,
**kwargs,
@@ -11711,6 +11717,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series:
...
@@ -11721,6 +11728,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series | DataFrame:
...
@@ -11731,6 +11739,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series | DataFrame:
...
@@ -11830,11 +11839,10 @@ def quantile(
if not is_list_like(q):
# BlockManager.quantile expects listlike, so we wrap and unwrap here
- # error: List item 0 has incompatible type "Union[float, Union[Union[
- # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
- # expected "float"
- res_df = self.quantile( # type: ignore[call-overload]
- [q],
+ # error: List item 0 has incompatible type "float | ExtensionArray |
+ # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
+ res_df = self.quantile(
+ [q], # type: ignore[list-item]
axis=axis,
numeric_only=numeric_only,
interpolation=interpolation,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b9407ebe6624a..5c303e2a73bd7 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2847,7 +2847,7 @@ def to_sql(
index : bool, default True
Write DataFrame index as a column. Uses `index_label` as the column
- name in the table.
+ name in the table. Creates a table index for this column.
index_label : str or sequence, default None
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
@@ -5718,10 +5718,12 @@ def filter(
if items is not None:
name = self._get_axis_name(axis)
+ items = Index(items).intersection(labels)
+ if len(items) == 0:
+ # Keep the dtype of labels when we are empty
+ items = items.astype(labels.dtype)
# error: Keywords must be strings
- return self.reindex( # type: ignore[misc]
- **{name: labels.intersection(items)}
- )
+ return self.reindex(**{name: items}) # type: ignore[misc]
elif like:
def f(x) -> bool_t:
@@ -7938,6 +7940,51 @@ def replace(
else:
return result.__finalize__(self, method="replace")
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: Literal[False] = ...,
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> Self:
+ ...
+
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: Literal[True],
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> None:
+ ...
+
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: bool_t = ...,
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> Self | None:
+ ...
+
@final
def interpolate(
self,
@@ -8180,10 +8227,11 @@ def interpolate(
stacklevel=find_stack_level(),
)
- if "fill_value" in kwargs:
+ if method in fillna_methods and "fill_value" in kwargs:
raise ValueError(
"'fill_value' is not a valid keyword for "
- f"{type(self).__name__}.interpolate"
+ f"{type(self).__name__}.interpolate with method from "
+ f"{fillna_methods}"
)
if isinstance(obj.index, MultiIndex) and method != "linear":
@@ -8607,6 +8655,42 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
# GH 40420
return self.where(subset, threshold, axis=axis, inplace=inplace)
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: Literal[False] = ...,
+ **kwargs,
+ ) -> Self:
+ ...
+
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: Literal[True],
+ **kwargs,
+ ) -> None:
+ ...
+
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: bool_t = ...,
+ **kwargs,
+ ) -> Self | None:
+ ...
+
@final
def clip(
self,
@@ -11709,15 +11793,21 @@ def pct_change(
stacklevel=find_stack_level(),
)
if fill_method is lib.no_default:
- if self.isna().values.any():
- warnings.warn(
- "The default fill_method='pad' in "
- f"{type(self).__name__}.pct_change is deprecated and will be "
- "removed in a future version. Call ffill before calling "
- "pct_change to retain current behavior and silence this warning.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
+ cols = self.items() if self.ndim == 2 else [(None, self)]
+ for _, col in cols:
+ mask = col.isna().values
+ mask = mask[np.argmax(~mask) :]
+ if mask.any():
+ warnings.warn(
+ "The default fill_method='pad' in "
+ f"{type(self).__name__}.pct_change is deprecated and will be "
+ "removed in a future version. Call ffill before calling "
+ "pct_change to retain current behavior and silence this "
+ "warning.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ break
fill_method = "pad"
if limit is lib.no_default:
limit = None
@@ -11743,7 +11833,7 @@ def _logical_func(
self,
name: str,
func,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool_t = False,
skipna: bool_t = True,
**kwargs,
@@ -11756,7 +11846,10 @@ def _logical_func(
res = self._logical_func(
name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
)
- return res._logical_func(name, func, skipna=skipna, **kwargs)
+ # error: Item "bool" of "Series | bool" has no attribute "_logical_func"
+ return res._logical_func( # type: ignore[union-attr]
+ name, func, skipna=skipna, **kwargs
+ )
elif axis is None:
axis = 0
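
Illustrative sketch (data invented) of the DataFrame.filter change above: building ``items`` via ``Index(items).intersection(labels)`` keeps the order of ``items`` in the result (GH 54980), and an empty intersection keeps the dtype of the labels.

```python
import pandas as pd

df = pd.DataFrame({"A": [1], "B": [2], "C": [3]})

df.filter(items=["C", "A"])  # columns come back in the requested order: C, A
df.filter(items=["Z"])       # empty frame; column index keeps the labels' dtype
```
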
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 43d200027220b..e6dd6a990d285 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -180,6 +180,19 @@ class providing the base-class of operations.
A callable that takes a {input} as its first argument, and
returns a dataframe, a series or a scalar. In addition the
callable may take positional and keyword arguments.
+ include_groups : bool, default True
+ When True, will attempt to apply ``func`` to the groupings in
+ the case that they are columns of the DataFrame. If this raises a
+ TypeError, the result will be computed with the groupings excluded.
+ When False, the groupings will be excluded when applying ``func``.
+
+ .. versionadded:: 2.2.0
+
+ .. deprecated:: 2.2.0
+
+ Setting include_groups to True is deprecated. Only the value
+ False will be allowed in a future version of pandas.
+
args, kwargs : tuple and dict
Optional positional and keyword arguments to pass to ``func``.
@@ -272,7 +285,7 @@ class providing the base-class of operations.
each group together into a Series, including setting the index as
appropriate:
- >>> g1.apply(lambda x: x.C.max() - x.B.min())
+ >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)
A
a 5
b 2
@@ -1748,7 +1761,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
input="dataframe", examples=_apply_docs["dataframe_examples"]
)
)
- def apply(self, func, *args, **kwargs) -> NDFrameT:
+ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
orig_func = func
func = com.is_builtin_func(func)
if orig_func != func:
@@ -1781,10 +1794,25 @@ def f(g):
else:
f = func
+ if not include_groups:
+ return self._python_apply_general(f, self._obj_with_exclusions)
+
# ignore SettingWithCopy here in case the user mutates
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f, self._selected_obj)
+ if (
+ not isinstance(self.obj, Series)
+ and self._selection is None
+ and self._selected_obj.shape != self._obj_with_exclusions.shape
+ ):
+ warnings.warn(
+ message=_apply_groupings_depr.format(
+ type(self).__name__, "apply"
+ ),
+ category=FutureWarning,
+ stacklevel=find_stack_level(),
+ )
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
@@ -3520,7 +3548,7 @@ def describe(
return result
@final
- def resample(self, rule, *args, **kwargs) -> Resampler:
+ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler:
"""
Provide resampling when using a TimeGrouper.
@@ -3534,7 +3562,23 @@ def resample(self, rule, *args, **kwargs) -> Resampler:
----------
rule : str or DateOffset
The offset string or object representing target grouper conversion.
- *args, **kwargs
+ *args
+ Possible arguments are `how`, `fill_method`, `limit`, `kind` and
+ `on`, and other arguments of `TimeGrouper`.
+ include_groups : bool, default True
+ When True, will attempt to include the groupings in the operation in
+ the case that they are columns of the DataFrame. If this raises a
+ TypeError, the result will be computed with the groupings excluded.
+ When False, the groupings will be excluded when applying ``func``.
+
+ .. versionadded:: 2.2.0
+
+ .. deprecated:: 2.2.0
+
+ Setting include_groups to True is deprecated. Only the value
+ False will be allowed in a future version of pandas.
+
+ **kwargs
Possible arguments are `how`, `fill_method`, `limit`, `kind` and
`on`, and other arguments of `TimeGrouper`.
@@ -3570,59 +3614,71 @@ def resample(self, rule, *args, **kwargs) -> Resampler:
Downsample the DataFrame into 3 minute bins and sum the values of
the timestamps falling into a bin.
- >>> df.groupby('a').resample('3min').sum()
- a b
+ >>> df.groupby('a').resample('3min', include_groups=False).sum()
+ b
a
- 0 2000-01-01 00:00:00 0 2
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:00:00 5 1
+ 0 2000-01-01 00:00:00 2
+ 2000-01-01 00:03:00 1
+ 5 2000-01-01 00:00:00 1
Upsample the series into 30 second bins.
- >>> df.groupby('a').resample('30s').sum()
- a b
+ >>> df.groupby('a').resample('30s', include_groups=False).sum()
+ b
a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:00:30 0 0
- 2000-01-01 00:01:00 0 1
- 2000-01-01 00:01:30 0 0
- 2000-01-01 00:02:00 0 0
- 2000-01-01 00:02:30 0 0
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:02:00 5 1
+ 0 2000-01-01 00:00:00 1
+ 2000-01-01 00:00:30 0
+ 2000-01-01 00:01:00 1
+ 2000-01-01 00:01:30 0
+ 2000-01-01 00:02:00 0
+ 2000-01-01 00:02:30 0
+ 2000-01-01 00:03:00 1
+ 5 2000-01-01 00:02:00 1
Resample by month. Values are assigned to the month of the period.
- >>> df.groupby('a').resample('M').sum()
- a b
+ >>> df.groupby('a').resample('M', include_groups=False).sum()
+ b
a
- 0 2000-01-31 0 3
- 5 2000-01-31 5 1
+ 0 2000-01-31 3
+ 5 2000-01-31 1
Downsample the series into 3 minute bins as above, but close the right
side of the bin interval.
- >>> df.groupby('a').resample('3min', closed='right').sum()
- a b
+ >>> (
+ ... df.groupby('a')
+ ... .resample('3min', closed='right', include_groups=False)
+ ... .sum()
+ ... )
+ b
a
- 0 1999-12-31 23:57:00 0 1
- 2000-01-01 00:00:00 0 2
- 5 2000-01-01 00:00:00 5 1
+ 0 1999-12-31 23:57:00 1
+ 2000-01-01 00:00:00 2
+ 5 2000-01-01 00:00:00 1
Downsample the series into 3 minute bins and close the right side of
the bin interval, but label each bin using the right edge instead of
the left.
- >>> df.groupby('a').resample('3min', closed='right', label='right').sum()
- a b
+ >>> (
+ ... df.groupby('a')
+ ... .resample('3min', closed='right', label='right', include_groups=False)
+ ... .sum()
+ ... )
+ b
a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:03:00 0 2
- 5 2000-01-01 00:03:00 5 1
+ 0 2000-01-01 00:00:00 1
+ 2000-01-01 00:03:00 2
+ 5 2000-01-01 00:03:00 1
"""
from pandas.core.resample import get_resampler_for_grouping
- return get_resampler_for_grouping(self, rule, *args, **kwargs)
+ # mypy flags that include_groups could be specified via `*args` or `**kwargs`
+ # GH#54961 would resolve.
+ return get_resampler_for_grouping( # type: ignore[misc]
+ self, rule, *args, include_groups=include_groups, **kwargs
+ )
@final
def rolling(self, *args, **kwargs) -> RollingGroupby:
@@ -5728,3 +5784,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None])
return mi
+
+
+# GH#7155
+_apply_groupings_depr = (
+ "{}.{} operated on the grouping columns. This behavior is deprecated, "
+ "and in a future version of pandas the grouping columns will be excluded "
+ "from the operation. Either pass `include_groups=False` to exclude the "
+ "groupings or explicitly select the grouping columns after groupby to silence "
+ "this warning."
+)
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 694a420ad2494..c13ec51ff3851 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -262,7 +262,9 @@ def get_window_bounds(
# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
- if end_diff <= zero:
+ if end_diff == zero and not right_closed:
+ end[i] = end[i - 1] + 1
+ elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]
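
Rough sketch (values invented) of the window-bound fix above and in indexers.pyx: with ``closed='left'`` or ``closed='neither'``, duplicate datetimelike index entries are now treated as equal rather than consecutive (GH 20712 in the v2.2.0 notes).

```python
import pandas as pd

idx = pd.DatetimeIndex(["2020-01-01", "2020-01-01", "2020-01-02"])
s = pd.Series([1.0, 2.0, 3.0], index=idx)

# The duplicate timestamps now get the same left-closed window bounds.
s.rolling("2D", closed="left").sum()
```
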
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 781dfae7fef64..a8ef0e034ba9b 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -377,5 +377,5 @@ def all_indexes_same(indexes) -> bool:
def default_index(n: int) -> RangeIndex:
- rng = range(0, n)
+ rng = range(n)
return RangeIndex._simple_new(rng, name=None)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 6a397862712de..cd55997ad5f69 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4557,7 +4557,7 @@ def join(
-------
join_index, (left_indexer, right_indexer)
- Examples
+ Examples
--------
>>> idx1 = pd.Index([1, 2, 3])
>>> idx2 = pd.Index([4, 5, 6])
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 4d33f0137d3c4..b2d463a8c6c26 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -177,7 +177,7 @@ def concatenate_managers(
values = np.concatenate(vals, axis=1) # type: ignore[arg-type]
elif is_1d_only_ea_dtype(blk.dtype):
# TODO(EA2D): special-casing not needed with 2D EAs
- values = concat_compat(vals, axis=1, ea_compat_axis=True)
+ values = concat_compat(vals, axis=0, ea_compat_axis=True)
values = ensure_block_shape(values, ndim=2)
else:
values = concat_compat(vals, axis=1)
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 5ff18d8a25e36..9605bf154a8b7 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -32,7 +32,10 @@
Substitution,
doc,
)
-from pandas.util._exceptions import find_stack_level
+from pandas.util._exceptions import (
+ find_stack_level,
+ rewrite_warning,
+)
from pandas.core.dtypes.generic import (
ABCDataFrame,
@@ -57,6 +60,7 @@
from pandas.core.groupby.groupby import (
BaseGroupBy,
GroupBy,
+ _apply_groupings_depr,
_pipe_template,
get_groupby,
)
@@ -163,6 +167,7 @@ def __init__(
gpr_index: Index,
group_keys: bool = False,
selection=None,
+ include_groups: bool = True,
) -> None:
self._timegrouper = timegrouper
self.keys = None
@@ -171,6 +176,7 @@ def __init__(
self.kind = kind
self.group_keys = group_keys
self.as_index = True
+ self.include_groups = include_groups
self.obj, self.ax, self._indexer = self._timegrouper._set_grouper(
self._convert_obj(obj), sort=True, gpr_index=gpr_index
@@ -444,7 +450,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
# a DataFrame column, but aggregate_item_by_item operates column-wise
# on Series, raising AttributeError or KeyError
# (depending on whether the column lookup uses getattr/__getitem__)
- result = grouped.apply(how, *args, **kwargs)
+ result = _apply(
+ grouped, how, *args, include_groups=self.include_groups, **kwargs
+ )
except ValueError as err:
if "Must produce aggregated value" in str(err):
@@ -456,15 +464,21 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
# we have a non-reducing function
# try to evaluate
- result = grouped.apply(how, *args, **kwargs)
+ result = _apply(
+ grouped, how, *args, include_groups=self.include_groups, **kwargs
+ )
return self._wrap_result(result)
- def _get_resampler_for_grouping(self, groupby: GroupBy, key):
+ def _get_resampler_for_grouping(
+ self, groupby: GroupBy, key, include_groups: bool = True
+ ):
"""
Return the correct class for resampling with groupby.
"""
- return self._resampler_for_grouping(groupby=groupby, key=key, parent=self)
+ return self._resampler_for_grouping(
+ groupby=groupby, key=key, parent=self, include_groups=include_groups
+ )
def _wrap_result(self, result):
"""
@@ -1590,6 +1604,7 @@ def __init__(
groupby: GroupBy,
key=None,
selection: IndexLabel | None = None,
+ include_groups: bool = False,
) -> None:
# reached via ._gotitem and _get_resampler_for_grouping
@@ -1612,6 +1627,7 @@ def __init__(
self.ax = parent.ax
self.obj = parent.obj
+ self.include_groups = include_groups
@no_type_check
def _apply(self, f, *args, **kwargs):
@@ -1628,7 +1644,7 @@ def func(x):
return x.apply(f, *args, **kwargs)
- result = self._groupby.apply(func)
+ result = _apply(self._groupby, func, include_groups=self.include_groups)
return self._wrap_result(result)
_upsample = _apply
@@ -2003,6 +2019,7 @@ def get_resampler_for_grouping(
limit: int | None = None,
kind=None,
on=None,
+ include_groups: bool = True,
**kwargs,
) -> Resampler:
"""
@@ -2011,7 +2028,9 @@ def get_resampler_for_grouping(
# .resample uses 'on' similar to how .groupby uses 'key'
tg = TimeGrouper(freq=rule, key=on, **kwargs)
resampler = tg._get_resampler(groupby.obj, kind=kind)
- return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key)
+ return resampler._get_resampler_for_grouping(
+ groupby=groupby, include_groups=include_groups, key=tg.key
+ )
class TimeGrouper(Grouper):
@@ -2789,3 +2808,18 @@ def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None:
category=FutureWarning,
stacklevel=find_stack_level(),
)
+
+
+def _apply(
+ grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs
+) -> DataFrame:
+ # GH#7155 - rewrite warning to appear as if it came from `.resample`
+ target_message = "DataFrameGroupBy.apply operated on the grouping columns"
+ new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample")
+ with rewrite_warning(
+ target_message=target_message,
+ target_category=FutureWarning,
+ new_message=new_message,
+ ):
+ result = grouped.apply(how, *args, include_groups=include_groups, **kwargs)
+ return result
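
Minimal usage sketch, mirroring the docstring examples earlier in this patch: ``include_groups`` passed to a grouped resample is forwarded to the underlying apply, and the deprecation warning is rewritten to name ``resample``.

```python
import pandas as pd

df = pd.DataFrame(
    {"a": [0, 0, 0, 5], "b": [1, 1, 2, 1]},
    index=pd.date_range("2000-01-01", periods=4, freq="min"),
)

df.groupby("a").resample("3min", include_groups=False).sum()
```
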
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8ef3943ab0d8d..6d1ff07e07c76 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1272,12 +1272,7 @@ def _get_merge_keys(
# work-around for merge_asof(right_index=True)
right_keys.append(right.index._values)
if lk is not None and lk == rk: # FIXME: what about other NAs?
- # avoid key upcast in corner case (length-0)
- lk = cast(Hashable, lk)
- if len(left) > 0:
- right_drop.append(rk)
- else:
- left_drop.append(lk)
+ right_drop.append(rk)
else:
rk = cast(ArrayLike, rk)
right_keys.append(rk)
@@ -2421,7 +2416,8 @@ def _factorize_keys(
elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
- isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
+ isinstance(lk.dtype, StringDtype)
+ and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
):
import pyarrow as pa
import pyarrow.compute as pc
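
Hedged sketch of the broadened storage check above: merge keys backed by the ``string[pyarrow_numpy]`` dtype now take the pyarrow factorization path as well (dtype string assumed available in this pandas version).

```python
import pandas as pd

left = pd.DataFrame({"k": pd.array(["a", "b"], dtype="string[pyarrow_numpy]"), "v": [1, 2]})
right = pd.DataFrame({"k": pd.array(["b", "c"], dtype="string[pyarrow_numpy]"), "w": [3, 4]})

left.merge(right, on="k")
```
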
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 924b56f7a14d5..e8ca520e7b420 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -449,7 +449,7 @@ def _all_key():
return (margins_name,) + ("",) * (len(cols) - 1)
if len(rows) > 0:
- margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
+ margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
@@ -467,7 +467,7 @@ def _all_key():
margin_keys = table.columns
if len(cols):
- row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
+ row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc)
else:
row_margin = Series(np.nan, index=result.columns)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 9ffbfb9f1149f..b4b0f29019c31 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,6 +1,5 @@
from __future__ import annotations
-import abc
from collections.abc import (
Hashable,
Iterable,
@@ -549,7 +548,7 @@ def read_excel(
_WorkbookT = TypeVar("_WorkbookT")
-class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta):
+class BaseExcelReader(Generic[_WorkbookT]):
book: _WorkbookT
def __init__(
@@ -589,13 +588,11 @@ def __init__(
)
@property
- @abc.abstractmethod
def _workbook_class(self) -> type[_WorkbookT]:
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT:
- pass
+ raise NotImplementedError
def close(self) -> None:
if hasattr(self, "book"):
@@ -611,21 +608,17 @@ def close(self) -> None:
self.handles.close()
@property
- @abc.abstractmethod
def sheet_names(self) -> list[str]:
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_by_name(self, name: str):
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_by_index(self, index: int):
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_data(self, sheet, rows: int | None = None):
- pass
+ raise NotImplementedError
def raise_if_bad_sheet_by_index(self, index: int) -> None:
n_sheets = len(self.sheet_names)
@@ -940,7 +933,7 @@ def parse(
@doc(storage_options=_shared_docs["storage_options"])
-class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta):
+class ExcelWriter(Generic[_WorkbookT]):
"""
Class for writing DataFrame objects into excel sheets.
@@ -1178,20 +1171,19 @@ def engine(self) -> str:
return self._engine
@property
- @abc.abstractmethod
def sheets(self) -> dict[str, Any]:
"""Mapping of sheet names to sheet objects."""
+ raise NotImplementedError
@property
- @abc.abstractmethod
def book(self) -> _WorkbookT:
"""
Book instance. Class type will depend on the engine used.
This attribute can be used to access engine-specific features.
"""
+ raise NotImplementedError
- @abc.abstractmethod
def _write_cells(
self,
cells,
@@ -1214,12 +1206,13 @@ def _write_cells(
freeze_panes: int tuple of length 2
contains the bottom-most row and right-most column to freeze
"""
+ raise NotImplementedError
- @abc.abstractmethod
def _save(self) -> None:
"""
Save workbook to disk.
"""
+ raise NotImplementedError
def __init__(
self,
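
A generic sketch of the behavioural difference the hunks above rely on when dropping abc.ABCMeta in favour of methods that raise NotImplementedError; the class names here are throwaway illustrations, not the pandas classes:

import abc

class WithABC(metaclass=abc.ABCMeta):
    @abc.abstractmethod
    def _save(self) -> None:
        ...

class WithoutABC:
    def _save(self) -> None:
        raise NotImplementedError

# WithABC()        # TypeError at instantiation: abstract method not implemented
w = WithoutABC()   # instantiation is allowed
# w._save()        # NotImplementedError is raised only when the method is called
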
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 9970d465ced9d..b344d9849f16c 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -941,9 +941,7 @@ def write(
if isinstance(writer, ExcelWriter):
need_save = False
else:
- # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
- # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
- writer = ExcelWriter( # type: ignore[abstract]
+ writer = ExcelWriter(
writer,
engine=engine,
storage_options=storage_options,
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 833f4986b6da6..52ea072d1483f 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -82,6 +82,7 @@
JSONEngine,
JSONSerializable,
ReadBuffer,
+ Self,
StorageOptions,
WriteBuffer,
)
@@ -1056,7 +1057,7 @@ def close(self) -> None:
if self.handles is not None:
self.handles.close()
- def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
+ def __iter__(self) -> Self:
return self
@overload
@@ -1099,7 +1100,7 @@ def __next__(self) -> DataFrame | Series:
else:
return obj
- def __enter__(self) -> JsonReader[FrameSeriesStrT]:
+ def __enter__(self) -> Self:
return self
def __exit__(
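
A generic sketch of the Self typing pattern the annotations above switch to; the typing_extensions fallback is an assumption about the environment (pandas itself imports Self from its own typing module):

import sys

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

class Reader:
    def __enter__(self) -> Self:
        # Self resolves to the runtime subclass, so subclasses keep their
        # own type in `with` blocks and iteration.
        return self

    def __exit__(self, *exc) -> None:
        pass

    def __iter__(self) -> Self:
        return self
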
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 10d3ab230cb9d..e0f171035e89e 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -1307,6 +1307,51 @@ def read_table(
return _read(filepath_or_buffer, kwds)
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: Literal[True],
+ chunksize: int | None = ...,
+ **kwds,
+) -> TextFileReader:
+ ...
+
+
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: bool = ...,
+ chunksize: int,
+ **kwds,
+) -> TextFileReader:
+ ...
+
+
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: Literal[False] = ...,
+ chunksize: None = ...,
+ **kwds,
+) -> DataFrame:
+ ...
+
+
def read_fwf(
filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
*,
@@ -1314,6 +1359,8 @@ def read_fwf(
widths: Sequence[int] | None = None,
infer_nrows: int = 100,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+ iterator: bool = False,
+ chunksize: int | None = None,
**kwds,
) -> DataFrame | TextFileReader:
r"""
@@ -1412,6 +1459,8 @@ def read_fwf(
kwds["colspecs"] = colspecs
kwds["infer_nrows"] = infer_nrows
kwds["engine"] = "python-fwf"
+ kwds["iterator"] = iterator
+ kwds["chunksize"] = chunksize
check_dtype_backend(dtype_backend)
kwds["dtype_backend"] = dtype_backend
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 7669d5aa4cea5..0788d9da06eb9 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -138,7 +138,7 @@ def _parse_date_columns(data_frame, parse_dates):
if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates:
try:
fmt = parse_dates[col_name]
- except TypeError:
+ except (KeyError, TypeError):
fmt = None
data_frame.isetitem(i, _handle_date_column(df_col, format=fmt))
@@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str:
adapt_date_iso = lambda val: val.isoformat()
adapt_datetime_iso = lambda val: val.isoformat()
- adapt_datetime_epoch = lambda val: int(val.timestamp())
sqlite3.register_adapter(time, _adapt_time)
sqlite3.register_adapter(date, adapt_date_iso)
sqlite3.register_adapter(datetime, adapt_datetime_iso)
- sqlite3.register_adapter(datetime, adapt_datetime_epoch)
convert_date = lambda val: date.fromisoformat(val.decode())
convert_datetime = lambda val: datetime.fromisoformat(val.decode())
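
A stand-alone sqlite3 sketch (plain sqlite3, not pandas code) of what the remaining ISO-format adapter does once the duplicate epoch-seconds registration above is removed:

import sqlite3
from datetime import datetime

# Only the ISO-format adapter is registered, so datetimes are stored as
# ISO 8601 strings instead of being overwritten by an epoch-seconds adapter.
sqlite3.register_adapter(datetime, lambda val: val.isoformat())

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (ts TEXT)")
con.execute("INSERT INTO t VALUES (?)", (datetime(2023, 9, 1, 12, 30),))
print(con.execute("SELECT ts FROM t").fetchone())  # ('2023-09-01T12:30:00',)
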
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 09f9f788dc3e4..c1d424f12bfc4 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -241,9 +241,10 @@ def test_setitem_invalid_indexer_raises():
@skip_if_no_pyarrow
-def test_pickle_roundtrip():
+@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
+def test_pickle_roundtrip(dtype):
# GH 42600
- expected = pd.Series(range(10), dtype="string[pyarrow]")
+ expected = pd.Series(range(10), dtype=dtype)
expected_sliced = expected.head(2)
full_pickled = pickle.dumps(expected)
sliced_pickled = pickle.dumps(expected_sliced)
@@ -255,3 +256,11 @@ def test_pickle_roundtrip():
result_sliced = pickle.loads(sliced_pickled)
tm.assert_series_equal(result_sliced, expected_sliced)
+
+
+@skip_if_no_pyarrow
+def test_string_dtype_error_message():
+ # GH#55051
+ msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'."
+ with pytest.raises(ValueError, match=msg):
+ StringDtype("bla")
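
A minimal construction sketch of the "pyarrow_numpy" string storage exercised by the tests above, assuming pyarrow is installed:

import pandas as pd

ser = pd.Series(["a", "b", None], dtype="string[pyarrow_numpy]")
print(ser.dtype.storage)   # "pyarrow_numpy"

# An unknown storage raises the error message the new test checks for:
# pd.StringDtype("bla")    # ValueError: Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'.
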
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
index 489f43729a004..5c21c4f7137a5 100644
--- a/pandas/tests/extension/base/groupby.py
+++ b/pandas/tests/extension/base/groupby.py
@@ -108,9 +108,13 @@ def test_groupby_extension_transform(self, data_for_grouping):
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
- df.groupby("B", group_keys=False).apply(groupby_apply_op)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("B", group_keys=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
- df.groupby("A", group_keys=False).apply(groupby_apply_op)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("A", group_keys=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False).B.apply(groupby_apply_op)
def test_groupby_apply_identity(self, data_for_grouping):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 5f1b16a44b8e9..fa6e85ba204d2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2992,6 +2992,15 @@ def test_groupby_count_return_arrow_dtype(data_missing):
tm.assert_frame_equal(result, expected)
+def test_fixed_size_list():
+ # GH#55000
+ ser = pd.Series(
+ [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2))
+ )
+ result = ser.dtype.type
+ assert result == list
+
+
def test_arrowextensiondtype_dataframe_repr():
# GH 54062
df = pd.DataFrame(
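
A minimal sketch mirroring the new fixed-size-list test above, assuming pyarrow is installed:

import pandas as pd
import pyarrow as pa

# ArrowDtype backed by a pyarrow fixed-size list; .type maps to the
# Python list type after the change covered by GH#55000.
dtype = pd.ArrowDtype(pa.list_(pa.int64(), list_size=2))
ser = pd.Series([[1, 2], [3, 4]], dtype=dtype)
print(ser.dtype.type)  # <class 'list'>
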
diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py
index 95fcaaa473067..e7901ed363106 100644
--- a/pandas/tests/frame/methods/test_copy.py
+++ b/pandas/tests/frame/methods/test_copy.py
@@ -56,7 +56,7 @@ def test_copy_consolidates(self):
}
)
- for i in range(0, 10):
+ for i in range(10):
df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55)
assert len(df._mgr.blocks) == 11
diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py
index 1a2fbf8a65a55..9d5e6876bb08c 100644
--- a/pandas/tests/frame/methods/test_filter.py
+++ b/pandas/tests/frame/methods/test_filter.py
@@ -137,3 +137,17 @@ def test_filter_regex_non_string(self):
result = df.filter(regex="STRING")
expected = df[["STRING"]]
tm.assert_frame_equal(result, expected)
+
+ def test_filter_keep_order(self):
+ # GH#54980
+ df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ result = df.filter(items=["B", "A"])
+ expected = df[["B", "A"]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_filter_different_dtype(self):
+ # GH#54980
+ df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]})
+ result = df.filter(items=["B", "A"])
+ expected = df[[]]
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py
index d0153da038a75..ede212ae18ae9 100644
--- a/pandas/tests/frame/methods/test_pct_change.py
+++ b/pandas/tests/frame/methods/test_pct_change.py
@@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method):
index=["a", "b"] * 3,
)
tm.assert_frame_equal(result, expected)
+
+
+def test_pct_change_none_beginning_no_warning():
+ # GH#54481
+ df = DataFrame(
+ [
+ [1, None],
+ [2, 1],
+ [3, 2],
+ [4, 3],
+ [5, 4],
+ ]
+ )
+ result = df.pct_change()
+ expected = DataFrame(
+ {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]}
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index 0858e33a989b7..56bdd2fc664cc 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -26,7 +26,7 @@
isna,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
class TestReindexSetIndex:
@@ -1082,7 +1082,9 @@ def test_reindex_with_categoricalindex(self):
{
"A": np.arange(3, dtype="int64"),
},
- index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("abc"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
# reindexing
@@ -1111,13 +1113,13 @@ def test_reindex_with_categoricalindex(self):
result = df.reindex(Categorical(["a", "e"], categories=cats))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
+ {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a"], categories=cats))
expected = DataFrame(
- {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
+ {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
@@ -1138,13 +1140,19 @@ def test_reindex_with_categoricalindex(self):
# give back the type of categorical that we received
result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
+ {
+ "A": [0, np.nan],
+ "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)),
+ }
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
+ {
+ "A": [0, np.nan],
+ "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])),
+ }
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
@@ -1152,7 +1160,9 @@ def test_reindex_with_categoricalindex(self):
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
# passed duplicate indexers are not allowed
msg = "cannot reindex on an axis with duplicate labels"
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
index d99dd36f3a2e3..339e19254fd10 100644
--- a/pandas/tests/frame/methods/test_reset_index.py
+++ b/pandas/tests/frame/methods/test_reset_index.py
@@ -788,15 +788,15 @@ def test_errorreset_index_rename(float_frame):
def test_reset_index_false_index_name():
- result_series = Series(data=range(5, 10), index=range(0, 5))
+ result_series = Series(data=range(5, 10), index=range(5))
result_series.index.name = False
result_series.reset_index()
- expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False))
+ expected_series = Series(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_series_equal(result_series, expected_series)
# GH 38147
- result_frame = DataFrame(data=range(5, 10), index=range(0, 5))
+ result_frame = DataFrame(data=range(5, 10), index=range(5))
result_frame.index.name = False
result_frame.reset_index()
- expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False))
+ expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
index 228b62a418813..985a9e3602410 100644
--- a/pandas/tests/frame/methods/test_sort_index.py
+++ b/pandas/tests/frame/methods/test_sort_index.py
@@ -911,7 +911,7 @@ def test_sort_index_multiindex_sparse_column(self):
expected = DataFrame(
{
i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0))
- for i in range(0, 4)
+ for i in range(4)
},
index=MultiIndex.from_product([[1, 2], [1, 2]]),
)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 3e2cde37c30eb..fd851ab244cb8 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -692,12 +692,12 @@ def test_constructor_error_msgs(self):
arr = np.array([[4, 5, 6]])
msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
+ DataFrame(index=[0], columns=range(4), data=arr)
arr = np.array([4, 5, 6])
msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
+ DataFrame(index=[0], columns=range(4), data=arr)
# higher dim raise exception
with pytest.raises(ValueError, match="Must pass 2-d input"):
@@ -2391,7 +2391,7 @@ def test_construct_with_two_categoricalindex_series(self):
def test_constructor_series_nonexact_categoricalindex(self):
# GH 42424
- ser = Series(range(0, 100))
+ ser = Series(range(100))
ser1 = cut(ser, 10).value_counts().head(5)
ser2 = cut(ser, 10).value_counts().tail(5)
result = DataFrame({"1": ser1, "2": ser2})
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
index dbd1f96fc17c9..b54a795af4fdc 100644
--- a/pandas/tests/frame/test_stack_unstack.py
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -1767,7 +1767,9 @@ def test_unstack_bug(self, future_stack):
}
)
- result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
unstacked = result.unstack()
restacked = unstacked.stack(future_stack=future_stack)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index c01ca4922a84b..882f42ff18bdd 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -515,6 +515,18 @@ def test_groupby_agg_dict_with_getitem():
tm.assert_frame_equal(result, expected)
+def test_groupby_agg_dict_dup_columns():
+ # GH#55006
+ df = DataFrame(
+ [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
+ columns=["a", "b", "c", "c"],
+ )
+ gb = df.groupby("a")
+ result = gb.agg({"b": "sum"})
+ expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a"))
+ tm.assert_frame_equal(result, expected)
+
+
@pytest.mark.parametrize(
"op",
[
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 9d3ebbd3672ae..7ea107f254104 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -499,13 +499,17 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
- assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
- assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
def test_sum_uint64_overflow():
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index d04ee7cec0db1..abcb9f68e0f5c 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -28,7 +28,9 @@ def test_apply_func_that_appends_group_to_list_without_copy():
def store(group):
groups.append(group)
- df.groupby("index").apply(store)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("index").apply(store)
expected_value = DataFrame(
{"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)
)
@@ -71,9 +73,11 @@ def test_apply_issues():
["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
)
expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
- result = df.groupby("date", group_keys=False).apply(
- lambda x: x["time"][x["value"].idxmax()]
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("date", group_keys=False).apply(
+ lambda x: x["time"][x["value"].idxmax()]
+ )
tm.assert_series_equal(result, expected)
@@ -179,7 +183,9 @@ def f_constant_df(group):
for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
del names[:]
- df.groupby("a", group_keys=False).apply(func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("a", group_keys=False).apply(func)
assert names == group_names
@@ -197,9 +203,11 @@ def test_group_apply_once_per_group2(capsys):
index=["0", "2", "4", "6", "8", "10", "12", "14"],
)
- df.groupby("group_by_column", group_keys=False).apply(
- lambda df: print("function_called")
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("group_by_column", group_keys=False).apply(
+ lambda df: print("function_called")
+ )
result = capsys.readouterr().out.count("function_called")
# If `groupby` behaves unexpectedly, this test will break
@@ -219,8 +227,11 @@ def slow(group):
def fast(group):
return group.copy()
- fast_df = df.groupby("A", group_keys=False).apply(fast)
- slow_df = df.groupby("A", group_keys=False).apply(slow)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ fast_df = df.groupby("A", group_keys=False).apply(fast)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ slow_df = df.groupby("A", group_keys=False).apply(slow)
tm.assert_frame_equal(fast_df, slow_df)
@@ -242,7 +253,9 @@ def test_groupby_apply_identity_maybecopy_index_identical(func):
df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
- result = df.groupby("g", group_keys=False).apply(func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("g", group_keys=False).apply(func)
tm.assert_frame_equal(result, df)
@@ -285,8 +298,11 @@ def test_groupby_as_index_apply():
tm.assert_index_equal(res_as, exp)
tm.assert_index_equal(res_not_as, exp)
- res_as_apply = g_as.apply(lambda x: x.head(2)).index
- res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ res_as_apply = g_as.apply(lambda x: x.head(2)).index
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
# apply doesn't maintain the original ordering
# changed in GH5610 as the as_index=False returns a MI here
@@ -299,7 +315,9 @@ def test_groupby_as_index_apply():
ind = Index(list("abcde"))
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
- res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index
tm.assert_index_equal(res, ind)
@@ -328,13 +346,19 @@ def desc3(group):
# weirdo
return result
- result = grouped.apply(desc)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(desc)
assert result.index.names == ("A", "B", "stat")
- result2 = grouped.apply(desc2)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result2 = grouped.apply(desc2)
assert result2.index.names == ("A", "B", "stat")
- result3 = grouped.apply(desc3)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result3 = grouped.apply(desc3)
assert result3.index.names == ("A", "B", None)
@@ -364,7 +388,9 @@ def test_apply_series_yield_constant(df):
def test_apply_frame_yield_constant(df):
# GH13568
- result = df.groupby(["A", "B"]).apply(len)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(["A", "B"]).apply(len)
assert isinstance(result, Series)
assert result.name is None
@@ -375,7 +401,9 @@ def test_apply_frame_yield_constant(df):
def test_apply_frame_to_series(df):
grouped = df.groupby(["A", "B"])
- result = grouped.apply(len)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(len)
expected = grouped.count()["C"]
tm.assert_index_equal(result.index, expected.index)
tm.assert_numpy_array_equal(result.values, expected.values)
@@ -384,7 +412,9 @@ def test_apply_frame_to_series(df):
def test_apply_frame_not_as_index_column_name(df):
# GH 35964 - path within _wrap_applied_output not hit by a test
grouped = df.groupby(["A", "B"], as_index=False)
- result = grouped.apply(len)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(len)
expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D")
# TODO(GH#34306): Use assert_frame_equal when column name is not np.nan
tm.assert_index_equal(result.index, expected.index)
@@ -407,7 +437,9 @@ def trans2(group):
}
)
- result = df.groupby("A").apply(trans)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(trans)
exp = df.groupby("A")["C"].apply(trans2)
tm.assert_series_equal(result, exp, check_names=False)
assert result.name == "C"
@@ -436,7 +468,9 @@ def test_apply_chunk_view(group_keys):
# Low level tinkering could be unsafe, make sure not
df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
- result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2])
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2])
expected = df.take([0, 1, 3, 4, 6, 7])
if group_keys:
expected.index = MultiIndex.from_arrays(
@@ -457,7 +491,9 @@ def test_apply_no_name_column_conflict():
# it works! #2605
grouped = df.groupby(["name", "name2"])
- grouped.apply(lambda x: x.sort_values("value", inplace=True))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grouped.apply(lambda x: x.sort_values("value", inplace=True))
def test_apply_typecast_fail():
@@ -474,7 +510,9 @@ def f(group):
group["v2"] = (v - v.min()) / (v.max() - v.min())
return group
- result = df.groupby("d", group_keys=False).apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("d", group_keys=False).apply(f)
expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)
@@ -498,7 +536,9 @@ def f(group):
group["v2"] = (v - v.min()) / (v.max() - v.min())
return group
- result = df.groupby("d", group_keys=False).apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("d", group_keys=False).apply(f)
expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)
@@ -536,8 +576,11 @@ def filt2(x):
else:
return x[x.category == "c"]
- expected = data.groupby("id_field").apply(filt1)
- result = data.groupby("id_field").apply(filt2)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = data.groupby("id_field").apply(filt1)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = data.groupby("id_field").apply(filt2)
tm.assert_frame_equal(result, expected)
@@ -556,7 +599,9 @@ def test_apply_with_duplicated_non_sorted_axis(test_series):
expected = ser.sort_index()
tm.assert_series_equal(result, expected)
else:
- result = df.groupby("Y", group_keys=False).apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("Y", group_keys=False).apply(lambda x: x)
# not expecting the order to remain the same for duplicated axis
result = result.sort_values("Y")
@@ -601,7 +646,9 @@ def f(g):
g["value3"] = g["value1"] * 2
return g
- result = grouped.apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(f)
assert "value3" in result
@@ -615,9 +662,13 @@ def test_apply_numeric_coercion_when_datetime():
df = DataFrame(
{"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
)
- expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
df.Date = pd.to_datetime(df.Date)
- result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
tm.assert_series_equal(result["Str"], expected["Str"])
# GH 15421
@@ -628,7 +679,9 @@ def test_apply_numeric_coercion_when_datetime():
def get_B(g):
return g.iloc[0][["B"]]
- result = df.groupby("A").apply(get_B)["B"]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(get_B)["B"]
expected = df.B
expected.index = df.A
tm.assert_series_equal(result, expected)
@@ -653,8 +706,11 @@ def predictions(tool):
)
df2 = df1.copy()
df2.oTime = pd.to_datetime(df2.oTime)
- expected = df1.groupby("Key").apply(predictions).p1
- result = df2.groupby("Key").apply(predictions).p1
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df1.groupby("Key").apply(predictions).p1
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df2.groupby("Key").apply(predictions).p1
tm.assert_series_equal(expected, result)
@@ -669,11 +725,13 @@ def test_apply_aggregating_timedelta_and_datetime():
}
)
df["time_delta_zero"] = df.datetime - df.datetime
- result = df.groupby("clientid").apply(
- lambda ddf: Series(
- {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("clientid").apply(
+ lambda ddf: Series(
+ {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
+ )
)
- )
expected = DataFrame(
{
"clientid": ["A", "B", "C"],
@@ -716,11 +774,15 @@ def func_with_no_date(batch):
def func_with_date(batch):
return Series({"b": datetime(2015, 1, 1), "c": 2})
- dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date)
dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1])
dfg_no_conversion_expected.index.name = "a"
- dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
dfg_conversion_expected = DataFrame(
{"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
)
@@ -764,7 +826,9 @@ def test_groupby_apply_all_none():
def test_func(x):
pass
- result = test_df.groupby("groups").apply(test_func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = test_df.groupby("groups").apply(test_func)
expected = DataFrame()
tm.assert_frame_equal(result, expected)
@@ -779,8 +843,11 @@ def test_func(x):
return None
return x.iloc[[0, -1]]
- result1 = test_df1.groupby("groups").apply(test_func)
- result2 = test_df2.groupby("groups").apply(test_func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result1 = test_df1.groupby("groups").apply(test_func)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result2 = test_df2.groupby("groups").apply(test_func)
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
@@ -793,7 +860,9 @@ def test_groupby_apply_return_empty_chunk():
# GH 22221: apply filter which returns some empty groups
df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]})
groups = df.groupby("group")
- result = groups.apply(lambda group: group[group.value != 1]["value"])
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = groups.apply(lambda group: group[group.value != 1]["value"])
expected = Series(
[0],
name="value",
@@ -820,7 +889,9 @@ def test_apply_with_mixed_types():
def test_func_returns_object():
# GH 28652
df = DataFrame({"a": [1, 2]}, index=Index([1, 2]))
- result = df.groupby("a").apply(lambda g: g.index)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("a").apply(lambda g: g.index)
expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a"))
tm.assert_series_equal(result, expected)
@@ -837,7 +908,9 @@ def test_apply_datetime_issue(group_column_dtlike):
# standard int values in range(len(num_columns))
df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
- result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
expected = DataFrame(
["spam"], Index(["foo"], dtype="object", name="a"), columns=[42]
@@ -876,7 +949,9 @@ def test_apply_series_return_dataframe_groups():
def most_common_values(df):
return Series({c: s.value_counts().index[0] for c, s in df.items()})
- result = tdf.groupby("day").apply(most_common_values)["userId"]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = tdf.groupby("day").apply(most_common_values)["userId"]
expected = Series(
["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId"
)
@@ -917,7 +992,9 @@ def test_groupby_apply_datetime_result_dtypes():
],
columns=["observation", "color", "mood", "intensity", "score"],
)
- result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
expected = Series(
[np.dtype("datetime64[ns]"), object, object, np.int64, object],
index=["observation", "color", "mood", "intensity", "score"],
@@ -937,7 +1014,9 @@ def test_groupby_apply_datetime_result_dtypes():
def test_apply_index_has_complex_internals(index):
# GH 31248
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
- result = df.groupby("group", group_keys=False).apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("group", group_keys=False).apply(lambda x: x)
tm.assert_frame_equal(result, df)
@@ -960,7 +1039,9 @@ def test_apply_index_has_complex_internals(index):
def test_apply_function_returns_non_pandas_non_scalar(function, expected_values):
# GH 31441
df = DataFrame(["A", "A", "B", "B"], columns=["groups"])
- result = df.groupby("groups").apply(function)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("groups").apply(function)
expected = Series(expected_values, index=Index(["A", "B"], name="groups"))
tm.assert_series_equal(result, expected)
@@ -972,7 +1053,9 @@ def fct(group):
df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]})
- result = df.groupby("A").apply(fct)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(fct)
expected = Series(
[[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A")
)
@@ -983,7 +1066,9 @@ def fct(group):
def test_apply_function_index_return(function):
# GH: 22541
df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"])
- result = df.groupby("id").apply(function)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("id").apply(function)
expected = Series(
[Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])],
index=Index([1, 2, 3], name="id"),
@@ -1019,7 +1104,9 @@ def test_apply_result_type(group_keys, udf):
# We'd like to control whether the group keys end up in the index
# regardless of whether the UDF happens to be a transform.
df = DataFrame({"A": ["a", "b"], "B": [1, 2]})
- df_result = df.groupby("A", group_keys=group_keys).apply(udf)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df_result = df.groupby("A", group_keys=group_keys).apply(udf)
series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf)
if group_keys:
@@ -1034,8 +1121,11 @@ def test_result_order_group_keys_false():
# GH 34998
# apply result order should not depend on whether index is the same or just equal
df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]})
- result = df.groupby("A", group_keys=False).apply(lambda x: x)
- expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A", group_keys=False).apply(lambda x: x)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy())
tm.assert_frame_equal(result, expected)
@@ -1047,8 +1137,15 @@ def test_apply_with_timezones_aware():
df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz})
df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz})
- result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
- result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result1 = df1.groupby("x", group_keys=False).apply(
+ lambda df: df[["x", "y"]].copy()
+ )
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result2 = df2.groupby("x", group_keys=False).apply(
+ lambda df: df[["x", "y"]].copy()
+ )
tm.assert_frame_equal(result1, result2)
@@ -1103,7 +1200,9 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
)
grp = df.groupby(["A", "B"])
- result = grp.apply(lambda x: x.head(1))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grp.apply(lambda x: x.head(1))
expected = df.iloc[[0, 2, 3]]
expected = expected.reset_index()
@@ -1151,7 +1250,9 @@ def test_apply_dropna_with_indexed_same(dropna):
},
index=list("xxyxz"),
)
- result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x)
expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]]
tm.assert_frame_equal(result, expected)
@@ -1176,7 +1277,9 @@ def test_apply_dropna_with_indexed_same(dropna):
def test_apply_as_index_constant_lambda(as_index, expected):
# GH 13217
df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]})
- result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1)
tm.assert_equal(result, expected)
@@ -1186,7 +1289,9 @@ def test_sort_index_groups():
{"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]},
index=range(5),
)
- result = df.groupby("C").apply(lambda x: x.A.sort_index())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("C").apply(lambda x: x.A.sort_index())
expected = Series(
range(1, 6),
index=MultiIndex.from_tuples(
@@ -1206,9 +1311,11 @@ def test_positional_slice_groups_datetimelike():
"let": list("abcde"),
}
)
- result = expected.groupby(
- [expected.let, expected.date.dt.date], group_keys=False
- ).apply(lambda x: x.iloc[0:])
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = expected.groupby(
+ [expected.let, expected.date.dt.date], group_keys=False
+ ).apply(lambda x: x.iloc[0:])
tm.assert_frame_equal(result, expected)
@@ -1251,24 +1358,29 @@ def test_apply_na(dropna):
{"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
)
dfgrp = df.groupby("grp", dropna=dropna)
- result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
- expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
tm.assert_frame_equal(result, expected)
def test_apply_empty_string_nan_coerce_bug():
# GH#24903
- result = (
- DataFrame(
- {
- "a": [1, 1, 2, 2],
- "b": ["", "", "", ""],
- "c": pd.to_datetime([1, 2, 3, 4], unit="s"),
- }
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = (
+ DataFrame(
+ {
+ "a": [1, 1, 2, 2],
+ "b": ["", "", "", ""],
+ "c": pd.to_datetime([1, 2, 3, 4], unit="s"),
+ }
+ )
+ .groupby(["a", "b"])
+ .apply(lambda df: df.iloc[-1])
)
- .groupby(["a", "b"])
- .apply(lambda df: df.iloc[-1])
- )
expected = DataFrame(
[[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]],
columns=["a", "b", "c"],
@@ -1293,9 +1405,11 @@ def test_apply_index_key_error_bug(index_values):
},
index=Index(["a2", "a3", "aa"], name="a"),
)
- result = result.groupby("a").apply(
- lambda df: Series([df["b"].mean()], index=["b_mean"])
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = result.groupby("a").apply(
+ lambda df: Series([df["b"].mean()], index=["b_mean"])
+ )
tm.assert_frame_equal(result, expected)
@@ -1343,7 +1457,9 @@ def test_apply_index_key_error_bug(index_values):
def test_apply_nonmonotonic_float_index(arg, idx):
# GH 34455
expected = DataFrame({"col": arg}, index=idx)
- result = expected.groupby("col", group_keys=False).apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = expected.groupby("col", group_keys=False).apply(lambda x: x)
tm.assert_frame_equal(result, expected)
@@ -1390,33 +1506,16 @@ def test_empty_df(method, op):
tm.assert_series_equal(result, expected)
-@pytest.mark.parametrize(
- "group_col",
- [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])],
-)
-def test_apply_inconsistent_output(group_col):
- # GH 34478
- df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]})
-
- result = df.groupby("group_col").value_col.apply(
- lambda x: x.value_counts().reindex(index=[1, 2, 3])
- )
- expected = Series(
- [np.nan, 3.0, np.nan],
- name="value_col",
- index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]),
- )
-
- tm.assert_series_equal(result, expected)
-
-
-def test_apply_array_output_multi_getitem():
- # GH 18930
- df = DataFrame(
- {"A": {"a": 1, "b": 2}, "B": {"a": 1, "b": 2}, "C": {"a": 1, "b": 2}}
- )
- result = df.groupby("A")[["B", "C"]].apply(lambda x: np.array([0]))
- expected = Series(
- [np.array([0])] * 2, index=Index([1, 2], name="A"), name=("B", "C")
- )
- tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("include_groups", [True, False])
+def test_include_groups(include_groups):
+ # GH#7155
+ df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+ gb = df.groupby("a")
+ warn = FutureWarning if include_groups else None
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(warn, match=msg):
+ result = gb.apply(lambda x: x.sum(), include_groups=include_groups)
+ expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a"))
+ if not include_groups:
+ expected = expected[["b"]]
+ tm.assert_frame_equal(result, expected)
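
A minimal sketch mirroring the new test above: the include_groups keyword excludes the grouping columns from the frame passed to apply, which also avoids the FutureWarning asserted throughout these tests; assumes a pandas version that provides include_groups:

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})

# include_groups=False keeps the grouping column "a" out of the frame the
# UDF receives, so no FutureWarning about grouping columns is emitted.
result = df.groupby("a").apply(lambda g: g.sum(), include_groups=False)
print(result)
#    b
# a
# 1  7
# 2  5
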
diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py
index 9bc07b584e9d1..09d5e06bf6ddd 100644
--- a/pandas/tests/groupby/test_apply_mutate.py
+++ b/pandas/tests/groupby/test_apply_mutate.py
@@ -13,10 +13,16 @@ def test_group_by_copy():
}
).set_index("name")
- grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group)
- grp_by_copy = df.groupby(["age"], group_keys=False).apply(
- lambda group: group.copy()
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grp_by_same_value = df.groupby(["age"], group_keys=False).apply(
+ lambda group: group
+ )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grp_by_copy = df.groupby(["age"], group_keys=False).apply(
+ lambda group: group.copy()
+ )
tm.assert_frame_equal(grp_by_same_value, grp_by_copy)
@@ -47,8 +53,11 @@ def f_no_copy(x):
x["rank"] = x.val.rank(method="min")
return x.groupby("cat2")["rank"].min()
- grpby_copy = df.groupby("cat1").apply(f_copy)
- grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grpby_copy = df.groupby("cat1").apply(f_copy)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
tm.assert_series_equal(grpby_copy, grpby_no_copy)
@@ -58,8 +67,11 @@ def test_no_mutate_but_looks_like():
# second does not, but should yield the same results
df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
- result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
- result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
tm.assert_series_equal(result1, result2)
@@ -73,7 +85,9 @@ def fn(x):
x.loc[x.index[-1], "col2"] = 0
return x.col2
- result = df.groupby(["col1"], as_index=False).apply(fn)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(["col1"], as_index=False).apply(fn)
expected = pd.Series(
[1, 2, 0, 4, 5, 0],
index=pd.MultiIndex.from_tuples(
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index f2d21c10f7a15..b11240c841420 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -124,7 +124,9 @@ def test_basic(): # TODO: split this test
def f(x):
return x.drop_duplicates("person_name").iloc[0]
- result = g.apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = g.apply(f)
expected = x.iloc[[0, 1]].copy()
expected.index = Index([1, 2], name="person_id")
expected["person_name"] = expected["person_name"].astype("object")
@@ -329,7 +331,9 @@ def test_apply(ordered):
# but for transform we should still get back the original index
idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
expected = Series(1, index=idx)
- result = grouped.apply(lambda x: 1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(lambda x: 1)
tm.assert_series_equal(result, expected)
@@ -2013,7 +2017,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde
df["a2"] = df["a"]
df = df.set_index(keys)
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
- op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
+ warn = FutureWarning if method == "apply" and index_kind == "range" else None
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(warn, match=msg):
+ op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
if (method == "transform" or not as_index) and index_kind == "range":
result = op_result["a"].cat.categories
else:
diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py
index 25a4fd2550df6..16d7fe61b90ad 100644
--- a/pandas/tests/groupby/test_counting.py
+++ b/pandas/tests/groupby/test_counting.py
@@ -289,7 +289,9 @@ def test_count():
for key in ["1st", "2nd", ["1st", "2nd"]]:
left = df.groupby(key).count()
- right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
tm.assert_frame_equal(left, right)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 0abf6428730ff..287310a18c7df 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -95,10 +95,12 @@ def test_builtins_apply(keys, f):
assert result.shape == (ngroups, 3), assert_msg
npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
- expected = gb.apply(npfunc)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = gb.apply(npfunc)
tm.assert_frame_equal(result, expected)
- with tm.assert_produces_warning(None):
+ with tm.assert_produces_warning(FutureWarning, match=msg):
expected2 = gb.apply(lambda x: npfunc(x))
tm.assert_frame_equal(result, expected2)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index be226b4466f98..fdd959f0e8754 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -150,7 +150,9 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
def max_value(group):
return group.loc[group["value"].idxmax()]
- applied = df.groupby("A").apply(max_value)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = df.dtypes
tm.assert_series_equal(result, expected)
@@ -171,7 +173,9 @@ def f_0(grp):
return grp.iloc[0]
expected = df.groupby("A").first()[["B"]]
- result = df.groupby("A").apply(f_0)[["B"]]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(f_0)[["B"]]
tm.assert_frame_equal(result, expected)
def f_1(grp):
@@ -179,9 +183,10 @@ def f_1(grp):
return None
return grp.iloc[0]
- result = df.groupby("A").apply(f_1)[["B"]]
- # Cast to avoid upcast when setting nan below
- e = expected.copy().astype("float64")
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(f_1)[["B"]]
+ e = expected.copy()
e.loc["Tiger"] = np.nan
tm.assert_frame_equal(result, e)
@@ -190,9 +195,10 @@ def f_2(grp):
return None
return grp.iloc[0]
- result = df.groupby("A").apply(f_2)[["B"]]
- # Explicit cast to float to avoid implicit cast when setting nan
- e = expected.copy().astype({"B": "float"})
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(f_2)[["B"]]
+ e = expected.copy()
e.loc["Pony"] = np.nan
tm.assert_frame_equal(result, e)
@@ -202,7 +208,9 @@ def f_3(grp):
return None
return grp.iloc[0]
- result = df.groupby("A").apply(f_3)[["C"]]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(f_3)[["C"]]
e = df.groupby("A").first()[["C"]]
e.loc["Pony"] = pd.NaT
tm.assert_frame_equal(result, e)
@@ -213,7 +221,9 @@ def f_4(grp):
return None
return grp.iloc[0].loc["C"]
- result = df.groupby("A").apply(f_4)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").apply(f_4)
e = df.groupby("A").first()["C"].copy()
e.loc["Pony"] = np.nan
e.name = None
@@ -392,8 +402,11 @@ def f3(x):
depr_msg = "The behavior of array concatenation with empty entries is deprecated"
# correct result
- result1 = df.groupby("a").apply(f1)
- result2 = df2.groupby("a").apply(f1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result1 = df.groupby("a").apply(f1)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result2 = df2.groupby("a").apply(f1)
tm.assert_frame_equal(result1, result2)
# should fail (not the same number of levels)
@@ -1322,11 +1335,15 @@ def summarize_random_name(df):
# inconsistent.
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])
- metrics = df.groupby("A").apply(summarize)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ metrics = df.groupby("A").apply(summarize)
assert metrics.columns.name is None
- metrics = df.groupby("A").apply(summarize, "metrics")
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ metrics = df.groupby("A").apply(summarize, "metrics")
assert metrics.columns.name == "metrics"
- metrics = df.groupby("A").apply(summarize_random_name)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ metrics = df.groupby("A").apply(summarize_random_name)
assert metrics.columns.name is None
@@ -1619,7 +1636,9 @@ def test_dont_clobber_name_column():
{"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
)
- result = df.groupby("key", group_keys=False).apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("key", group_keys=False).apply(lambda x: x)
tm.assert_frame_equal(result, df)
@@ -1693,7 +1712,9 @@ def freducex(x):
grouped = df.groupby(grouper, group_keys=False)
# make sure all these work
- grouped.apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ grouped.apply(f)
grouped.aggregate(freduce)
grouped.aggregate({"C": freduce, "D": freduce})
grouped.transform(f)
@@ -1714,7 +1735,9 @@ def f(group):
names.append(group.name)
return group.copy()
- df.groupby("a", sort=False, group_keys=False).apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ df.groupby("a", sort=False, group_keys=False).apply(f)
expected_names = [0, 1, 2]
assert names == expected_names
@@ -1920,7 +1943,9 @@ def test_groupby_preserves_sort(sort_column, group_column):
def test_sort(x):
tm.assert_frame_equal(x, x.sort_values(by=sort_column))
- g.apply(test_sort)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ g.apply(test_sort)
def test_pivot_table_values_key_error():
@@ -1928,7 +1953,7 @@ def test_pivot_table_values_key_error():
df = DataFrame(
{
"eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(),
- "thename": range(0, 20),
+ "thename": range(20),
}
)
@@ -2102,7 +2127,9 @@ def test_empty_groupby_apply_nonunique_columns():
df[3] = df[3].astype(np.int64)
df.columns = [0, 1, 2, 0]
gb = df.groupby(df[1], group_keys=False)
- res = gb.apply(lambda x: x)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ res = gb.apply(lambda x: x)
assert (res.dtypes == df.dtypes).all()
@@ -3189,6 +3216,14 @@ def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn):
tm.assert_equal(result, expected)
+def test_groupby_ngroup_with_nan():
+ # GH#50100
+ df = DataFrame({"a": Categorical([np.nan]), "b": [1]})
+ result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup()
+ expected = Series([0])
+ tm.assert_series_equal(result, expected)
+
+
def test_get_group_axis_1():
# GH#54858
df = DataFrame(
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 099e7bc3890d0..d82278c277d48 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -324,7 +324,9 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,
df = pd.DataFrame(data)
gb = df.groupby("groups", dropna=dropna)
- result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py
index 773c1e60e97af..601e67bbca5e3 100644
--- a/pandas/tests/groupby/test_groupby_subclass.py
+++ b/pandas/tests/groupby/test_groupby_subclass.py
@@ -63,7 +63,9 @@ def func(group):
assert hasattr(group, "testattr")
return group.testattr
- result = custom_df.groupby("c").apply(func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = custom_df.groupby("c").apply(func)
expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
tm.assert_series_equal(result, expected)
@@ -101,5 +103,7 @@ def test_groupby_resample_preserves_subclass(obj):
df = df.set_index("Date")
# Confirm groupby.resample() preserves dataframe type
- result = df.groupby("Buyer").resample("5D").sum()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("Buyer").resample("5D").sum()
assert isinstance(result, obj)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index e0793ada679c2..d05b60fd56b5f 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -224,7 +224,9 @@ def test_grouper_creation_bug(self):
result = g.sum()
tm.assert_frame_equal(result, expected)
- result = g.apply(lambda x: x.sum())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = g.apply(lambda x: x.sum())
expected["A"] = [0, 2, 4]
expected = expected.loc[:, ["A", "B"]]
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index c9fe011f7063b..1a26559ef4447 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -470,8 +470,12 @@ def test_timegrouper_apply_return_type_series(self):
def sumfunc_series(x):
return Series([x["value"].sum()], ("sum",))
- expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
- result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series)
tm.assert_frame_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
@@ -487,8 +491,11 @@ def test_timegrouper_apply_return_type_value(self):
def sumfunc_value(x):
return x.value.sum()
- expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
- result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
tm.assert_series_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
@@ -842,7 +849,7 @@ def test_grouper_period_index(self):
result = period_series.groupby(period_series.index.month).sum()
expected = Series(
- range(0, periods), index=Index(range(1, periods + 1), name=index.name)
+ range(periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
@@ -895,7 +902,9 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
assert gb._selected_obj._get_axis(gb.axis).nlevels == 1
# function that returns a Series
- res = gb.apply(lambda x: x["Quantity"] * 2)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ res = gb.apply(lambda x: x["Quantity"] * 2)
expected = DataFrame(
[[36, 6, 6, 10, 2]],
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index 7c50124e57e29..944dda8977882 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -327,9 +327,12 @@ def test_against_frame_and_seriesgroupby(
)
if frame:
# compare against apply with DataFrame value_counts
- expected = gp.apply(
- _frame_value_counts, ["gender", "education"], normalize, sort, ascending
- )
+ warn = FutureWarning if groupby == "column" else None
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(warn, match=msg):
+ expected = gp.apply(
+ _frame_value_counts, ["gender", "education"], normalize, sort, ascending
+ )
if as_index:
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 062dfe3931423..acb4b93ba1af3 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -636,7 +636,9 @@ def f(group):
return group[:1]
grouped = df.groupby("c")
- result = grouped.apply(f)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = grouped.apply(f)
assert result["d"].dtype == np.float64
@@ -790,7 +792,13 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
f = gb[["float", "float_missing"]].apply(targop)
expected = concat([f, i], axis=1)
else:
- expected = gb.apply(targop)
+ if op != "shift" or not isinstance(gb_target.get("by"), (str, list)):
+ warn = None
+ else:
+ warn = FutureWarning
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(warn, match=msg):
+ expected = gb.apply(targop)
expected = expected.sort_index(axis=1)
if op == "shift":
diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py
index 47efc43d5eae0..66163dad3deae 100644
--- a/pandas/tests/indexes/multi/test_partial_indexing.py
+++ b/pandas/tests/indexes/multi/test_partial_indexing.py
@@ -31,7 +31,7 @@ def df():
dr = date_range("2016-01-01", "2016-01-03", freq="12H")
abc = ["a", "b", "c"]
mi = MultiIndex.from_product([dr, abc])
- frame = DataFrame({"c1": range(0, 15)}, index=mi)
+ frame = DataFrame({"c1": range(15)}, index=mi)
return frame
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 5f137df281fa3..132704434829e 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -10,9 +10,6 @@
)
import pandas._testing as tm
-# aliases to make some tests easier to read
-RI = RangeIndex
-
class TestRangeIndex:
@pytest.fixture
@@ -507,25 +504,31 @@ def test_len_specialised(self, step):
@pytest.mark.parametrize(
"indices, expected",
[
- ([RI(1, 12, 5)], RI(1, 12, 5)),
- ([RI(0, 6, 4)], RI(0, 6, 4)),
- ([RI(1, 3), RI(3, 7)], RI(1, 7)),
- ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)),
- ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)),
- ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)),
- ([RI(-4, -8), RI(-8, -12)], RI(0, 0)),
- ([RI(-4, -8), RI(3, -4)], RI(0, 0)),
- ([RI(-4, -8), RI(3, 5)], RI(3, 5)),
- ([RI(-4, -2), RI(3, 5)], Index([-4, -3, 3, 4])),
- ([RI(-2), RI(3, 5)], RI(3, 5)),
- ([RI(2), RI(2)], Index([0, 1, 0, 1])),
- ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)),
- ([RI(2), RI(3, 5), RI(5, 8, 4)], Index([0, 1, 3, 4, 5])),
- ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)),
- ([RI(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])),
- ([RI(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])),
- ([RI(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])),
- ([RI(3, 1), Index(["a", None, 14])], Index(["a", None, 14])),
+ ([RangeIndex(1, 12, 5)], RangeIndex(1, 12, 5)),
+ ([RangeIndex(0, 6, 4)], RangeIndex(0, 6, 4)),
+ ([RangeIndex(1, 3), RangeIndex(3, 7)], RangeIndex(1, 7)),
+ ([RangeIndex(1, 5, 2), RangeIndex(5, 6)], RangeIndex(1, 6, 2)),
+ ([RangeIndex(1, 3, 2), RangeIndex(4, 7, 3)], RangeIndex(1, 7, 3)),
+ ([RangeIndex(-4, 3, 2), RangeIndex(4, 7, 2)], RangeIndex(-4, 7, 2)),
+ ([RangeIndex(-4, -8), RangeIndex(-8, -12)], RangeIndex(0, 0)),
+ ([RangeIndex(-4, -8), RangeIndex(3, -4)], RangeIndex(0, 0)),
+ ([RangeIndex(-4, -8), RangeIndex(3, 5)], RangeIndex(3, 5)),
+ ([RangeIndex(-4, -2), RangeIndex(3, 5)], Index([-4, -3, 3, 4])),
+ ([RangeIndex(-2), RangeIndex(3, 5)], RangeIndex(3, 5)),
+ ([RangeIndex(2), RangeIndex(2)], Index([0, 1, 0, 1])),
+ ([RangeIndex(2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], RangeIndex(0, 6)),
+ (
+ [RangeIndex(2), RangeIndex(3, 5), RangeIndex(5, 8, 4)],
+ Index([0, 1, 3, 4, 5]),
+ ),
+ (
+ [RangeIndex(-2, 2), RangeIndex(2, 5), RangeIndex(5, 8, 4)],
+ RangeIndex(-2, 6),
+ ),
+ ([RangeIndex(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])),
+ ([RangeIndex(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])),
+ ([RangeIndex(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])),
+ ([RangeIndex(3, 1), Index(["a", None, 14])], Index(["a", None, 14])),
],
)
def test_append(self, indices, expected):
@@ -567,7 +570,7 @@ def test_format_empty(self):
assert empty_idx.format(name=True) == [""]
@pytest.mark.parametrize(
- "RI",
+ "ri",
[
RangeIndex(0, -1, -1),
RangeIndex(0, 1, 1),
@@ -576,10 +579,10 @@ def test_format_empty(self):
RangeIndex(-3, -5, -2),
],
)
- def test_append_len_one(self, RI):
+ def test_append_len_one(self, ri):
# GH39401
- result = RI.append([])
- tm.assert_index_equal(result, RI, exact=True)
+ result = ri.append([])
+ tm.assert_index_equal(result, ri, exact=True)
@pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])])
def test_isin_range(self, base):
diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py
index 9d11827e2923e..b86e233110e88 100644
--- a/pandas/tests/indexing/multiindex/test_getitem.py
+++ b/pandas/tests/indexing/multiindex/test_getitem.py
@@ -148,7 +148,7 @@ def test_frame_getitem_simple_key_error(
def test_tuple_string_column_names():
# GH#50372
mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")])
- df = DataFrame([range(0, 4), range(1, 5), range(2, 6)], columns=mi)
+ df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi)
df["single_index"] = 0
df_flat = df.copy()
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index b45d197af332e..d3a6d4bf7cebf 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -16,7 +16,6 @@
Timestamp,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
@pytest.fixture
@@ -25,7 +24,9 @@ def df():
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B"
+ ),
)
@@ -35,13 +36,15 @@ def df2():
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
class TestCategoricalIndex:
def test_loc_scalar(self, df):
- dtype = CDT(list("cab"))
+ dtype = CategoricalDtype(list("cab"))
result = df.loc["a"]
bidx = Series(list("aaa"), name="B").astype(dtype)
assert bidx.dtype == dtype
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
index f36fdf0d36ea9..7353b5ef76ba3 100644
--- a/pandas/tests/indexing/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -1,4 +1,4 @@
-from string import ascii_letters as letters
+from string import ascii_letters
import numpy as np
import pytest
@@ -24,9 +24,9 @@
def random_text(nobs=100):
# Construct a DataFrame where each row is a random slice from 'letters'
- idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2))
+ idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2))
idxs.sort(axis=1)
- strings = [letters[x[0] : x[1]] for x in idxs]
+ strings = [ascii_letters[x[0] : x[1]] for x in idxs]
return DataFrame(strings, columns=["letters"])
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
index 73de2b068b699..6c3bf01cb1857 100644
--- a/pandas/tests/io/formats/test_info.py
+++ b/pandas/tests/io/formats/test_info.py
@@ -1,6 +1,6 @@
from io import StringIO
import re
-from string import ascii_uppercase as uppercase
+from string import ascii_uppercase
import sys
import textwrap
@@ -452,9 +452,9 @@ def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
- M = len(uppercase)
+ M = len(ascii_uppercase)
index = MultiIndex.from_product(
- [list(uppercase), date_range("20160101", periods=N)],
+ [list(ascii_uppercase), date_range("20160101", periods=N)],
names=["id", "date"],
)
df = DataFrame(
diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py
index 02827ee25042a..29dd704f6efa9 100644
--- a/pandas/tests/io/formats/test_series_info.py
+++ b/pandas/tests/io/formats/test_series_info.py
@@ -1,5 +1,5 @@
from io import StringIO
-from string import ascii_uppercase as uppercase
+from string import ascii_uppercase
import textwrap
import numpy as np
@@ -165,9 +165,9 @@ def test_info_memory_usage_bug_on_multiindex():
# GH 14308
# memory usage introspection should not materialize .values
N = 100
- M = len(uppercase)
+ M = len(ascii_uppercase)
index = MultiIndex.from_product(
- [list(uppercase), date_range("20160101", periods=N)],
+ [list(ascii_uppercase), date_range("20160101", periods=N)],
names=["id", "date"],
)
s = Series(np.random.default_rng(2).standard_normal(N * M), index=index)
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index ca3ce6ba34515..b3c2e67f7c318 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -2044,7 +2044,7 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
)
if orient == "values":
- expected.columns = list(range(0, 8))
+ expected.columns = list(range(8))
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 5bb7097770820..d5f8c5200c4a3 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -1033,7 +1033,7 @@ def test_decode_floating_point(self, sign, float_number):
def test_encode_big_set(self):
s = set()
- for x in range(0, 100000):
+ for x in range(100000):
s.add(x)
# Make sure no Exception is raised.
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index db3909c147ad3..55445e44b9366 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1012,7 +1012,7 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
def test_filter_row_groups(self, pa):
# https://github.com/pandas-dev/pandas/issues/26551
pytest.importorskip("pyarrow")
- df = pd.DataFrame({"a": list(range(0, 3))})
+ df = pd.DataFrame({"a": list(range(3))})
with tm.ensure_clean() as path:
df.to_parquet(path, engine=pa)
result = read_parquet(
@@ -1219,7 +1219,7 @@ def test_categorical(self, fp):
check_round_trip(df, fp)
def test_filter_row_groups(self, fp):
- d = {"a": list(range(0, 3))}
+ d = {"a": list(range(3))}
df = pd.DataFrame(d)
with tm.ensure_clean() as path:
df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 9ec0ba0b12a76..bbdb22955297e 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -413,6 +413,8 @@ def mysql_pymysql_engine(iris_path, types_data):
for entry in types_data:
entry.pop("DateColWithTz")
create_and_load_types(engine, types_data, "mysql")
+ if not insp.has_table("iris_view"):
+ create_and_load_iris_view(engine)
yield engine
with engine.connect() as conn:
with conn.begin():
@@ -422,7 +424,7 @@ def mysql_pymysql_engine(iris_path, types_data):
@pytest.fixture
-def mysql_pymysql_conn(mysql_pymysql_engine):
+def mysql_pymysql_conn(iris_path, mysql_pymysql_engine):
with mysql_pymysql_engine.connect() as conn:
yield conn
@@ -440,6 +442,8 @@ def postgresql_psycopg2_engine(iris_path, types_data):
create_and_load_iris(engine, iris_path, "postgresql")
if not insp.has_table("types"):
create_and_load_types(engine, types_data, "postgresql")
+ if not insp.has_table("iris_view"):
+ create_and_load_iris_view(engine)
yield engine
with engine.connect() as conn:
with conn.begin():
@@ -462,9 +466,20 @@ def sqlite_str():
@pytest.fixture
-def sqlite_engine(sqlite_str):
+def sqlite_engine(sqlite_str, iris_path, types_data):
sqlalchemy = pytest.importorskip("sqlalchemy")
engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool)
+
+ insp = sqlalchemy.inspect(engine)
+ if not insp.has_table("iris"):
+ create_and_load_iris(engine, iris_path, "sqlite")
+ if not insp.has_table("iris_view"):
+ create_and_load_iris_view(engine)
+ if not insp.has_table("types"):
+ for entry in types_data:
+ entry.pop("DateColWithTz")
+ create_and_load_types(engine, types_data, "sqlite")
+
yield engine
engine.dispose()
@@ -476,17 +491,25 @@ def sqlite_conn(sqlite_engine):
@pytest.fixture
-def sqlite_iris_str(sqlite_str, iris_path):
+def sqlite_iris_str(sqlite_str, iris_path, types_data):
sqlalchemy = pytest.importorskip("sqlalchemy")
engine = sqlalchemy.create_engine(sqlite_str)
- create_and_load_iris(engine, iris_path, "sqlite")
+
+ insp = sqlalchemy.inspect(engine)
+ if not insp.has_table("iris"):
+ create_and_load_iris(engine, iris_path, "sqlite")
+ if not insp.has_table("iris_view"):
+ create_and_load_iris_view(engine)
+ if not insp.has_table("types"):
+ for entry in types_data:
+ entry.pop("DateColWithTz")
+ create_and_load_types(engine, types_data, "sqlite")
engine.dispose()
return sqlite_str
@pytest.fixture
def sqlite_iris_engine(sqlite_engine, iris_path):
- create_and_load_iris(sqlite_engine, iris_path, "sqlite")
return sqlite_engine
@@ -499,6 +522,7 @@ def sqlite_iris_conn(sqlite_iris_engine):
@pytest.fixture
def sqlite_buildin():
with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn:
+ create_and_load_iris_view(closing_conn)
with closing_conn as conn:
yield conn
@@ -1097,6 +1121,7 @@ class PandasSQLTest:
"""
def load_iris_data(self, iris_path):
+ self.drop_view("iris_view", self.conn)
self.drop_table("iris", self.conn)
if isinstance(self.conn, sqlite3.Connection):
create_and_load_iris_sqlite3(self.conn, iris_path)
@@ -1221,470 +1246,695 @@ class DummyException(Exception):
# -- Testing the public API
-class _TestSQLApi(PandasSQLTest):
- """
- Base class to test the public API.
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_read_sql_view(conn, request):
+ conn = request.getfixturevalue(conn)
+ iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn)
+ check_iris_frame(iris_frame)
- From this two classes are derived to run these tests for both the
- sqlalchemy mode (`TestSQLApi`) and the fallback mode
- (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific
- tests for the different sql flavours are included in `_TestSQLAlchemy`.
- Notes:
- flavor can always be passed even in SQLAlchemy mode,
- should be correctly ignored.
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_read_sql_with_chunksize_no_result(conn, request):
+ conn = request.getfixturevalue(conn)
+ query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0'
+ with_batch = sql.read_sql_query(query, conn, chunksize=5)
+ without_batch = sql.read_sql_query(query, conn)
+ tm.assert_frame_equal(concat(with_batch), without_batch)
- we don't use drop_table because that isn't part of the public api
- """
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame1", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame1")
- flavor = "sqlite"
- mode: str
+ sql.to_sql(test_frame1, "test_frame1", conn)
+ assert sql.has_table("test_frame1", conn)
- @pytest.fixture(autouse=True)
- def setup_method(self, iris_path, types_data):
- self.conn = self.connect()
- self.load_iris_data(iris_path)
- self.load_types_data(types_data)
- self.load_test_data_and_sql()
- def load_test_data_and_sql(self):
- create_and_load_iris_view(self.conn)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_fail(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame2", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame2")
- def test_read_sql_view(self):
- iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn)
- check_iris_frame(iris_frame)
+ sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail")
+ assert sql.has_table("test_frame2", conn)
- def test_read_sql_with_chunksize_no_result(self):
- query = "SELECT * FROM iris_view WHERE SepalLength < 0.0"
- with_batch = sql.read_sql_query(query, self.conn, chunksize=5)
- without_batch = sql.read_sql_query(query, self.conn)
- tm.assert_frame_equal(concat(with_batch), without_batch)
+ msg = "Table 'test_frame2' already exists"
+ with pytest.raises(ValueError, match=msg):
+ sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail")
- def test_to_sql(self, test_frame1):
- sql.to_sql(test_frame1, "test_frame1", self.conn)
- assert sql.has_table("test_frame1", self.conn)
- def test_to_sql_fail(self, test_frame1):
- sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail")
- assert sql.has_table("test_frame2", self.conn)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_replace(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame3", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame3")
- msg = "Table 'test_frame2' already exists"
- with pytest.raises(ValueError, match=msg):
- sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail")
+ sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail")
+ # Add to table again
+ sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace")
+ assert sql.has_table("test_frame3", conn)
- def test_to_sql_replace(self, test_frame1):
- sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail")
- # Add to table again
- sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace")
- assert sql.has_table("test_frame3", self.conn)
+ num_entries = len(test_frame1)
+ num_rows = count_rows(conn, "test_frame3")
- num_entries = len(test_frame1)
- num_rows = count_rows(self.conn, "test_frame3")
+ assert num_rows == num_entries
- assert num_rows == num_entries
- def test_to_sql_append(self, test_frame1):
- assert sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") == 4
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_append(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame4", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame4")
- # Add to table again
- assert (
- sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") == 4
- )
- assert sql.has_table("test_frame4", self.conn)
+ assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4
- num_entries = 2 * len(test_frame1)
- num_rows = count_rows(self.conn, "test_frame4")
+ # Add to table again
+ assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4
+ assert sql.has_table("test_frame4", conn)
- assert num_rows == num_entries
+ num_entries = 2 * len(test_frame1)
+ num_rows = count_rows(conn, "test_frame4")
- def test_to_sql_type_mapping(self, test_frame3):
- sql.to_sql(test_frame3, "test_frame5", self.conn, index=False)
- result = sql.read_sql("SELECT * FROM test_frame5", self.conn)
+ assert num_rows == num_entries
- tm.assert_frame_equal(test_frame3, result)
- def test_to_sql_series(self):
- s = Series(np.arange(5, dtype="int64"), name="series")
- sql.to_sql(s, "test_series", self.conn, index=False)
- s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn)
- tm.assert_frame_equal(s.to_frame(), s2)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_type_mapping(conn, request, test_frame3):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame5", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame5")
- def test_roundtrip(self, test_frame1):
- sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn)
- result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn)
+ sql.to_sql(test_frame3, "test_frame5", conn, index=False)
+ result = sql.read_sql("SELECT * FROM test_frame5", conn)
- # HACK!
- result.index = test_frame1.index
- result.set_index("level_0", inplace=True)
- result.index.astype(int)
- result.index.name = None
- tm.assert_frame_equal(result, test_frame1)
+ tm.assert_frame_equal(test_frame3, result)
- def test_roundtrip_chunksize(self, test_frame1):
- sql.to_sql(
- test_frame1,
- "test_frame_roundtrip",
- con=self.conn,
- index=False,
- chunksize=2,
- )
- result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn)
- tm.assert_frame_equal(result, test_frame1)
- def test_execute_sql(self):
- # drop_sql = "DROP TABLE IF EXISTS test" # should already be done
- with sql.pandasSQL_builder(self.conn) as pandas_sql:
- iris_results = pandas_sql.execute("SELECT * FROM iris")
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_series(conn, request):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_series", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_series")
+
+ s = Series(np.arange(5, dtype="int64"), name="series")
+ sql.to_sql(s, "test_series", conn, index=False)
+ s2 = sql.read_sql_query("SELECT * FROM test_series", conn)
+ tm.assert_frame_equal(s.to_frame(), s2)
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_roundtrip(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame_roundtrip", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame_roundtrip")
+
+ sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn)
+ result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn)
+
+ # HACK!
+ result.index = test_frame1.index
+ result.set_index("level_0", inplace=True)
+ result.index.astype(int)
+ result.index.name = None
+ tm.assert_frame_equal(result, test_frame1)
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_roundtrip_chunksize(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_frame_roundtrip", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_frame_roundtrip")
+
+ sql.to_sql(
+ test_frame1,
+ "test_frame_roundtrip",
+ con=conn,
+ index=False,
+ chunksize=2,
+ )
+ result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn)
+ tm.assert_frame_equal(result, test_frame1)
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_execute_sql(conn, request):
+ # drop_sql = "DROP TABLE IF EXISTS test" # should already be done
+ conn = request.getfixturevalue(conn)
+ with sql.pandasSQL_builder(conn) as pandas_sql:
+ iris_results = pandas_sql.execute("SELECT * FROM iris")
row = iris_results.fetchone()
- tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"])
+ tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"])
- def test_date_parsing(self):
- # Test date parsing in read_sql
- # No Parsing
- df = sql.read_sql_query("SELECT * FROM types", self.conn)
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_date_parsing(conn, request):
+ conn_name = conn
+ if conn_name in {"sqlite_buildin", "sqlite_str"}:
+ pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture")
+
+ conn = request.getfixturevalue(conn)
+ # Test date parsing in read_sql
+ # No Parsing
+ df = sql.read_sql_query("SELECT * FROM types", conn)
+ if not ("mysql" in conn_name or "postgres" in conn_name):
assert not issubclass(df.DateCol.dtype.type, np.datetime64)
- df = sql.read_sql_query(
- "SELECT * FROM types", self.conn, parse_dates=["DateCol"]
- )
- assert issubclass(df.DateCol.dtype.type, np.datetime64)
- assert df.DateCol.tolist() == [
- Timestamp(2000, 1, 3, 0, 0, 0),
- Timestamp(2000, 1, 4, 0, 0, 0),
- ]
+ df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"])
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+ assert df.DateCol.tolist() == [
+ Timestamp(2000, 1, 3, 0, 0, 0),
+ Timestamp(2000, 1, 4, 0, 0, 0),
+ ]
- df = sql.read_sql_query(
- "SELECT * FROM types",
- self.conn,
- parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"},
- )
- assert issubclass(df.DateCol.dtype.type, np.datetime64)
- assert df.DateCol.tolist() == [
- Timestamp(2000, 1, 3, 0, 0, 0),
- Timestamp(2000, 1, 4, 0, 0, 0),
- ]
+ df = sql.read_sql_query(
+ "SELECT * FROM types",
+ conn,
+ parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"},
+ )
+ assert issubclass(df.DateCol.dtype.type, np.datetime64)
+ assert df.DateCol.tolist() == [
+ Timestamp(2000, 1, 3, 0, 0, 0),
+ Timestamp(2000, 1, 4, 0, 0, 0),
+ ]
- df = sql.read_sql_query(
- "SELECT * FROM types", self.conn, parse_dates=["IntDateCol"]
- )
- assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
- assert df.IntDateCol.tolist() == [
- Timestamp(1986, 12, 25, 0, 0, 0),
- Timestamp(2013, 1, 1, 0, 0, 0),
- ]
+ df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"])
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+ assert df.IntDateCol.tolist() == [
+ Timestamp(1986, 12, 25, 0, 0, 0),
+ Timestamp(2013, 1, 1, 0, 0, 0),
+ ]
- df = sql.read_sql_query(
- "SELECT * FROM types", self.conn, parse_dates={"IntDateCol": "s"}
- )
- assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
- assert df.IntDateCol.tolist() == [
- Timestamp(1986, 12, 25, 0, 0, 0),
- Timestamp(2013, 1, 1, 0, 0, 0),
- ]
+ df = sql.read_sql_query(
+ "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"}
+ )
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+ assert df.IntDateCol.tolist() == [
+ Timestamp(1986, 12, 25, 0, 0, 0),
+ Timestamp(2013, 1, 1, 0, 0, 0),
+ ]
- df = sql.read_sql_query(
+ df = sql.read_sql_query(
+ "SELECT * FROM types",
+ conn,
+ parse_dates={"IntDateOnlyCol": "%Y%m%d"},
+ )
+ assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64)
+ assert df.IntDateOnlyCol.tolist() == [
+ Timestamp("2010-10-10"),
+ Timestamp("2010-12-12"),
+ ]
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"])
+@pytest.mark.parametrize(
+ "read_sql, text, mode",
+ [
+ (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")),
+ (sql.read_sql, "types", ("sqlalchemy")),
+ (
+ sql.read_sql_query,
"SELECT * FROM types",
- self.conn,
- parse_dates={"IntDateOnlyCol": "%Y%m%d"},
- )
- assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64)
- assert df.IntDateOnlyCol.tolist() == [
- Timestamp("2010-10-10"),
- Timestamp("2010-12-12"),
- ]
+ ("sqlalchemy", "fallback"),
+ ),
+ (sql.read_sql_table, "types", ("sqlalchemy")),
+ ],
+)
+def test_api_custom_dateparsing_error(
+ conn, request, read_sql, text, mode, error, types_data_frame
+):
+ conn_name = conn
+ if conn_name in {"sqlite_buildin", "sqlite_str"}:
+ pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture")
- @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"])
- @pytest.mark.parametrize(
- "read_sql, text, mode",
- [
- (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")),
- (sql.read_sql, "types", ("sqlalchemy")),
- (
- sql.read_sql_query,
- "SELECT * FROM types",
- ("sqlalchemy", "fallback"),
- ),
- (sql.read_sql_table, "types", ("sqlalchemy")),
- ],
+ conn = request.getfixturevalue(conn)
+
+ expected = types_data_frame.astype({"DateCol": "datetime64[ns]"})
+
+ result = read_sql(
+ text,
+ con=conn,
+ parse_dates={
+ "DateCol": {"errors": error},
+ },
)
- def test_custom_dateparsing_error(
- self, read_sql, text, mode, error, types_data_frame
- ):
- if self.mode in mode:
- expected = types_data_frame.astype({"DateCol": "datetime64[ns]"})
+ if "postgres" in conn_name:
+ # TODO: clean up types_data_frame fixture
+ result = result.drop(columns=["DateColWithTz"])
+ result["BoolCol"] = result["BoolCol"].astype(int)
+ result["BoolColWithNull"] = result["BoolColWithNull"].astype(float)
- result = read_sql(
- text,
- con=self.conn,
- parse_dates={
- "DateCol": {"errors": error},
- },
- )
+ tm.assert_frame_equal(result, expected)
- tm.assert_frame_equal(result, expected)
- def test_date_and_index(self):
- # Test case where same column appears in parse_date and index_col
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_date_and_index(conn, request):
+ # Test case where same column appears in parse_date and index_col
+ conn_name = conn
+ if conn_name in {"sqlite_buildin", "sqlite_str"}:
+ pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture")
- df = sql.read_sql_query(
- "SELECT * FROM types",
- self.conn,
- index_col="DateCol",
- parse_dates=["DateCol", "IntDateCol"],
- )
+ conn = request.getfixturevalue(conn)
+ df = sql.read_sql_query(
+ "SELECT * FROM types",
+ conn,
+ index_col="DateCol",
+ parse_dates=["DateCol", "IntDateCol"],
+ )
- assert issubclass(df.index.dtype.type, np.datetime64)
- assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+ assert issubclass(df.index.dtype.type, np.datetime64)
+ assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
- def test_timedelta(self):
- # see #6921
- df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame()
- with tm.assert_produces_warning(UserWarning):
- result_count = df.to_sql(name="test_timedelta", con=self.conn)
- assert result_count == 2
- result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn)
- tm.assert_series_equal(result["foo"], df["foo"].view("int64"))
-
- def test_complex_raises(self):
- df = DataFrame({"a": [1 + 1j, 2j]})
- msg = "Complex datatypes not supported"
- with pytest.raises(ValueError, match=msg):
- assert df.to_sql("test_complex", con=self.conn) is None
- @pytest.mark.parametrize(
- "index_name,index_label,expected",
- [
- # no index name, defaults to 'index'
- (None, None, "index"),
- # specifying index_label
- (None, "other_label", "other_label"),
- # using the index name
- ("index_name", None, "index_name"),
- # has index name, but specifying index_label
- ("index_name", "other_label", "other_label"),
- # index name is integer
- (0, None, "0"),
- # index name is None but index label is integer
- (None, 0, "0"),
- ],
- )
- def test_to_sql_index_label(self, index_name, index_label, expected):
- temp_frame = DataFrame({"col1": range(4)})
- temp_frame.index.name = index_name
- query = "SELECT * FROM test_index_label"
- sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label)
- frame = sql.read_sql_query(query, self.conn)
- assert frame.columns[0] == expected
-
- def test_to_sql_index_label_multiindex(self):
- expected_row_count = 4
- temp_frame = DataFrame(
- {"col1": range(4)},
- index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]),
- )
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_timedelta(conn, request):
+ # see #6921
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_timedelta", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_timedelta")
- # no index name, defaults to 'level_0' and 'level_1'
- result = sql.to_sql(temp_frame, "test_index_label", self.conn)
- assert result == expected_row_count
- frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn)
- assert frame.columns[0] == "level_0"
- assert frame.columns[1] == "level_1"
+ df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame()
+ with tm.assert_produces_warning(UserWarning):
+ result_count = df.to_sql(name="test_timedelta", con=conn)
+ assert result_count == 2
+ result = sql.read_sql_query("SELECT * FROM test_timedelta", conn)
+ tm.assert_series_equal(result["foo"], df["foo"].view("int64"))
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_complex_raises(conn, request):
+ conn = request.getfixturevalue(conn)
+ df = DataFrame({"a": [1 + 1j, 2j]})
+ msg = "Complex datatypes not supported"
+ with pytest.raises(ValueError, match=msg):
+ assert df.to_sql("test_complex", con=conn) is None
- # specifying index_label
- result = sql.to_sql(
- temp_frame,
- "test_index_label",
- self.conn,
- if_exists="replace",
- index_label=["A", "B"],
- )
- assert result == expected_row_count
- frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn)
- assert frame.columns[:2].tolist() == ["A", "B"]
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize(
+ "index_name,index_label,expected",
+ [
+ # no index name, defaults to 'index'
+ (None, None, "index"),
+ # specifying index_label
+ (None, "other_label", "other_label"),
# using the index name
- temp_frame.index.names = ["A", "B"]
- result = sql.to_sql(
- temp_frame, "test_index_label", self.conn, if_exists="replace"
+ ("index_name", None, "index_name"),
+ # has index name, but specifying index_label
+ ("index_name", "other_label", "other_label"),
+ # index name is integer
+ (0, None, "0"),
+ # index name is None but index label is integer
+ (None, 0, "0"),
+ ],
+)
+def test_api_to_sql_index_label(conn, request, index_name, index_label, expected):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_index_label", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_index_label")
+
+ temp_frame = DataFrame({"col1": range(4)})
+ temp_frame.index.name = index_name
+ query = "SELECT * FROM test_index_label"
+ sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label)
+ frame = sql.read_sql_query(query, conn)
+ assert frame.columns[0] == expected
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_index_label_multiindex(conn, request):
+ conn_name = conn
+ if "mysql" in conn_name:
+ request.node.add_marker(
+ pytest.mark.xfail(reason="MySQL can fail using TEXT without length as key")
)
- assert result == expected_row_count
- frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn)
- assert frame.columns[:2].tolist() == ["A", "B"]
- # has index name, but specifying index_label
- result = sql.to_sql(
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_index_label", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_index_label")
+
+ expected_row_count = 4
+ temp_frame = DataFrame(
+ {"col1": range(4)},
+ index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]),
+ )
+
+ # no index name, defaults to 'level_0' and 'level_1'
+ result = sql.to_sql(temp_frame, "test_index_label", conn)
+ assert result == expected_row_count
+ frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+ assert frame.columns[0] == "level_0"
+ assert frame.columns[1] == "level_1"
+
+ # specifying index_label
+ result = sql.to_sql(
+ temp_frame,
+ "test_index_label",
+ conn,
+ if_exists="replace",
+ index_label=["A", "B"],
+ )
+ assert result == expected_row_count
+ frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+ assert frame.columns[:2].tolist() == ["A", "B"]
+
+ # using the index name
+ temp_frame.index.names = ["A", "B"]
+ result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace")
+ assert result == expected_row_count
+ frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+ assert frame.columns[:2].tolist() == ["A", "B"]
+
+ # has index name, but specifying index_label
+ result = sql.to_sql(
+ temp_frame,
+ "test_index_label",
+ conn,
+ if_exists="replace",
+ index_label=["C", "D"],
+ )
+ assert result == expected_row_count
+ frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+ assert frame.columns[:2].tolist() == ["C", "D"]
+
+ msg = "Length of 'index_label' should match number of levels, which is 2"
+ with pytest.raises(ValueError, match=msg):
+ sql.to_sql(
temp_frame,
"test_index_label",
- self.conn,
+ conn,
if_exists="replace",
- index_label=["C", "D"],
+ index_label="C",
)
- assert result == expected_row_count
- frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn)
- assert frame.columns[:2].tolist() == ["C", "D"]
- msg = "Length of 'index_label' should match number of levels, which is 2"
- with pytest.raises(ValueError, match=msg):
- sql.to_sql(
- temp_frame,
- "test_index_label",
- self.conn,
- if_exists="replace",
- index_label="C",
- )
- def test_multiindex_roundtrip(self):
- df = DataFrame.from_records(
- [(1, 2.1, "line1"), (2, 1.5, "line2")],
- columns=["A", "B", "C"],
- index=["A", "B"],
- )
-
- df.to_sql(name="test_multiindex_roundtrip", con=self.conn)
- result = sql.read_sql_query(
- "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"]
- )
- tm.assert_frame_equal(df, result, check_index_type=True)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_multiindex_roundtrip(conn, request):
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_multiindex_roundtrip", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_multiindex_roundtrip")
+
+ df = DataFrame.from_records(
+ [(1, 2.1, "line1"), (2, 1.5, "line2")],
+ columns=["A", "B", "C"],
+ index=["A", "B"],
+ )
- @pytest.mark.parametrize(
- "dtype",
- [
- None,
- int,
- float,
- {"A": int, "B": float},
- ],
+ df.to_sql(name="test_multiindex_roundtrip", con=conn)
+ result = sql.read_sql_query(
+ "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"]
)
- def test_dtype_argument(self, dtype):
- # GH10285 Add dtype argument to read_sql_query
- df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"])
- assert df.to_sql(name="test_dtype_argument", con=self.conn) == 2
-
- expected = df.astype(dtype)
- result = sql.read_sql_query(
- "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype
- )
+ tm.assert_frame_equal(df, result, check_index_type=True)
- tm.assert_frame_equal(result, expected)
- def test_integer_col_names(self):
- df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])
- sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace")
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize(
+ "dtype",
+ [
+ None,
+ int,
+ float,
+ {"A": int, "B": float},
+ ],
+)
+def test_api_dtype_argument(conn, request, dtype):
+ # GH10285 Add dtype argument to read_sql_query
+ conn_name = conn
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_dtype_argument", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_dtype_argument")
- def test_get_schema(self, test_frame1):
- create_sql = sql.get_schema(test_frame1, "test", con=self.conn)
- assert "CREATE" in create_sql
+ df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"])
+ assert df.to_sql(name="test_dtype_argument", con=conn) == 2
- def test_get_schema_with_schema(self, test_frame1):
- # GH28486
- create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi")
- assert "CREATE TABLE pypi." in create_sql
+ expected = df.astype(dtype)
- def test_get_schema_dtypes(self):
- if self.mode == "sqlalchemy":
- from sqlalchemy import Integer
+ if "postgres" in conn_name:
+ query = 'SELECT "A", "B" FROM test_dtype_argument'
+ else:
+ query = "SELECT A, B FROM test_dtype_argument"
+ result = sql.read_sql_query(query, con=conn, dtype=dtype)
- dtype = Integer
- else:
- dtype = "INTEGER"
+ tm.assert_frame_equal(result, expected)
- float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]})
- create_sql = sql.get_schema(
- float_frame, "test", con=self.conn, dtype={"b": dtype}
- )
- assert "CREATE" in create_sql
- assert "INTEGER" in create_sql
- def test_get_schema_keys(self, test_frame1):
- frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]})
- create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1")
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_integer_col_names(conn, request):
+ conn = request.getfixturevalue(conn)
+ df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])
+ sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace")
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema(conn, request, test_frame1):
+ conn = request.getfixturevalue(conn)
+ create_sql = sql.get_schema(test_frame1, "test", con=conn)
+ assert "CREATE" in create_sql
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_with_schema(conn, request, test_frame1):
+ # GH28486
+ conn = request.getfixturevalue(conn)
+ create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi")
+ assert "CREATE TABLE pypi." in create_sql
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_dtypes(conn, request):
+ conn_name = conn
+ conn = request.getfixturevalue(conn)
+ float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]})
+
+ if conn_name == "sqlite_buildin":
+ dtype = "INTEGER"
+ else:
+ from sqlalchemy import Integer
+
+ dtype = Integer
+ create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype})
+ assert "CREATE" in create_sql
+ assert "INTEGER" in create_sql
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_keys(conn, request, test_frame1):
+ conn_name = conn
+ conn = request.getfixturevalue(conn)
+ frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]})
+ create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1")
+
+ if "mysql" in conn_name:
+ constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)"
+ else:
constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")'
- assert constraint_sentence in create_sql
+ assert constraint_sentence in create_sql
- # multiple columns as key (GH10385)
- create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"])
+ # multiple columns as key (GH10385)
+ create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"])
+ if "mysql" in conn_name:
+ constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)"
+ else:
constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")'
- assert constraint_sentence in create_sql
+ assert constraint_sentence in create_sql
- def test_chunksize_read(self):
- df = DataFrame(
- np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde")
- )
- df.to_sql(name="test_chunksize", con=self.conn, index=False)
- # reading the query in one time
- res1 = sql.read_sql_query("select * from test_chunksize", self.conn)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_chunksize_read(conn, request):
+ conn_name = conn
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_chunksize", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_chunksize")
+
+ df = DataFrame(
+ np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde")
+ )
+ df.to_sql(name="test_chunksize", con=conn, index=False)
+
+    # reading the query all at once
+ res1 = sql.read_sql_query("select * from test_chunksize", conn)
+
+ # reading the query in chunks with read_sql_query
+ res2 = DataFrame()
+ i = 0
+ sizes = [5, 5, 5, 5, 2]
+
+ for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5):
+ res2 = concat([res2, chunk], ignore_index=True)
+ assert len(chunk) == sizes[i]
+ i += 1
+
+ tm.assert_frame_equal(res1, res2)
- # reading the query in chunks with read_sql_query
- res2 = DataFrame()
+    # reading the table in chunks with read_sql_table
+ if conn_name == "sqlite_buildin":
+ with pytest.raises(NotImplementedError, match=""):
+ sql.read_sql_table("test_chunksize", conn, chunksize=5)
+ else:
+ res3 = DataFrame()
i = 0
sizes = [5, 5, 5, 5, 2]
- for chunk in sql.read_sql_query(
- "select * from test_chunksize", self.conn, chunksize=5
- ):
- res2 = concat([res2, chunk], ignore_index=True)
+ for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5):
+ res3 = concat([res3, chunk], ignore_index=True)
assert len(chunk) == sizes[i]
i += 1
- tm.assert_frame_equal(res1, res2)
+ tm.assert_frame_equal(res1, res3)
- # reading the query in chunks with read_sql_query
- if self.mode == "sqlalchemy":
- res3 = DataFrame()
- i = 0
- sizes = [5, 5, 5, 5, 2]
- for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5):
- res3 = concat([res3, chunk], ignore_index=True)
- assert len(chunk) == sizes[i]
- i += 1
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_categorical(conn, request):
+ # GH8624
+ # test that categorical gets written correctly as dense column
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_categorical", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_categorical")
- tm.assert_frame_equal(res1, res3)
+ df = DataFrame(
+ {
+ "person_id": [1, 2, 3],
+ "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"],
+ }
+ )
+ df2 = df.copy()
+ df2["person_name"] = df2["person_name"].astype("category")
- def test_categorical(self):
- # GH8624
- # test that categorical gets written correctly as dense column
- df = DataFrame(
- {
- "person_id": [1, 2, 3],
- "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"],
- }
- )
- df2 = df.copy()
- df2["person_name"] = df2["person_name"].astype("category")
+ df2.to_sql(name="test_categorical", con=conn, index=False)
+ res = sql.read_sql_query("SELECT * FROM test_categorical", conn)
- df2.to_sql(name="test_categorical", con=self.conn, index=False)
- res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn)
+ tm.assert_frame_equal(res, df)
- tm.assert_frame_equal(res, df)
- def test_unicode_column_name(self):
- # GH 11431
- df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"])
- df.to_sql(name="test_unicode", con=self.conn, index=False)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_unicode_column_name(conn, request):
+ # GH 11431
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_unicode", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_unicode")
- def test_escaped_table_name(self):
- # GH 13206
- df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]})
- df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=self.conn, index=False)
+ df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"])
+ df.to_sql(name="test_unicode", con=conn, index=False)
- res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn)
- tm.assert_frame_equal(res, df)
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_escaped_table_name(conn, request):
+ # GH 13206
+ conn_name = conn
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("d1187b08-4943-4c8d-a7f6", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6")
- def test_read_sql_duplicate_columns(self):
- # GH#53117
- df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1})
- df.to_sql(name="test_table", con=self.conn, index=False)
+ df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]})
+ df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False)
- result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn)
- expected = DataFrame(
- [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]],
- columns=["a", "b", "a", "c"],
- )
- tm.assert_frame_equal(result, expected)
+ if "postgres" in conn_name:
+ query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"'
+ else:
+ query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`"
+ res = sql.read_sql_query(query, conn)
+
+ tm.assert_frame_equal(res, df)
+
+
+@pytest.mark.db
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_read_sql_duplicate_columns(conn, request):
+ # GH#53117
+ conn = request.getfixturevalue(conn)
+ if sql.has_table("test_table", conn):
+ with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+ pandasSQL.drop_table("test_table")
+
+ df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1})
+ df.to_sql(name="test_table", con=conn, index=False)
+
+ result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn)
+ expected = DataFrame(
+ [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]],
+ columns=["a", "b", "a", "c"],
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+class _TestSQLApi(PandasSQLTest):
+ """
+ Base class to test the public API.
+
+ From this two classes are derived to run these tests for both the
+ sqlalchemy mode (`TestSQLApi`) and the fallback mode
+ (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific
+ tests for the different sql flavours are included in `_TestSQLAlchemy`.
+
+ Notes:
+ flavor can always be passed even in SQLAlchemy mode,
+ should be correctly ignored.
+
+ we don't use drop_table because that isn't part of the public api
+
+ """
+
+ flavor = "sqlite"
+ mode: str
+
+ @pytest.fixture(autouse=True)
+ def setup_method(self, iris_path, types_data):
+ self.conn = self.connect()
+ self.load_iris_data(iris_path)
+ self.load_types_data(types_data)
+ self.load_test_data_and_sql()
+
+ def load_test_data_and_sql(self):
+ create_and_load_iris_view(self.conn)
@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed")
@@ -2962,6 +3212,13 @@ def test_read_sql_string_inference(self):
tm.assert_frame_equal(result, expected)
+ def test_roundtripping_datetimes(self):
+ # GH#54877
+ df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
+ df.to_sql("test", self.conn, if_exists="replace", index=False)
+ result = pd.read_sql("select * from test", self.conn).iloc[0, 0]
+ assert result == "2020-12-31 12:00:00.000000"
+
@pytest.mark.db
class TestMySQLAlchemy(_TestSQLAlchemy):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 7459aa1df8f3e..cd504616b6c5d 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -798,7 +798,7 @@ def test_missing_value_generator(self):
expected_values.insert(0, ".")
for t in types:
offset = valid_range[t][1]
- for i in range(0, 27):
+ for i in range(27):
val = StataMissingValue(offset + 1 + i)
assert val.string == expected_values[i]
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 66ecb93385a87..a955fa0b096f0 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -1077,8 +1077,12 @@ def test_resample_segfault(unit):
all_wins_and_wagers, columns=("ID", "timestamp", "A", "B")
).set_index("timestamp")
df.index = df.index.as_unit(unit)
- result = df.groupby("ID").resample("5min").sum()
- expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("ID").resample("5min").sum()
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
tm.assert_frame_equal(result, expected)
@@ -1097,7 +1101,9 @@ def test_resample_dtype_preservation(unit):
result = df.resample("1D").ffill()
assert result.val.dtype == np.int32
- result = df.groupby("group").resample("1D").ffill()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("group").resample("1D").ffill()
assert result.val.dtype == np.int32
@@ -1823,8 +1829,12 @@ def f(data, add_arg):
# Testing dataframe
df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
- result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
- expected = df.groupby("A").resample("D").mean().multiply(multiplier)
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby("A").resample("D").mean().multiply(multiplier)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 1b20a7b99d1d7..f331851596317 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -77,7 +77,9 @@ def test_groupby_resample_api():
)
index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"])
expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index)
- result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]]
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]]
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 6f4f1154907dc..d47a8132f26bb 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -68,8 +68,12 @@ def test_deferred_with_groupby():
def f_0(x):
return x.set_index("date").resample("D").asfreq()
- expected = df.groupby("id").apply(f_0)
- result = df.set_index("date").groupby("id").resample("D").asfreq()
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby("id").apply(f_0)
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.set_index("date").groupby("id").resample("D").asfreq()
tm.assert_frame_equal(result, expected)
df = DataFrame(
@@ -83,8 +87,12 @@ def f_0(x):
def f_1(x):
return x.resample("1D").ffill()
- expected = df.groupby("group").apply(f_1)
- result = df.groupby("group").resample("1D").ffill()
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = df.groupby("group").apply(f_1)
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("group").resample("1D").ffill()
tm.assert_frame_equal(result, expected)
@@ -99,7 +107,9 @@ def test_getitem(test_frame):
result = g.B.resample("2s").mean()
tm.assert_series_equal(result, expected)
- result = g.resample("2s").mean().B
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = g.resample("2s").mean().B
tm.assert_series_equal(result, expected)
@@ -230,8 +240,12 @@ def test_methods(f, test_frame):
g = test_frame.groupby("A")
r = g.resample("2s")
- result = getattr(r, f)()
- expected = g.apply(lambda x: getattr(x.resample("2s"), f)())
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = getattr(r, f)()
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.resample("2s"), f)())
tm.assert_equal(result, expected)
@@ -248,8 +262,12 @@ def test_methods_nunique(test_frame):
def test_methods_std_var(f, test_frame):
g = test_frame.groupby("A")
r = g.resample("2s")
- result = getattr(r, f)(ddof=1)
- expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1))
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = getattr(r, f)(ddof=1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1))
tm.assert_frame_equal(result, expected)
@@ -258,18 +276,24 @@ def test_apply(test_frame):
r = g.resample("2s")
# reduction
- expected = g.resample("2s").sum()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.resample("2s").sum()
def f_0(x):
return x.resample("2s").sum()
- result = r.apply(f_0)
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = r.apply(f_0)
tm.assert_frame_equal(result, expected)
def f_1(x):
return x.resample("2s").apply(lambda y: y.sum())
- result = g.apply(f_1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = g.apply(f_1)
# y.sum() results in int64 instead of int32 on 32-bit architectures
expected = expected.astype("int64")
tm.assert_frame_equal(result, expected)
@@ -337,7 +361,9 @@ def test_resample_groupby_with_label():
# GH 13235
index = date_range("2000-01-01", freq="2D", periods=5)
df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]})
- result = df.groupby("col0").resample("1W", label="left").sum()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("col0").resample("1W", label="left").sum()
mi = [
np.array([0, 0, 1, 2], dtype=np.int64),
@@ -357,7 +383,9 @@ def test_consistency_with_window(test_frame):
# consistent return values with window
df = test_frame
expected = Index([1, 2, 3], name="A")
- result = df.groupby("A").resample("2s").mean()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").resample("2s").mean()
assert result.index.nlevels == 2
tm.assert_index_equal(result.index.levels[0], expected)
@@ -455,7 +483,9 @@ def test_resample_groupby_agg_listlike():
def test_empty(keys):
# GH 26411
df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
- result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
expected = (
DataFrame(columns=["a", "b"])
.set_index(keys, drop=False)
@@ -478,7 +508,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
if consolidate:
df = df._consolidate()
- result = df.groupby(["key"]).resample("W", on="date").min()
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.groupby(["key"]).resample("W", on="date").min()
idx = pd.MultiIndex.from_arrays(
[
["A"] * 3 + ["B"] * 3,
@@ -530,7 +561,9 @@ def test_resample_no_index(keys):
df = DataFrame([], columns=["a", "b", "date"])
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
- result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False)
expected["date"] = pd.to_datetime(expected["date"])
expected = expected.set_index("date", append=True, drop=True)
@@ -577,7 +610,9 @@ def test_groupby_resample_size_all_index_same():
{"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)},
index=date_range("31/12/2000 18:00", freq="H", periods=12),
)
- result = df.groupby("A").resample("D").size()
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = df.groupby("A").resample("D").size()
expected = Series(
3,
index=pd.MultiIndex.from_tuples(
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
index d7fdbc4fe5f08..8b1eab552c97d 100644
--- a/pandas/tests/resample/test_time_grouper.py
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -323,12 +323,14 @@ def test_groupby_resample_interpolate():
df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
- result = (
- df.set_index("week_starting")
- .groupby("volume")
- .resample("1D")
- .interpolate(method="linear")
- )
+ msg = "DataFrameGroupBy.resample operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = (
+ df.set_index("week_starting")
+ .groupby("volume")
+ .resample("1D")
+ .interpolate(method="linear")
+ )
expected_ind = pd.MultiIndex.from_tuples(
[
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 3efcd930af581..5dde863f246d1 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -858,3 +858,12 @@ def test_concat_multiindex_with_category():
)
expected = expected.set_index(["c1", "c2"])
tm.assert_frame_equal(result, expected)
+
+
+def test_concat_ea_upcast():
+ # GH#54848
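+ # concatenating a string-dtype frame with an Int64 frame should not raise;
+ # the mismatched extension dtypes fall back to object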
+ df1 = DataFrame(["a"], dtype="string")
+ df2 = DataFrame([1], dtype="Int64")
+ result = concat([df1, df2])
+ expected = DataFrame(["a", 1], index=[0, 0])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 9cada6964c094..d889ae2e4806b 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -26,7 +26,6 @@
TimedeltaIndex,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import (
MergeError,
@@ -582,11 +581,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
df_empty = df[:0]
expected = DataFrame(
{
- "value_x": Series(dtype=df.dtypes["value"]),
"key": Series(dtype=df.dtypes["key"]),
+ "value_x": Series(dtype=df.dtypes["value"]),
"value_y": Series(dtype=df.dtypes["value"]),
},
- columns=["value_x", "key", "value_y"],
+ columns=["key", "value_x", "value_y"],
)
actual = df_empty.merge(df, on="key")
tm.assert_frame_equal(actual, expected)
@@ -889,13 +888,13 @@ def test_merge_on_datetime64tz_empty(self):
result = left.merge(right, on="date")
expected = DataFrame(
{
+ "date": Series(dtype=dtz),
"value_x": Series(dtype=float),
"date2_x": Series(dtype=dtz),
- "date": Series(dtype=dtz),
"value_y": Series(dtype=float),
"date2_y": Series(dtype=dtz),
},
- columns=["value_x", "date2_x", "date", "value_y", "date2_y"],
+ columns=["date", "value_x", "date2_x", "value_y", "date2_y"],
)
tm.assert_frame_equal(result, expected)
@@ -1827,11 +1826,9 @@ def test_merge_empty(self, left_empty, how, exp):
if exp == "left":
expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
elif exp == "right":
- expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
+ expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]})
elif exp == "empty":
expected = DataFrame(columns=["A", "B", "C"], dtype="int64")
- if left_empty:
- expected = expected[["B", "A", "C"]]
elif exp == "empty_cross":
expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64")
@@ -1844,7 +1841,7 @@ def left():
{
"X": Series(
np.random.default_rng(2).choice(["foo", "bar"], size=(10,))
- ).astype(CDT(["foo", "bar"])),
+ ).astype(CategoricalDtype(["foo", "bar"])),
"Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)),
}
)
@@ -1853,7 +1850,10 @@ def left():
@pytest.fixture
def right():
return DataFrame(
- {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]}
+ {
+ "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])),
+ "Z": [1, 2],
+ }
)
@@ -2004,8 +2004,8 @@ def test_other_columns(self, left, right):
"change",
[
lambda x: x,
- lambda x: x.astype(CDT(["foo", "bar", "bah"])),
- lambda x: x.astype(CDT(ordered=True)),
+ lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])),
+ lambda x: x.astype(CategoricalDtype(ordered=True)),
],
)
def test_dtype_on_merged_different(self, change, join_type, left, right):
@@ -2112,11 +2112,13 @@ def test_merging_with_bool_or_int_cateorical_column(
# GH 17187
# merging with a boolean/int categorical column
df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column})
- df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered))
+ df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered))
df2 = DataFrame({"id": [2, 4], "num": [1, 9]})
result = df1.merge(df2)
expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]})
- expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered))
+ expected["cat"] = expected["cat"].astype(
+ CategoricalDtype(categories, ordered=ordered)
+ )
tm.assert_frame_equal(expected, result)
def test_merge_on_int_array(self):
@@ -2481,14 +2483,12 @@ def test_merge_multiindex_columns():
result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf)))
# Constructing the expected results
- expected_labels = [letter + l_suf for letter in letters] + [
- letter + r_suf for letter in letters
- ]
- expected_index = MultiIndex.from_product(
- [expected_labels, numbers], names=["outer", "inner"]
- )
+ tuples = [(letter + l_suf, num) for letter in letters for num in numbers]
+ tuples += [("id", "")]
+ tuples += [(letter + r_suf, num) for letter in letters for num in numbers]
+
+ expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"])
expected = DataFrame(columns=expected_index)
- expected["id"] = ""
tm.assert_frame_equal(result, expected)
@@ -2949,13 +2949,36 @@ def test_merge_ea_int_and_float_numpy():
tm.assert_frame_equal(result, expected.astype("float64"))
-def test_merge_arrow_string_index():
+def test_merge_arrow_string_index(any_string_dtype):
# GH#54894
pytest.importorskip("pyarrow")
- left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]")
- right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]"))
+ left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype)
+ right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype))
result = left.merge(right, left_on="a", right_index=True, how="left")
expected = DataFrame(
- {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]}
+ {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]}
)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("left_empty", [True, False])
+@pytest.mark.parametrize("right_empty", [True, False])
+def test_merge_empty_frames_column_order(left_empty, right_empty):
+ # GH 51929
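+ # whichever side is empty, the merged frame should keep the column order
+ # A, B, C, D (left columns first, then the new right-only columns)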
+ df1 = DataFrame(1, index=[0], columns=["A", "B"])
+ df2 = DataFrame(1, index=[0], columns=["A", "C", "D"])
+
+ if left_empty:
+ df1 = df1.iloc[:0]
+ if right_empty:
+ df2 = df2.iloc[:0]
+
+ result = merge(df1, df2, on=["A"], how="outer")
+ expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"])
+ if left_empty and right_empty:
+ expected = expected.iloc[:0]
+ elif left_empty:
+ expected.loc[:, "B"] = np.nan
+ elif right_empty:
+ expected.loc[:, ["C", "D"]] = np.nan
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
index b2a6ac49fdff2..3a284f7732ac1 100644
--- a/pandas/tests/reshape/test_cut.py
+++ b/pandas/tests/reshape/test_cut.py
@@ -21,7 +21,7 @@
to_datetime,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
import pandas.core.reshape.tile as tmod
@@ -359,7 +359,7 @@ def test_cut_return_intervals():
IntervalIndex.from_breaks(exp_bins, closed="right").take(
[0, 0, 0, 1, 1, 1, 2, 2, 2]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -370,7 +370,7 @@ def test_series_ret_bins():
expected = Series(
IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -445,7 +445,7 @@ def test_datetime_bin(conv):
Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
bins = [conv(v) for v in bin_data]
result = Series(cut(data, bins=bins))
@@ -491,7 +491,7 @@ def test_datetime_cut(data):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(Series(result), expected)
@@ -534,7 +534,7 @@ def test_datetime_tz_cut(bins, box):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -700,7 +700,7 @@ def test_cut_with_duplicated_index_lowest_included():
def test_cut_with_nonexact_categorical_indices():
# GH 42424
- ser = Series(range(0, 100))
+ ser = Series(range(100))
ser1 = cut(ser, 10).value_counts().head(5)
ser2 = cut(ser, 10).value_counts().tail(5)
result = DataFrame({"1": ser1, "2": ser2})
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 46da18445e135..28ad133a0c8d6 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -23,7 +23,7 @@
date_range,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
from pandas.core.reshape import reshape as reshape_lib
from pandas.core.reshape.pivot import pivot_table
@@ -33,7 +33,7 @@ def dropna(request):
return request.param
-@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))])
+@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))])
def interval_values(request, closed):
left, right = request.param
return Categorical(pd.IntervalIndex.from_arrays(left, right, closed))
@@ -215,14 +215,16 @@ def test_pivot_table_dropna_categoricals(self, dropna):
{
"A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
"B": [1, 2, 3, 1, 2, 3, 1, 2, 3],
- "C": range(0, 9),
+ "C": range(9),
}
)
- df["A"] = df["A"].astype(CDT(categories, ordered=False))
+ df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False))
result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna)
expected_columns = Series(["a", "b", "c"], name="A")
- expected_columns = expected_columns.astype(CDT(categories, ordered=False))
+ expected_columns = expected_columns.astype(
+ CategoricalDtype(categories, ordered=False)
+ )
expected_index = Series([1, 2, 3], name="B")
expected = DataFrame(
[[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]],
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
index 907eeca6e9b5e..bcfbe5ed1aa20 100644
--- a/pandas/tests/reshape/test_qcut.py
+++ b/pandas/tests/reshape/test_qcut.py
@@ -20,7 +20,7 @@
timedelta_range,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
from pandas.tseries.offsets import (
Day,
@@ -129,7 +129,9 @@ def test_qcut_return_intervals():
exp_levels = np.array(
[Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
)
- exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
+ exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
+ CategoricalDtype(ordered=True)
+ )
tm.assert_series_equal(res, exp)
@@ -199,7 +201,7 @@ def test_single_quantile(data, start, end, length, labels):
if labels is None:
intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
- expected = Series(intervals).astype(CDT(ordered=True))
+ expected = Series(intervals).astype(CategoricalDtype(ordered=True))
else:
expected = Series([0] * length, dtype=np.intp)
@@ -249,7 +251,7 @@ def test_datetime_tz_qcut(bins):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
index 619690f400d98..549f429f09d35 100644
--- a/pandas/tests/series/methods/test_interpolate.py
+++ b/pandas/tests/series/methods/test_interpolate.py
@@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self):
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
ser.interpolate(method="asfreq")
+
+ def test_interpolate_fill_value(self):
+ # GH#54920
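+ # the trailing NaN is filled with fill_value instead of being extrapolated;
+ # the leading NaN stays untouched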
+ pytest.importorskip("scipy")
+ ser = Series([np.nan, 0, 1, np.nan, 3, np.nan])
+ result = ser.interpolate(method="nearest", fill_value=0)
+ expected = Series([np.nan, 0, 1, 1, 3, 0])
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py
index 4dabf7b87e2cd..6740b8756853e 100644
--- a/pandas/tests/series/methods/test_pct_change.py
+++ b/pandas/tests/series/methods/test_pct_change.py
@@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method):
expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3)
tm.assert_series_equal(result, expected)
+
+
+def test_pct_change_no_warning_na_beginning():
+ # GH#54981
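+ # leading NaNs alone should not trigger a FutureWarning about fill_method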
+ ser = Series([None, None, 1, 2, 3])
+ result = ser.pct_change()
+ expected = Series([np.nan, np.nan, np.nan, 1, 0.5])
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
index bce7d2d554004..016208f2d2026 100644
--- a/pandas/tests/series/methods/test_reindex.py
+++ b/pandas/tests/series/methods/test_reindex.py
@@ -159,9 +159,9 @@ def test_reindex_inference():
def test_reindex_downcasting():
# GH4618 shifted series downcasting
- s = Series(False, index=range(0, 5))
+ s = Series(False, index=range(5))
result = s.shift(1).bfill()
- expected = Series(False, index=range(0, 5))
+ expected = Series(False, index=range(5))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cb703d3439d44..661290fb00d13 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -17,7 +17,7 @@
is_integer_dtype,
is_object_dtype,
)
-from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
+from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
@@ -1182,7 +1182,7 @@ def test_value_counts(self):
with tm.assert_produces_warning(FutureWarning, match=msg):
result = algos.value_counts(factor)
breaks = [-1.606, -1.018, -0.431, 0.155, 0.741]
- index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
+ index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True))
expected = Series([1, 0, 2, 1], index=index, name="count")
tm.assert_series_equal(result.sort_index(), expected.sort_index())
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
index ab00e18fc4812..b8e0173ee131f 100644
--- a/pandas/tests/window/test_groupby.py
+++ b/pandas/tests/window/test_groupby.py
@@ -99,7 +99,9 @@ def test_rolling(self, f, roll_frame):
r = g.rolling(window=4)
result = getattr(r, f)()
- expected = g.apply(lambda x: getattr(x.rolling(4), f)())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.rolling(4), f)())
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -113,7 +115,9 @@ def test_rolling_ddof(self, f, roll_frame):
r = g.rolling(window=4)
result = getattr(r, f)(ddof=1)
- expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -129,9 +133,11 @@ def test_rolling_quantile(self, interpolation, roll_frame):
r = g.rolling(window=4)
result = r.quantile(0.4, interpolation=interpolation)
- expected = g.apply(
- lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation)
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(
+ lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation)
+ )
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -174,7 +180,9 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame):
def func(x):
return getattr(x.rolling(4), f)(roll_frame)
- expected = g.apply(func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(func)
# GH 39591: The grouped column should be all np.nan
# (groupby.apply inserts 0s for cov)
expected["A"] = np.nan
@@ -190,7 +198,9 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame):
def func(x):
return getattr(x.B.rolling(4), f)(pairwise=True)
- expected = g.apply(func)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(func)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
@@ -235,7 +245,9 @@ def test_rolling_apply(self, raw, roll_frame):
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
- expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -466,20 +478,23 @@ def test_groupby_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
- "column1": range(6),
- "column2": range(6),
- "group": 3 * ["A", "B"],
- "date": [Timestamp("2019-01-01")] * 6,
+ "column1": range(8),
+ "column2": range(8),
+ "group": ["A"] * 4 + ["B"] * 4,
+ "date": [
+ Timestamp(date)
+ for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+ ]
+ * 2,
}
)
result = (
df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
)
expected = Series(
- [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
- index=MultiIndex.from_tuples(
- [("A", Timestamp("2019-01-01"))] * 3
- + [("B", Timestamp("2019-01-01"))] * 3,
+ [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+ index=MultiIndex.from_frame(
+ df[["group", "date"]],
names=["group", "date"],
),
name="column1",
@@ -490,10 +505,14 @@ def test_groupby_subset_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
- "column1": range(6),
- "column2": range(6),
- "group": 3 * ["A", "B"],
- "date": [Timestamp("2019-01-01")] * 6,
+ "column1": range(8),
+ "column2": range(8),
+ "group": ["A"] * 4 + ["B"] * 4,
+ "date": [
+ Timestamp(date)
+ for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+ ]
+ * 2,
}
)
@@ -503,10 +522,9 @@ def test_groupby_subset_rolling_subset_with_closed(self):
.sum()
)
expected = Series(
- [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
- index=MultiIndex.from_tuples(
- [("A", Timestamp("2019-01-01"))] * 3
- + [("B", Timestamp("2019-01-01"))] * 3,
+ [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+ index=MultiIndex.from_frame(
+ df[["group", "date"]],
names=["group", "date"],
),
name="column1",
@@ -778,9 +796,13 @@ def test_groupby_rolling_resulting_multiindex3(self):
def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame):
# GH 39732
g = roll_frame.groupby("A", group_keys=False)
- expected = g.apply(lambda x: x.rolling(4).sum()).index
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: x.rolling(4).sum()).index
_ = g.rolling(window=4)
- result = g.apply(lambda x: x.rolling(4).sum()).index
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = g.apply(lambda x: x.rolling(4).sum()).index
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
@@ -954,11 +976,13 @@ def test_groupby_monotonic(self):
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")
- expected = (
- df.set_index("date")
- .groupby("name")
- .apply(lambda x: x.rolling("180D")["amount"].sum())
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = (
+ df.set_index("date")
+ .groupby("name")
+ .apply(lambda x: x.rolling("180D")["amount"].sum())
+ )
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)
@@ -977,9 +1001,13 @@ def test_datelike_on_monotonic_within_each_group(self):
}
)
- expected = (
- df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = (
+ df.set_index("B")
+ .groupby("A")
+ .apply(lambda x: x.rolling("4s")["C"].mean())
+ )
result = df.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)
@@ -1009,7 +1037,9 @@ def test_expanding(self, f, frame):
r = g.expanding()
result = getattr(r, f)()
- expected = g.apply(lambda x: getattr(x.expanding(), f)())
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.expanding(), f)())
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -1023,7 +1053,9 @@ def test_expanding_ddof(self, f, frame):
r = g.expanding()
result = getattr(r, f)(ddof=0)
- expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -1039,9 +1071,11 @@ def test_expanding_quantile(self, interpolation, frame):
r = g.expanding()
result = r.quantile(0.4, interpolation=interpolation)
- expected = g.apply(
- lambda x: x.expanding().quantile(0.4, interpolation=interpolation)
- )
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(
+ lambda x: x.expanding().quantile(0.4, interpolation=interpolation)
+ )
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
@@ -1059,7 +1093,9 @@ def test_expanding_corr_cov(self, f, frame):
def func_0(x):
return getattr(x.expanding(), f)(frame)
- expected = g.apply(func_0)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(func_0)
# GH 39591: groupby.apply returns 1 instead of nan for windows
# with all nan values
null_idx = list(range(20, 61)) + list(range(72, 113))
@@ -1074,7 +1110,9 @@ def func_0(x):
def func_1(x):
return getattr(x.B.expanding(), f)(pairwise=True)
- expected = g.apply(func_1)
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(func_1)
tm.assert_series_equal(result, expected)
def test_expanding_apply(self, raw, frame):
@@ -1083,7 +1121,11 @@ def test_expanding_apply(self, raw, frame):
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
- expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
+ msg = "DataFrameGroupBy.apply operated on the grouping columns"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ expected = g.apply(
+ lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)
+ )
# groupby.apply doesn't drop the grouped-by column
expected = expected.drop("A", axis=1)
# GH 39732
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index f4d903dc19fb7..3fe922539780d 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -100,9 +100,9 @@ def test_freq_window_not_implemented(window):
index=date_range("2015-12-24", periods=10, freq="D"),
)
with pytest.raises(
- NotImplementedError, match="step is not supported with frequency windows"
+ NotImplementedError, match="^step (not implemented|is not supported)"
):
- df.rolling("3D", step=3)
+ df.rolling(window, step=3).sum()
@pytest.mark.parametrize("agg", ["cov", "corr"])
@@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering(
tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+ "closed,expected",
+ [
+ ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
+ ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
+ ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
+ ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
+ ],
+)
+def test_variable_window_nonunique(closed, expected, frame_or_series):
+ # GH 20712
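+ # a "2D" window over a DatetimeIndex with repeated timestamps; each
+ # ``closed`` option includes or excludes the duplicated edge points differently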
+ index = DatetimeIndex(
+ [
+ "2011-01-01",
+ "2011-01-01",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-03",
+ "2011-01-04",
+ "2011-01-04",
+ "2011-01-05",
+ "2011-01-06",
+ ]
+ )
+
+ df = frame_or_series(range(10), index=index, dtype=float)
+ expected = frame_or_series(expected, index=index, dtype=float)
+
+ result = df.rolling("2D", closed=closed).sum()
+
+ tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "closed,expected",
+ [
+ ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
+ ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
+ ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
+ ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
+ ],
+)
+def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
+ # GH 20712
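+ # same duplicated index, but the window bounds come from a
+ # VariableOffsetWindowIndexer built around a BusinessDay(2) offset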
+ index = DatetimeIndex(
+ [
+ "2011-01-01",
+ "2011-01-01",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-03",
+ "2011-01-04",
+ "2011-01-04",
+ "2011-01-05",
+ "2011-01-06",
+ ]
+ )
+
+ df = frame_or_series(range(10), index=index, dtype=float)
+ expected = frame_or_series(expected, index=index, dtype=float)
+
+ offset = BusinessDay(2)
+ indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
+ result = df.rolling(indexer, closed=closed, min_periods=1).sum()
+
+ tm.assert_equal(result, expected)
+
+
def test_even_number_window_alignment():
# see discussion in GH 38780
s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))
diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py
index 940f0845befa2..51f801ab3761b 100644
--- a/pandas/tests/window/test_rolling_functions.py
+++ b/pandas/tests/window/test_rolling_functions.py
@@ -388,7 +388,7 @@ def test_rolling_max_resample(step):
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
@@ -425,7 +425,7 @@ def test_rolling_min_resample(step):
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
@@ -445,7 +445,7 @@ def test_rolling_median_resample():
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
diff --git a/pyproject.toml b/pyproject.toml
index 845c2a63e84f0..74d6aaee286a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,8 @@ requires = [
# we don't want to force users to compile with 1.25 though
# (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x)
"oldest-supported-numpy>=2022.8.16; python_version<'3.12'",
- "numpy>=1.22.4; python_version>='3.12'",
+ # TODO: This needs to be updated when the official numpy 1.26 comes out
+ "numpy>=1.26.0b1; python_version>='3.12'",
"versioneer[toml]"
]
@@ -30,7 +31,9 @@ license = {file = 'LICENSE'}
requires-python = '>=3.9'
dependencies = [
"numpy>=1.22.4; python_version<'3.11'",
- "numpy>=1.23.2; python_version>='3.11'",
+ "numpy>=1.23.2; python_version=='3.11'",
+ # TODO: This needs to be updated when the official numpy 1.26 comes out
+ "numpy>=1.26.0b1; python_version>='3.12'",
"python-dateutil>=2.8.2",
"pytz>=2020.1",
"tzdata>=2022.1"
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index 47534226f972f..0931dd209ee05 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -33,6 +33,7 @@
"_agg_template_series",
"_agg_template_frame",
"_pipe_template",
+ "_apply_groupings_depr",
"__main__",
"_transform_template",
"_use_inf_as_na",