diff --git a/.circleci/config.yml b/.circleci/config.yml index 50f6a116a6630..ba124533e953a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.14.1 + pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: CIBW_BUILD: << parameters.cibw-build >> @@ -92,5 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f87aef5385898..3bd68c07dcbc3 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -109,7 +109,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -143,7 +143,7 @@ jobs: run: docker image prune -f - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -164,7 +164,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8715c5306a3b0..2182e89731990 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: - python steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 2550d4de34a45..55dd733d25b50 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -51,7 +51,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index e05f12ac6416a..deaf2be0a0423 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04abcf4ce8816..64a94d7fde5a9 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -62,7 +62,7 @@ jobs: cancel-in-progress: true steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6410f2edd6175..f2b426269098b 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -136,7 +136,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -194,7 +194,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -330,7 +330,7 @@ jobs: PYTEST_TARGET: pandas steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 5f541f1bae1fd..83d14b51092e6 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -48,7 +48,7 @@ jobs: sdist_file: ${{ steps.save-path.outputs.sdist_name }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -97,14 +97,13 @@ jobs: - [macos-12, macosx_*] - [windows-2022, win_amd64] # TODO: support PyPy? - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -150,8 +149,10 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge create-args: >- - python=${{ matrix.python[1] }} + python=3.11 anaconda-client wheel cache-downloads: true @@ -167,12 +168,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; - python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ - docker pull python:${{ matrix.python[1] }}-windowsservercore - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + # add rc to the end of the image name if the Python version is unreleased + docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v3 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f9bcd78c07b0..c01bf65818167 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.285 + rev: v0.0.287 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: alias: ruff-selected-autofixes args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.7' + rev: 'v2.9.1' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -84,7 +84,7 @@ repos: '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a6 + rev: v3.0.0a7 hooks: - id: pylint stages: [manual] @@ -124,7 +124,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.6.7 + rev: v0.6.8 hooks: - id: sphinx-lint - repo: local diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 09c4acc0ab309..0229cf15fbfb8 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks): self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) + indexer = list(range(50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 54bcdb0fa2843..04ac47a892a22 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -360,14 +360,14 @@ class MergeCategoricals: def setup(self): self.left_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Y": np.random.choice(["one", "two", "three"], size=(10000,)), } ) self.right_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), } ) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 2190136220c6c..927003b13d6be 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index cf85345cb0cc2..00df41cce3bae 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 3c1630714a041..d50ea20da1e0c 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b1cea49e22d15..10862630bd596 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -48,6 +48,7 @@ dependencies: - pymysql=1.0.2 - pyreadstat=1.1.5 - pytables=3.7.0 + - python-calamine=0.1.6 - pyxlsb=1.0.9 - s3fs=2022.05.0 - scipy=1.8.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b8a119ece4b03..904b55a813a9f 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 71686837451b4..4060cea73e7f6 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 # - pyreadstat>=1.1.5 not available on ARM - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md new file mode 100644 index 0000000000000..6c33de104ed90 --- /dev/null +++ b/doc/cheatsheet/README.md @@ -0,0 +1,22 @@ +# Pandas Cheat Sheet + +The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. +To create the PDF version, within Powerpoint, simply do a "Save As" +and pick "PDF" as the format. + +This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). + +| Topic | PDF | PPT | +|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | | | +| Pandas_Cheat_Sheet_JA | | | + + +**Alternative** + +Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets +developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats. + +| Topic | PDF | Streamlit | Google Colab | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas | | | Open In Colab | diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt deleted file mode 100644 index c57da38b31777..0000000000000 --- a/doc/cheatsheet/README.txt +++ /dev/null @@ -1,8 +0,0 @@ -The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. -To create the PDF version, within Powerpoint, simply do a "Save As" -and pick "PDF" as the format. - -This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2]. - -[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf -[2]: https://www.princetonoptimization.com/ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ae7c9d4ea9c62..2c0787397e047 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -281,6 +281,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 3.0.3 excel Writing Excel openpyxl 3.0.10 excel Reading / writing for xlsx files pyxlsb 1.0.9 excel Reading for xlsb files +python-calamine 0.1.6 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index 2dcc8b0abe3b8..caaff3557ae40 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -106,9 +106,9 @@ between square brackets ``[]``. .. note:: - If you are familiar to Python + If you are familiar with Python :ref:`dictionaries `, the selection of a - single column is very similar to selection of dictionary values based on + single column is very similar to the selection of dictionary values based on the key. You can create a ``Series`` from scratch as well: diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index c0d2a14507383..002e88533ab93 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp) + expected_df = gb.apply(GrowUp, include_groups=False) expected_df `Expanding apply diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index c28123cec4491..5dd14e243fbb3 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -420,6 +420,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose: Additionally, this method avoids recomputing the internal grouping information derived from the passed key. +You can also include the grouping columns if you want to operate on them. + +.. ipython:: python + + grouped[["A", "B"]].sum() + .. _groupby.iterating-label: Iterating through groups @@ -1053,7 +1059,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D").ffill() + df_re.groupby("group").resample("1D", include_groups=False).ffill() .. _groupby.filter: @@ -1219,13 +1225,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x) + df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) Numba Accelerated Routines @@ -1709,7 +1715,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics, include_groups=False) result diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 17f52c98a741b..3408da439d0ef 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1811,8 +1811,8 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` with optional parameters: -* ``path_or_buf`` : the pathname or buffer to write the output - This can be ``None`` in which case a JSON string is returned +* ``path_or_buf`` : the pathname or buffer to write the output. + This can be ``None`` in which case a JSON string is returned. * ``orient`` : ``Series``: @@ -3454,7 +3454,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. All formats can be read +using :ref:`calamine` engine. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. @@ -3495,6 +3496,9 @@ using internally. * For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files. +* For the engine calamine, pandas is using :func:`python_calamine.load_workbook` + to read in (``.xlsx``), (``.xlsm``), (``.xls``), (``.xlsb``), (``.ods``) files. + .. code-block:: python # Returns a DataFrame @@ -3936,7 +3940,8 @@ The :func:`~pandas.read_excel` method can also read binary Excel files using the ``pyxlsb`` module. The semantics and features for reading binary Excel files mostly match what can be done for `Excel files`_ using ``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types -in files and will return floats instead. +in files and will return floats instead (you can use :ref:`calamine` +if you need recognize datetime types). .. code-block:: python @@ -3948,6 +3953,20 @@ in files and will return floats instead. Currently pandas only supports *reading* binary Excel files. Writing is not implemented. +.. _io.calamine: + +Calamine (Excel and ODS files) +------------------------------ + +The :func:`~pandas.read_excel` method can read Excel file (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) +and OpenDocument spreadsheets (``.ods``) using the ``python-calamine`` module. +This module is a binding for Rust library `calamine `__ +and is faster than other engines in most cases. The optional dependency 'python-calamine' needs to be installed. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel("path_to_file.xlsb", engine="calamine") .. _io.clipboard: diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 3425986a37743..422efc1b36946 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -180,19 +180,36 @@ labeled the aggregated group with the end of the interval: the next day). DataFrame constructor with no columns specified. The v0.9.0 behavior (names ``X0``, ``X1``, ...) can be reproduced by specifying ``prefix='X'``: -.. ipython:: python - :okexcept: - - import io - - data = """ - a,b,c - 1,Yes,2 - 3,No,4 - """ - print(data) - pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix="X") +.. code-block:: ipython + + In [6]: import io + + In [7]: data = """ + ...: a,b,c + ...: 1,Yes,2 + ...: 3,No,4 + ...: """ + ...: + + In [8]: print(data) + + a,b,c + 1,Yes,2 + 3,No,4 + + In [9]: pd.read_csv(io.StringIO(data), header=None) + Out[9]: + 0 1 2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 + + In [10]: pd.read_csv(io.StringIO(data), header=None, prefix="X") + Out[10]: + X0 X1 X2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 92c37243b7e81..9c537b3a48c74 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -328,13 +328,24 @@ More consistent behavior for some groupby methods: - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - .. ipython:: python + .. code-block:: ipython - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame + In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g.apply(lambda x: x.head(1)) # used to simply fall-through + In [2]: g = df.groupby('A') + + In [3]: g.head(1) # filters DataFrame + Out[3]: + A B + 0 1 2 + 2 5 6 + + In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through + Out[4]: + A B + A + 1 0 1 2 + 5 2 5 6 - groupby head and tail respect column selection: diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index bb7beef449d93..acc5409b86d09 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -24,25 +24,61 @@ API changes - Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though a lexically sorted index will have a better performance. (:issue:`2646`) - .. ipython:: python - :okexcept: - :okwarning: + .. code-block:: ipython + + In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1], + ...: 'joe':['x', 'x', 'z', 'y'], + ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) + ...: - df = pd.DataFrame({'jim':[0, 0, 1, 1], - 'joe':['x', 'x', 'z', 'y'], - 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) - df - df.index.lexsort_depth + In [2]: df + Out[2]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 z 0.260476 + y 0.897237 + + [4 rows x 1 columns] + + In [3]: df.index.lexsort_depth + Out[3]: 1 # in prior versions this would raise a KeyError # will now show a PerformanceWarning - df.loc[(1, 'z')] + In [4]: df.loc[(1, 'z')] + Out[4]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] # lexically sorting - df2 = df.sort_index() - df2 - df2.index.lexsort_depth - df2.loc[(1,'z')] + In [5]: df2 = df.sort_index() + + In [6]: df2 + Out[6]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 y 0.897237 + z 0.260476 + + [4 rows x 1 columns] + + In [7]: df2.index.lexsort_depth + Out[7]: 2 + + In [8]: df2.loc[(1,'z')] + Out[8]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] - Bug in unique of Series with ``category`` dtype, which returned all categories regardless whether they were "used" or not (see :issue:`8559` for the discussion). diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7d9008fdbdecd..ee6a60144bc35 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group: df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df -.. ipython:: python +.. code-block:: ipython - df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + Out[1]: + A + 1 0 NaN + 1 NaN + 2 NaN + 3 1.5 + 4 2.5 + 5 3.5 + 6 4.5 + 7 5.5 + 8 6.5 + 9 7.5 + 10 8.5 + 11 9.5 + 12 10.5 + 13 11.5 + 14 12.5 + 15 13.5 + 16 14.5 + 17 15.5 + 18 16.5 + 19 17.5 + 2 20 NaN + 21 NaN + 22 NaN + 23 21.5 + 24 22.5 + 25 23.5 + 26 24.5 + 27 25.5 + 28 26.5 + 29 27.5 + 30 28.5 + 31 29.5 + 3 32 NaN + 33 NaN + 34 NaN + 35 33.5 + 36 34.5 + 37 35.5 + 38 36.5 + 39 37.5 + Name: B, dtype: float64 Now you can do: @@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to: df -.. ipython:: python +.. code-block:: ipython - df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + In[1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 Now you can do: -.. ipython:: python +.. code-block:: ipython - df.groupby("group").resample("1D").ffill() + In[1]: df.groupby("group").resample("1D").ffill() + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 .. _whatsnew_0181.enhancements.method_chain: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4ad5f4e7b5c3d..b013d03b2d68c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -286,6 +286,7 @@ value. (:issue:`17054`) .. ipython:: python + from io import StringIO result = pd.read_html(StringIO(""" diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 1ef60960a51c3..6d5da7cdff3b3 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -13,15 +13,30 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`concat` when :class:`DataFrame` 's have two different extension dtypes (:issue:`54848`) +- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`) - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`) +- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`) +- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`) - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`) +- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`) +- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`) +- Fixed regression in :meth:`DataFrameGroupBy.agg` when aggregating a DataFrame with duplicate column names using a dictionary (:issue:`55006`) +- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`) +- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) +- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) +- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) +- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) .. --------------------------------------------------------------------------- .. _whatsnew_211.bug_fixes: Bug fixes ~~~~~~~~~ -- +- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) +- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.dt.tz` with :class:`ArrowDtype` where a string was returned instead of a ``tzinfo`` object (:issue:`55003`) +- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) .. --------------------------------------------------------------------------- .. _whatsnew_211.other: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c6d8ad8be62c4..b79d5341d226a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,10 +14,27 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.enhancement1: +.. _whatsnew_220.enhancements.calamine: + +Calamine engine for :func:`read_excel` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``calamine`` engine was added to :func:`read_excel`. +It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine `__. +This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). + +There are two advantages of this engine: + +1. Calamine is often faster than other engines, some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. + But, 'openpyxl' and 'pyxlsb' are faster in reading a few rows from large files because of lazy iteration over rows. +2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb' which is the only other engine in pandas that can read ``.xlsb`` files. + +.. code-block:: python + + pd.read_excel("path_to_file.xlsb", engine="calamine") -enhancement1 -^^^^^^^^^^^^ + +For more, see :ref:`io.calamine` in the user guide on IO tools. .. _whatsnew_220.enhancements.enhancement2: @@ -28,7 +45,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - .. --------------------------------------------------------------------------- @@ -147,19 +164,22 @@ Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`) +- Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`, :issue:`54883`) +- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) +- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) +- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - @@ -169,10 +189,12 @@ Performance improvements Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) Categorical ^^^^^^^^^^^ -- +- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) - Datetimelike @@ -227,7 +249,9 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) Period ^^^^^^ @@ -246,8 +270,8 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- -- +- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) Sparse ^^^^^^ diff --git a/environment.yml b/environment.yml index 1a9dffb55bca7..8deae839f5408 100644 --- a/environment.yml +++ b/environment.yml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 @@ -105,7 +106,7 @@ dependencies: - ipykernel # web - - jinja2 # in optional dependencies, but documented here as needed + # - jinja2 # already listed in optional dependencies, but documented here for reference - markdown - feedparser - pyyaml diff --git a/generate_version.py b/generate_version.py index 46e9f52bfc5de..06e38ce0fd978 100644 --- a/generate_version.py +++ b/generate_version.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Note: This file has to live next to setup.py or versioneer will not work import argparse import os diff --git a/meson.build b/meson.build index 09a1494135af4..e0e533ffade97 100644 --- a/meson.build +++ b/meson.build @@ -2,19 +2,17 @@ project( 'pandas', 'c', 'cpp', 'cython', - version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(), + version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.0.1', default_options: [ 'buildtype=release', - # TODO: Reactivate werror, some warnings on Windows - #'werror=true', 'c_std=c99' ] ) fs = import('fs') -py = import('python').find_installation() +py = import('python').find_installation(pure: false) tempita = files('generate_pxi.py') versioneer = files('generate_version.py') @@ -30,7 +28,7 @@ add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c if fs.exists('_version_meson.py') - py.install_sources('_version_meson.py', pure: false, subdir: 'pandas') + py.install_sources('_version_meson.py', subdir: 'pandas') else custom_target('write_version_file', output: '_version_meson.py', @@ -40,11 +38,15 @@ else build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir(pure: false) / 'pandas' + install_dir: py.get_install_dir() / 'pandas' ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources('pyproject.toml', pure: false, subdir: 'pandas') +py.install_sources( + 'pyproject.toml', + subdir: 'pandas' +) + subdir('pandas') diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index f302c649bc7bd..1cf2c4343d844 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -69,7 +69,8 @@ libs_sources = { 'index': {'sources': ['index.pyx', _index_class_helper]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, 'join': {'sources': ['join.pyx', _khash_primitive_helper], 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, @@ -113,8 +114,9 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs') +py.install_sources( + '__init__.py', + subdir: 'pandas/_libs' +) subdir('window') diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6d66e21ce49f5..5f51f48b43ca9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -6,7 +6,6 @@ from csv import ( QUOTE_NONE, QUOTE_NONNUMERIC, ) -import sys import time import warnings @@ -880,9 +879,15 @@ cdef class TextReader: cdef _check_tokenize_status(self, int status): if self.parser.warn_msg != NULL: - print(PyUnicode_DecodeUTF8( - self.parser.warn_msg, strlen(self.parser.warn_msg), - self.encoding_errors), file=sys.stderr) + warnings.warn( + PyUnicode_DecodeUTF8( + self.parser.warn_msg, + strlen(self.parser.warn_msg), + self.encoding_errors + ), + ParserWarning, + stacklevel=find_stack_level() + ) free(self.parser.warn_msg) self.parser.warn_msg = NULL diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index abd3fb9e1fef3..ce8a38df172ef 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -664,7 +664,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c))) +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index e3e710ce1b876..942bd0b518144 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -44,6 +44,7 @@ Numeric decoder derived from TCL library #include #include #include +#include #include #include #include @@ -763,7 +764,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } wstr = enc->offset; // Conversion. Number is reversed. diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 14d2eef46da20..167695b84514c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,6 +31,7 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs/tslibs') +py.install_sources( + '__init__.py', + subdir: 'pandas/_libs/tslibs' +) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7b2ee68c73ad2..c3ee68e14a8d4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,3 @@ -cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -18,6 +17,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT @@ -545,7 +545,6 @@ cdef ndarray astype_round_check( return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,6 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -565,28 +565,44 @@ cdef int64_t get_conversion_factor( return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, @@ -595,7 +611,7 @@ cdef int64_t convert_reso( bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -624,9 +640,12 @@ cdef int64_t convert_reso( else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + raise OverflowError("result would overflow") + + res_value = value * mult return res_value diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 8826757e31c32..c85865fea8fd0 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,7 +89,7 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod - def now(cls, freq: BaseOffset = ...) -> Period: ... + def now(cls, freq: Frequency = ...) -> Period: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index aba9b25b23154..6d993722ce1d4 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -14,6 +14,7 @@ from pandas._libs.tslibs import ( Tick, ) from pandas._typing import ( + Frequency, Self, npt, ) @@ -117,9 +118,9 @@ class Timedelta(timedelta): @property def asm8(self) -> np.timedelta64: ... # TODO: round/floor/ceil could return NaT? - def round(self, freq: str) -> Self: ... - def floor(self, freq: str) -> Self: ... - def ceil(self, freq: str) -> Self: ... + def round(self, freq: Frequency) -> Self: ... + def floor(self, freq: Frequency) -> Self: ... + def ceil(self, freq: Frequency) -> Self: ... @property def resolution_string(self) -> str: ... def __add__(self, other: timedelta) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 36ae2d6d892f1..e23f01b800874 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -8,6 +8,8 @@ from datetime import ( from time import struct_time from typing import ( ClassVar, + Literal, + TypeAlias, TypeVar, overload, ) @@ -27,6 +29,7 @@ from pandas._typing import ( ) _DatetimeT = TypeVar("_DatetimeT", bound=datetime) +_TimeZones: TypeAlias = str | _tzinfo | None | int def integer_op_not_supported(obj: object) -> TypeError: ... @@ -51,13 +54,13 @@ class Timestamp(datetime): tzinfo: _tzinfo | None = ..., *, nanosecond: int | None = ..., - tz: str | _tzinfo | None | int = ..., + tz: _TimeZones = ..., unit: str | int | None = ..., fold: int | None = ..., ) -> _DatetimeT | NaTType: ... @classmethod def _from_value_and_reso( - cls, value: int, reso: int, tz: _tzinfo | None + cls, value: int, reso: int, tz: _TimeZones ) -> Timestamp: ... @property def value(self) -> int: ... # np.int64 @@ -84,19 +87,19 @@ class Timestamp(datetime): @property def fold(self) -> int: ... @classmethod - def fromtimestamp(cls, ts: float, tz: _tzinfo | None = ...) -> Self: ... + def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ... @classmethod def utcfromtimestamp(cls, ts: float) -> Self: ... @classmethod - def today(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def today(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def fromordinal( cls, ordinal: int, - tz: _tzinfo | str | None = ..., + tz: _TimeZones = ..., ) -> Self: ... @classmethod - def now(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def now(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def utcnow(cls) -> Self: ... # error: Signature of "combine" incompatible with supertype "datetime" @@ -131,7 +134,7 @@ class Timestamp(datetime): fold: int | None = ..., ) -> Self: ... # LSP violation: datetime.datetime.astimezone has a default value for tz - def astimezone(self, tz: _tzinfo | None) -> Self: ... # type: ignore[override] + def astimezone(self, tz: _TimeZones) -> Self: ... # type: ignore[override] def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... @classmethod @@ -184,12 +187,12 @@ class Timestamp(datetime): def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... - def tz_convert(self, tz: _tzinfo | str | None) -> Self: ... + def tz_convert(self, tz: _TimeZones) -> Self: ... # TODO: could return NaT? def tz_localize( self, - tz: _tzinfo | str | None, - ambiguous: str = ..., + tz: _TimeZones, + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def normalize(self) -> Self: ... @@ -197,19 +200,19 @@ class Timestamp(datetime): def round( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def floor( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def ceil( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def day_name(self, locale: str | None = ...) -> str: ... diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 02934346130a5..7b306c5e681e0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,6 +138,8 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: diff --git a/pandas/_typing.py b/pandas/_typing.py index 743815b91210d..c2bbebfbe2857 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -112,7 +112,7 @@ # Cannot use `Sequence` because a string is a sequence, and we don't want to # accept that. Could refine if https://github.com/python/typing/issues/256 is # resolved to differentiate between Sequence[str] and str -ListLike = Union[AnyArrayLike, list, range] +ListLike = Union[AnyArrayLike, list, tuple, range] # scalars diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index be0a762642e46..684e9dccdc0f9 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) if TYPE_CHECKING: @@ -186,6 +187,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under9p0", "pa_version_under11p0", "pa_version_under13p0", + "pa_version_under14p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c5792fa1379fe..fa0e9e974ea39 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,6 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.5", "pytest": "7.3.2", + "python-calamine": "0.1.6", "pyxlsb": "1.0.9", "s3fs": "2022.05.0", "scipy": "1.8.1", @@ -62,6 +63,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 049ce50920e28..12f58be109d98 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,7 @@ pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") + pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -23,3 +24,4 @@ pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True + pa_version_under14p0 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index d327f8d619f13..b083317c3174b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -71,6 +71,7 @@ Index, MultiIndex, ) +from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -192,6 +193,10 @@ def pytest_collection_modifyitems(items, config) -> None: item.add_marker(pytest.mark.arraymanager) +hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] +if Version(hypothesis.__version__) >= Version("6.83.2"): + hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) + # Hypothesis hypothesis.settings.register_profile( "ci", @@ -203,7 +208,7 @@ def pytest_collection_modifyitems(items, config) -> None: # 2022-02-09: Changed deadline from 500 -> None. Deadline leads to # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969) deadline=None, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), + suppress_health_check=tuple(hypothesis_health_checks), ) hypothesis.settings.load_profile("ci") diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 5cd4779907146..0a26acb7df60a 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -15,6 +15,45 @@ from pandas.compat._optional import import_optional_dependency +@functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + @functools.cache def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): if TYPE_CHECKING: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0a9c1aad46f89..1d74bb8b83e4e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -878,7 +878,9 @@ def value_counts_internal( if bins is not None: from pandas.core.reshape.tile import cut - values = Series(values, copy=False) + if isinstance(values, Series): + values = values._values + try: ii = cut(values, bins, include_lowest=True) except TypeError as err: @@ -998,7 +1000,7 @@ def duplicated( duplicated : ndarray[bool] """ if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype): + if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": values = values._to_masked() # type: ignore[union-attr] if isinstance(values.dtype, BaseMaskedDtype): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4d6dd8f4fd577..9748d4fe66739 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,6 +49,7 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike @@ -80,6 +81,8 @@ def frame_apply( raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +103,8 @@ def frame_apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -436,7 +441,13 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) func = cast(AggFuncTypeDict, self.func) func = self.normalize_dictlike_arg(op_name, selected_obj, func) @@ -450,7 +461,7 @@ def compute_dict_like( colg = obj._gotitem(selection, ndim=1) results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] keys = list(func.keys()) - elif is_non_unique_col: + elif not is_groupby and is_non_unique_col: # key used for column selection and output # GH#51099 results = [] @@ -750,11 +761,15 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") + self.engine = engine + self.engine_kwargs = engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -799,6 +814,12 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" + + if self.engine == "numba" and not self.raw: + raise ValueError( + "The numba engine in DataFrame.apply can only be used when raw=True" + ) + # dispatch to handle list-like or dict-like if is_list_like(self.func): return self.apply_list_or_dict_like() @@ -828,7 +849,7 @@ def apply(self) -> DataFrame | Series: # raw elif self.raw: - return self.apply_raw() + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) return self.apply_standard() @@ -901,7 +922,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self): + def apply_raw(self, engine="python", engine_kwargs=None): """apply to the values as a numpy array""" def wrap_function(func): @@ -919,7 +940,27 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + if engine == "numba": + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs # type: ignore[arg-type] + ) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) # TODO: mixed type case if result.ndim == 2: @@ -1826,12 +1867,12 @@ def warn_alias_replacement( full_alias = alias else: full_alias = f"{type(obj).__name__}.{alias}" - alias = f"'{alias}'" + alias = f'"{alias}"' warnings.warn( f"The provided callable {func} is currently using " f"{full_alias}. In a future version of pandas, " f"the provided callable will be used directly. To keep current " - f"behavior pass {alias} instead.", + f"behavior pass the string {alias} instead.", category=FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e815b8292b0cc..2b2e0c843564f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import ( Timedelta, Timestamp, + timezones, ) from pandas.compat import ( pa_version_under7p0, @@ -29,12 +30,12 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_integer, is_list_like, - is_object_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1240,46 +1241,50 @@ def to_numpy( ) -> np.ndarray: if dtype is not None: dtype = np.dtype(dtype) - elif self._hasna: - dtype = np.dtype(object) if na_value is lib.no_default: na_value = self.dtype.na_value pa_type = self._pa_array.type + if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): + data = self + else: + data = self.fillna(na_value) + copy = False + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = self._maybe_convert_datelike_array() + result = data._maybe_convert_datelike_array() if dtype is None or dtype.kind == "O": result = result.to_numpy(dtype=object, na_value=na_value) else: result = result.to_numpy(dtype=dtype) - return result elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray - result = np.array(list(self), dtype=dtype) - elif is_object_dtype(dtype) and self._hasna: - result = np.empty(len(self), dtype=object) - mask = ~self.isna() - result[mask] = np.asarray(self[mask]._pa_array) - elif pa.types.is_null(self._pa_array.type): - fill_value = None if isna(na_value) else na_value - return np.full(len(self), fill_value=fill_value, dtype=dtype) - elif self._hasna: - data = self.fillna(na_value) + result = np.array(list(data), dtype=dtype) + if data._hasna: + result[data.isna()] = na_value + elif pa.types.is_null(pa_type): + if dtype is not None and isna(na_value): + na_value = None + result = np.full(len(data), fill_value=na_value, dtype=dtype) + elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): result = data._pa_array.to_numpy() - if dtype is not None: - result = result.astype(dtype, copy=False) - return result - else: - result = self._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) if copy: result = result.copy() - return result - if self._hasna: - result[self.isna()] = na_value + else: + if dtype is None: + empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False) + if can_hold_element(empty, na_value): + dtype = empty.dtype + else: + dtype = np.object_ + result = np.empty(len(data), dtype=dtype) + mask = data.isna() + result[mask] = na_value + result[~mask] = data[~mask]._pa_array.to_numpy() return result def unique(self) -> Self: @@ -2188,11 +2193,11 @@ def _str_rstrip(self, to_strip=None): return type(self)(result) def _str_removeprefix(self, prefix: str): - # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed - # starts_with = pc.starts_with(self._pa_array, pattern=prefix) - # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - # result = pc.if_else(starts_with, removed, self._pa_array) - # return type(self)(result) + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) predicate = lambda val: val.removeprefix(prefix) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2421,7 +2426,7 @@ def _dt_time(self): @property def _dt_tz(self): - return self.dtype.pyarrow_dtype.tz + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) @property def _dt_unit(self): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9f63d1f97c54f..8d2633c10b428 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2597,7 +2597,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) - code_values = self.categories.get_indexer(values) + code_values = self.categories.get_indexer_for(values) code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) @@ -2948,7 +2948,7 @@ def recode_for_categories( return codes indexer = coerce_indexer_dtype( - new_categories.get_indexer(old_categories), new_categories + new_categories.get_indexer_for(old_categories), new_categories ) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ff273e221394a..52596f29ffc0c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -107,6 +107,7 @@ algorithms, missing, nanops, + ops, ) from pandas.core.algorithms import ( checked_add_with_arr, @@ -947,8 +948,12 @@ def _cmp_method(self, other, op): dtype = getattr(other, "dtype", None) if is_object_dtype(dtype): - return op(np.asarray(self, dtype=object), other) - + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would raise when comparing to None + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result if other is NaT: if op is operator.ne: result = np.ones(self.shape, dtype=bool) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c90127c0e9812..693ebad0ca16f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -123,7 +123,8 @@ def __init__(self, storage=None) -> None: storage = get_option("mode.string_storage") if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f438f75707265..6262055827428 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -15,7 +15,10 @@ lib, missing as libmissing, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + pa_version_under7p0, + pa_version_under13p0, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -47,6 +50,8 @@ if TYPE_CHECKING: + from collections.abc import Sequence + from pandas._typing import ( Dtype, Scalar, @@ -334,19 +339,13 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_replace( self, @@ -365,6 +364,12 @@ def _str_replace( result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): @@ -379,6 +384,19 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) return self._result_converter(result) @@ -417,7 +435,7 @@ def _str_isupper(self): def _str_len(self): result = pc.utf8_length(self._pa_array) - return Int64Dtype().__from_arrow__(result) + return self._convert_int_dtype(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -446,10 +464,56 @@ def _str_rstrip(self, to_strip=None): result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + return super()._str_removeprefix(prefix) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat) + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result) + + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" + def __init__(self, values) -> None: + _chk_pyarrow_available() + + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( + values.type + ): + values = pc.cast(values, pa.string()) + super().__init__(values) + @classmethod def _result_converter(cls, values, na=None): if not isna(na): @@ -459,7 +523,10 @@ def _result_converter(cls, values, na=None): def __getattribute__(self, item): # ArrowStringArray and we both inherit from ArrowExtensionArray, which # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) @@ -517,34 +584,11 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result - def _str_count(self, pat: str, flags: int = 0): - if flags: - return super()._str_count(pat, flags) - result = pc.count_substring_regex(self._pa_array, pat).to_numpy() - return self._convert_int_dtype(result) - - def _str_len(self): - result = pc.utf8_length(self._pa_array).to_numpy() - return self._convert_int_dtype(result) - - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - return super()._str_find(sub, start, end) - return self._convert_int_dtype(result.to_numpy()) - def _cmp_method(self, other, op): result = super()._cmp_method(other, op) return result.to_numpy(np.bool_, na_value=False) diff --git a/pandas/core/base.py b/pandas/core/base.py index d973f8f5fe35a..3026189e747bb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -485,8 +485,8 @@ def array(self) -> ExtensionArray: types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. - ``.array`` differs ``.values`` which may require converting the - data to a different form. + ``.array`` differs from ``.values``, which may require converting + the data to a different form. See Also -------- diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 62455f119a02f..750b374043193 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -513,11 +513,11 @@ def use_inf_as_na_cb(key) -> None: auto, {others}. """ -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f76163cbbd0a1..12de63967c78f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -70,7 +70,7 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: F811, TCH004 + import pyarrow as pa # noqa: TCH004 from pandas._typing import ( Dtype, @@ -2148,6 +2148,8 @@ def type(self): return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list + elif pa.types.is_fixed_size_list(pa_type): + return list elif pa.types.is_map(pa_type): return list elif pa.types.is_struct(pa_type): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 227994ab924c1..188df1689ead7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1937,11 +1937,17 @@ def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., into: type[dict] = ..., + index: bool = ..., ) -> dict: ... @overload - def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: + def to_dict( + self, + orient: Literal["records"], + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... @deprecate_nonkeyword_arguments( @@ -8876,20 +8882,20 @@ def update( >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed Animal - Falcon 0 Falcon 380.0 - 1 Falcon 370.0 - Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - - >>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 """ ) ) @@ -9932,6 +9938,8 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): """ @@ -9991,6 +9999,35 @@ def apply( If False, the funcs will be passed the whole Series at once. .. versionadded:: 2.1.0 + + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : + + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + Note: The numba compiler only supports a subset of + valid Python/numpy operations. + + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. + + As of right now, the numba engine can only be used with raw=True. + + .. versionadded:: 2.2.0 + + engine_kwargs : dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -10091,6 +10128,8 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -11310,7 +11349,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: def any( # type: ignore[override] self, *, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11325,7 +11364,7 @@ def any( # type: ignore[override] @doc(make_doc("all", ndim=2)) def all( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11724,6 +11763,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series: ... @@ -11734,6 +11774,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11744,6 +11785,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11843,11 +11885,10 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - # error: List item 0 has incompatible type "Union[float, Union[Union[ - # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; - # expected "float" - res_df = self.quantile( # type: ignore[call-overload] - [q], + # error: List item 0 has incompatible type "float | ExtensionArray | + # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" + res_df = self.quantile( + [q], # type: ignore[list-item] axis=axis, numeric_only=numeric_only, interpolation=interpolation, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8e75d27a953ad..0523b25c39602 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2848,7 +2848,7 @@ def to_sql( index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column - name in the table. + name in the table. Creates a table index for this column. index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. @@ -5719,10 +5719,12 @@ def filter( if items is not None: name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) # error: Keywords must be strings - return self.reindex( # type: ignore[misc] - **{name: labels.intersection(items)} - ) + return self.reindex(**{name: items}) # type: ignore[misc] elif like: def f(x) -> bool_t: @@ -7939,6 +7941,51 @@ def replace( else: return result.__finalize__(self, method="replace") + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[False] = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[True], + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> None: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: bool_t = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self | None: + ... + @final def interpolate( self, @@ -8181,10 +8228,11 @@ def interpolate( stacklevel=find_stack_level(), ) - if "fill_value" in kwargs: + if method in fillna_methods and "fill_value" in kwargs: raise ValueError( "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate" + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" ) if isinstance(obj.index, MultiIndex) and method != "linear": @@ -8608,6 +8656,42 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[False] = ..., + **kwargs, + ) -> Self: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: bool_t = ..., + **kwargs, + ) -> Self | None: + ... + @final def clip( self, @@ -11710,15 +11794,21 @@ def pct_change( stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if self.isna().values.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will be " + "removed in a future version. Call ffill before calling " + "pct_change to retain current behavior and silence this " + "warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None @@ -11744,7 +11834,7 @@ def _logical_func( self, name: str, func, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -11757,7 +11847,10 @@ def _logical_func( res = self._logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) - return res._logical_func(name, func, skipna=skipna, **kwargs) + # error: Item "bool" of "Series | bool" has no attribute "_logical_func" + return res._logical_func( # type: ignore[union-attr] + name, func, skipna=skipna, **kwargs + ) elif axis is None: axis = 0 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 11c97e30ab5cd..e6dd6a990d285 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -180,6 +180,19 @@ class providing the base-class of operations. A callable that takes a {input} as its first argument, and returns a dataframe, a series or a scalar. In addition the callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + args, kwargs : tuple and dict Optional positional and keyword arguments to pass to ``func``. @@ -272,7 +285,7 @@ class providing the base-class of operations. each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min()) + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) A a 5 b 2 @@ -1080,7 +1093,8 @@ def get_group(self, name, obj=None) -> DataFrame | Series: raise KeyError(name) if obj is None: - return self._selected_obj.iloc[inds] + indexer = inds if self.axis == 0 else (slice(None), inds) + return self._selected_obj.iloc[indexer] else: warnings.warn( "obj is deprecated and will be removed in a future version. " @@ -1747,7 +1761,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): input="dataframe", examples=_apply_docs["dataframe_examples"] ) ) - def apply(self, func, *args, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: orig_func = func func = com.is_builtin_func(func) if orig_func != func: @@ -1780,10 +1794,25 @@ def f(g): else: f = func + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=FutureWarning, + stacklevel=find_stack_level(), + ) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -3519,7 +3548,7 @@ def describe( return result @final - def resample(self, rule, *args, **kwargs) -> Resampler: + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3533,7 +3562,23 @@ def resample(self, rule, *args, **kwargs) -> Resampler: ---------- rule : str or DateOffset The offset string or object representing target grouper conversion. - *args, **kwargs + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and `on`, and other arguments of `TimeGrouper`. @@ -3569,59 +3614,71 @@ def resample(self, rule, *args, **kwargs) -> Resampler: Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby('a').resample('3min').sum() - a b + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30s').sum() - a b + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. Values are assigned to the month of the period. - >>> df.groupby('a').resample('M').sum() - a b + >>> df.groupby('a').resample('M', include_groups=False).sum() + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> df.groupby('a').resample('3min', closed='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. - >>> df.groupby('a').resample('3min', closed='right', label='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 """ from pandas.core.resample import get_resampler_for_grouping - return get_resampler_for_grouping(self, rule, *args, **kwargs) + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) @final def rolling(self, *args, **kwargs) -> RollingGroupby: @@ -5727,3 +5784,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." +) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..c13ec51ff3851 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -262,7 +262,9 @@ def get_window_bounds( # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 781dfae7fef64..877b8edb32520 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -239,8 +239,12 @@ def _unique_indices(inds, dtype) -> Index: Index """ if all(isinstance(ind, Index) for ind in inds): - result = inds[0].append(inds[1:]).unique() - result = result.astype(dtype, copy=False) + inds = [ind.astype(dtype, copy=False) for ind in inds] + result = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[result.get_indexer_for(other) == -1] + if len(diff): + result = result.append(diff.unique()) if sort: result = result.sort_values() return result @@ -288,7 +292,6 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): - sort = True result = indexes[0] elif len(dtis) > 1: @@ -377,5 +380,5 @@ def all_indexes_same(indexes) -> bool: def default_index(n: int) -> RangeIndex: - rng = range(0, n) + rng = range(n) return RangeIndex._simple_new(rng, name=None) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a397862712de..8756bb3f3c81b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3615,21 +3615,10 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + other = other.unique() + the_diff = self[other.get_indexer_for(self) == -1] + the_diff = the_diff if self.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): @@ -4557,7 +4546,7 @@ def join( ------- join_index, (left_indexer, right_indexer) - Examples + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index dcb5f8caccd3e..400747cbf6b8d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -198,8 +198,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): timetz dayofyear day_of_year - weekofyear - week dayofweek day_of_week weekday diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4d33f0137d3c4..b2d463a8c6c26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -177,7 +177,7 @@ def concatenate_managers( values = np.concatenate(vals, axis=1) # type: ignore[arg-type] elif is_1d_only_ea_dtype(blk.dtype): # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals, axis=1, ea_compat_axis=True) + values = concat_compat(vals, axis=0, ea_compat_axis=True) values = ensure_block_shape(values, ndim=2) else: values = concat_compat(vals, axis=1) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4cb7b610074ba..b1db2d2e708e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -969,6 +969,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager: n = len(self) if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 result = np.empty(n, dtype=object) else: result = np.empty(n, dtype=dtype) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index e89f641e17296..f4e0dcddcd34a 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -106,13 +106,13 @@ def to_dict( return into_c((k, v.to_dict(into)) for k, v in df.items()) elif orient == "list": - object_dtype_indices_as_set = set(box_native_indices) + object_dtype_indices_as_set: set[int] = set(box_native_indices) return into_c( ( k, - list(map(maybe_box_native, v.tolist())) + list(map(maybe_box_native, v.to_numpy().tolist())) if i in object_dtype_indices_as_set - else v.tolist(), + else v.to_numpy().tolist(), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5ff18d8a25e36..9605bf154a8b7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -32,7 +32,10 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -57,6 +60,7 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, + _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -163,6 +167,7 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, + include_groups: bool = True, ) -> None: self._timegrouper = timegrouper self.keys = None @@ -171,6 +176,7 @@ def __init__( self.kind = kind self.group_keys = group_keys self.as_index = True + self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -444,7 +450,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -456,15 +464,21 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) return self._wrap_result(result) - def _get_resampler_for_grouping(self, groupby: GroupBy, key): + def _get_resampler_for_grouping( + self, groupby: GroupBy, key, include_groups: bool = True + ): """ Return the correct class for resampling with groupby. """ - return self._resampler_for_grouping(groupby=groupby, key=key, parent=self) + return self._resampler_for_grouping( + groupby=groupby, key=key, parent=self, include_groups=include_groups + ) def _wrap_result(self, result): """ @@ -1590,6 +1604,7 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, + include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1612,6 +1627,7 @@ def __init__( self.ax = parent.ax self.obj = parent.obj + self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1628,7 +1644,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = self._groupby.apply(func) + result = _apply(self._groupby, func, include_groups=self.include_groups) return self._wrap_result(result) _upsample = _apply @@ -2003,6 +2019,7 @@ def get_resampler_for_grouping( limit: int | None = None, kind=None, on=None, + include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -2011,7 +2028,9 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) + return resampler._get_resampler_for_grouping( + groupby=groupby, include_groups=include_groups, key=tg.key + ) class TimeGrouper(Grouper): @@ -2789,3 +2808,18 @@ def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: category=FutureWarning, stacklevel=find_stack_level(), ) + + +def _apply( + grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs +) -> DataFrame: + # GH#7155 - rewrite warning to appear as if it came from `.resample` + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") + with rewrite_warning( + target_message=target_message, + target_category=FutureWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) + return result diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f3695fb87ea78..6d1ff07e07c76 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1272,12 +1272,7 @@ def _get_merge_keys( # work-around for merge_asof(right_index=True) right_keys.append(right.index._values) if lk is not None and lk == rk: # FIXME: what about other NAs? - # avoid key upcast in corner case (length-0) - lk = cast(Hashable, lk) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) + right_drop.append(rk) else: rk = cast(ArrayLike, rk) right_keys.append(rk) @@ -2421,7 +2416,8 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): import pyarrow as pa import pyarrow.compute as pc @@ -2437,8 +2433,12 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), len(dc.dictionary), ) if how == "right": diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 924b56f7a14d5..79354fdd12a2d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Callable, + Literal, cast, ) @@ -449,7 +450,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -467,7 +468,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) @@ -569,7 +570,7 @@ def crosstab( margins: bool = False, margins_name: Hashable = "All", dropna: bool = True, - normalize: bool = False, + normalize: bool | Literal[0, 1, "all", "index", "columns"] = False, ) -> DataFrame: """ Compute a simple cross tabulation of two (or more) factors. diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fc8d827cd31bb..bf7c7a1ee4dc7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -908,7 +908,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: data = frame.copy() else: # Take the data from frame corresponding to this idx value - if not isinstance(idx, tuple): + if len(level) == 1: idx = (idx,) gen = iter(idx) column_indexer = tuple( diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 43eea7c669ce7..126f589f5df71 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,7 +43,6 @@ to_datetime, to_timedelta, ) -from pandas.core import nanops import pandas.core.algorithms as algos if TYPE_CHECKING: @@ -243,43 +242,18 @@ def cut( # NOTE: this binning code is changed a bit from histogram for var(x) == 0 original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) if not np.iterable(bins): - if is_scalar(bins) and bins < 1: - raise ValueError("`bins` should be a positive integer.") - - sz = x.size - - if sz == 0: - raise ValueError("Cannot cut empty array") - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = (mi + 0.0 for mi in rng) - - if np.isinf(mn) or np.isinf(mx): - # GH 24314 - raise ValueError( - "cannot specify integer `bins` when input data contains infinity" - ) - if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - else: # adjust end points after binning - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 # 0.1% of the range - if right: - bins[0] -= adj - else: - bins[-1] += adj + bins = _nbins_to_bins(x_idx, bins, right) elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: + bins = Index(bins) if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): bins = np.asarray(bins, dtype=DT64NS_DTYPE) else: @@ -289,9 +263,10 @@ def cut( # GH 26045: cast to float64 to avoid an overflow if (np.diff(bins.astype("float64")) < 0).any(): raise ValueError("bins must increase monotonically.") + bins = Index(bins) fac, bins = _bins_to_cuts( - x, + x_idx, bins, right=right, labels=labels, @@ -367,18 +342,18 @@ def qcut( array([0, 0, 1, 2, 3]) """ original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x) + x_np = np.asarray(x_idx) x_np = x_np[~np.isnan(x_np)] bins = np.quantile(x_np, quantiles) fac, bins = _bins_to_cuts( - x, - bins, + x_idx, + Index(bins), labels=labels, precision=precision, include_lowest=True, @@ -389,9 +364,44 @@ def qcut( return _postprocess_for_cut(fac, bins, retbins, dtype, original) +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (x_idx.min(), x_idx.max()) + mn, mx = rng + + if np.isinf(mn) or np.isinf(mx): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) + + def _bins_to_cuts( - x, - bins: np.ndarray, + x: Index, + bins: Index, right: bool = True, labels=None, precision: int = 3, @@ -408,6 +418,8 @@ def _bins_to_cuts( "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) + result: Categorical | np.ndarray + if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) @@ -474,7 +486,7 @@ def _bins_to_cuts( return result, bins -def _coerce_to_type(x): +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -498,11 +510,13 @@ def _coerce_to_type(x): # https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): - x = x.to_numpy(dtype=np.float64, na_value=np.nan) + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) if dtype is not None: # GH 19768: force NaT to NaN during integer conversion - x = np.where(x.notna(), x.view(np.int64), np.nan) + x_arr = np.where(x.notna(), x.view(np.int64), np.nan) + x = Index(x_arr) return x, dtype @@ -564,7 +578,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): def _format_labels( - bins, + bins: Index, precision: int, right: bool = True, include_lowest: bool = False, @@ -597,7 +611,7 @@ def _format_labels( return IntervalIndex.from_breaks(breaks, closed=closed) -def _preprocess_for_cut(x): +def _preprocess_for_cut(x) -> Index: """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it @@ -611,7 +625,7 @@ def _preprocess_for_cut(x): if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x + return Index(x) def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): @@ -627,6 +641,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi return fac bins = _convert_bin_to_datelike_type(bins, dtype) + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values return fac, bins @@ -646,7 +662,7 @@ def _round_frac(x, precision: int): return np.around(x, digits) -def _infer_precision(base_precision: int, bins) -> int: +def _infer_precision(base_precision: int, bins: Index) -> int: """ Infer an appropriate precision for _round_frac """ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 915595833468d..3b2ae5daffdba 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,7 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.StringDtype(storage="pyarrow_numpy")}.get + return { + pa.string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + }.get diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 806d42381afc6..6491849925e86 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -17,9 +17,12 @@ On Windows, no additional modules are needed. On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli commands. (These commands should come with OS X.). -On Linux, install xclip or xsel via package manager. For example, in Debian: +On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via +package manager. +For example, in Debian: sudo apt-get install xclip sudo apt-get install xsel + sudo apt-get install wl-clipboard Otherwise on Linux, you will need the PyQt5 modules installed. @@ -28,12 +31,11 @@ Cygwin is currently not supported. Security Note: This module runs programs with these names: - - which - - where - pbcopy - pbpaste - xclip - xsel + - wl-copy/wl-paste - klipper - qdbus A malicious user could rename or add programs with these names, tricking @@ -41,7 +43,7 @@ """ -__version__ = "1.7.0" +__version__ = "1.8.2" import contextlib @@ -55,7 +57,7 @@ ) import os import platform -from shutil import which +from shutil import which as _executable_exists import subprocess import time import warnings @@ -74,25 +76,14 @@ EXCEPT_MSG = """ Pyperclip could not find a copy/paste mechanism for your system. For more information, please visit - https://pyperclip.readthedocs.io/en/latest/#not-implemented-error + https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error """ ENCODING = "utf-8" -# The "which" unix command finds where a command is. -if platform.system() == "Windows": - WHICH_CMD = "where" -else: - WHICH_CMD = "which" - -def _executable_exists(name): - return ( - subprocess.call( - [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - == 0 - ) +class PyperclipTimeoutException(PyperclipException): + pass def _stringifyText(text) -> str: @@ -229,6 +220,32 @@ def paste_xsel(primary=False): return copy_xsel, paste_xsel +def init_wl_clipboard(): + PRIMARY_SELECTION = "-p" + + def copy_wl(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. + args = ["wl-copy"] + if primary: + args.append(PRIMARY_SELECTION) + if not text: + args.append("--clear") + subprocess.check_call(args, close_fds=True) + else: + p = subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_wl(primary=False): + args = ["wl-paste", "-n"] + if primary: + args.append(PRIMARY_SELECTION) + p = subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True) + stdout, _stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_wl, paste_wl + + def init_klipper_clipboard(): def copy_klipper(text): text = _stringifyText(text) # Converts non-str values to str. @@ -534,7 +551,7 @@ def determine_clipboard(): return init_windows_clipboard() if platform.system() == "Linux": - if which("wslconfig.exe"): + if _executable_exists("wslconfig.exe"): return init_wsl_clipboard() # Setup for the macOS platform: @@ -549,6 +566,8 @@ def determine_clipboard(): # Setup for the LINUX platform: if HAS_DISPLAY: + if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"): + return init_wl_clipboard() if _executable_exists("xsel"): return init_xsel_clipboard() if _executable_exists("xclip"): @@ -602,6 +621,7 @@ def set_clipboard(clipboard): "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5' "xclip": init_xclip_clipboard, "xsel": init_xsel_clipboard, + "wl-clipboard": init_wl_clipboard, "klipper": init_klipper_clipboard, "windows": init_windows_clipboard, "no": init_no_clipboard, @@ -671,7 +691,56 @@ def is_available() -> bool: copy, paste = lazy_load_stub_copy, lazy_load_stub_paste -__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"] +def waitForPaste(timeout=None): + """This function call blocks until a non-empty text string exists on the + clipboard. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + while True: + clipboardText = paste() + if clipboardText != "": + return clipboardText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForPaste() timed out after " + str(timeout) + " seconds." + ) + + +def waitForNewPaste(timeout=None): + """This function call blocks until a new text string exists on the + clipboard that is different from the text that was there when the function + was first called. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + originalText = paste() + while True: + currentText = paste() + if currentText != originalText: + return currentText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForNewPaste() timed out after " + str(timeout) + " seconds." + ) + + +__all__ = [ + "copy", + "paste", + "waitForPaste", + "waitForNewPaste", + "set_clipboard", + "determine_clipboard", +] # pandas aliases clipboard_get = paste diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9ffbfb9f1149f..073115cab8695 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc from collections.abc import ( Hashable, Iterable, @@ -160,13 +159,15 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - "xlrd" supports old-style Excel files (.xls). - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -395,7 +396,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -434,7 +435,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -473,7 +474,7 @@ def read_excel( | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -549,7 +550,7 @@ def read_excel( _WorkbookT = TypeVar("_WorkbookT") -class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class BaseExcelReader(Generic[_WorkbookT]): book: _WorkbookT def __init__( @@ -589,13 +590,11 @@ def __init__( ) @property - @abc.abstractmethod def _workbook_class(self) -> type[_WorkbookT]: - pass + raise NotImplementedError - @abc.abstractmethod def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT: - pass + raise NotImplementedError def close(self) -> None: if hasattr(self, "book"): @@ -611,21 +610,17 @@ def close(self) -> None: self.handles.close() @property - @abc.abstractmethod def sheet_names(self) -> list[str]: - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_name(self, name: str): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_index(self, index: int): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_data(self, sheet, rows: int | None = None): - pass + raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: n_sheets = len(self.sheet_names) @@ -940,7 +935,7 @@ def parse( @doc(storage_options=_shared_docs["storage_options"]) -class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class ExcelWriter(Generic[_WorkbookT]): """ Class for writing DataFrame objects into excel sheets. @@ -1178,20 +1173,19 @@ def engine(self) -> str: return self._engine @property - @abc.abstractmethod def sheets(self) -> dict[str, Any]: """Mapping of sheet names to sheet objects.""" + raise NotImplementedError @property - @abc.abstractmethod def book(self) -> _WorkbookT: """ Book instance. Class type will depend on the engine used. This attribute can be used to access engine-specific features. """ + raise NotImplementedError - @abc.abstractmethod def _write_cells( self, cells, @@ -1214,12 +1208,13 @@ def _write_cells( freeze_panes: int tuple of length 2 contains the bottom-most row and right-most column to freeze """ + raise NotImplementedError - @abc.abstractmethod def _save(self) -> None: """ Save workbook to disk. """ + raise NotImplementedError def __init__( self, @@ -1463,13 +1458,15 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 @@ -1505,6 +1502,7 @@ class ExcelFile: ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1515,6 +1513,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py new file mode 100644 index 0000000000000..d61a9fc664164 --- /dev/null +++ b/pandas/io/excel/_calamine.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, + timedelta, +) +from typing import ( + TYPE_CHECKING, + Any, + Union, + cast, +) + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc + +import pandas as pd +from pandas.core.shared_docs import _shared_docs + +from pandas.io.excel._base import BaseExcelReader + +if TYPE_CHECKING: + from python_calamine import ( + CalamineSheet, + CalamineWorkbook, + ) + + from pandas._typing import ( + FilePath, + ReadBuffer, + StorageOptions, + ) + +_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta] + + +class CalamineReader(BaseExcelReader["CalamineWorkbook"]): + @doc(storage_options=_shared_docs["storage_options"]) + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions | None = None, + engine_kwargs: dict | None = None, + ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. + {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. + """ + import_optional_dependency("python_calamine") + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + + @property + def _workbook_class(self) -> type[CalamineWorkbook]: + from python_calamine import CalamineWorkbook + + return CalamineWorkbook + + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any + ) -> CalamineWorkbook: + from python_calamine import load_workbook + + return load_workbook( + filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] + ) + + @property + def sheet_names(self) -> list[str]: + from python_calamine import SheetTypeEnum + + return [ + sheet.name + for sheet in self.book.sheets_metadata + if sheet.typ == SheetTypeEnum.WorkSheet + ] + + def get_sheet_by_name(self, name: str) -> CalamineSheet: + self.raise_if_bad_sheet_by_name(name) + return self.book.get_sheet_by_name(name) + + def get_sheet_by_index(self, index: int) -> CalamineSheet: + self.raise_if_bad_sheet_by_index(index) + return self.book.get_sheet_by_index(index) + + def get_sheet_data( + self, sheet: CalamineSheet, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: _CellValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, timedelta): + return pd.Timedelta(value) + elif isinstance(value, time): + # cast needed here because Scalar doesn't include datetime.time + return cast(Scalar, value) + + return value + + rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 74cbe90acdae8..bc7dca2d95b6b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -192,7 +192,15 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: if isinstance(val, bool): value = str(val).lower() pvalue = str(val).upper() - if isinstance(val, datetime.datetime): + return ( + pvalue, + TableCell( + valuetype="boolean", + booleanvalue=value, + attributes=attributes, + ), + ) + elif isinstance(val, datetime.datetime): # Fast formatting value = val.isoformat() # Slow but locale-dependent @@ -210,17 +218,20 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) + elif isinstance(val, str): + return ( + pvalue, + TableCell( + valuetype="string", + stringvalue=value, + attributes=attributes, + ), + ) else: - class_to_cell_type = { - str: "string", - int: "float", - float: "float", - bool: "boolean", - } return ( pvalue, TableCell( - valuetype=class_to_cell_type[type(val)], + valuetype="float", value=value, attributes=attributes, ), diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 9970d465ced9d..b344d9849f16c 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -941,9 +941,7 @@ def write( if isinstance(writer, ExcelWriter): need_save = False else: - # error: Cannot instantiate abstract class 'ExcelWriter' with abstract - # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' - writer = ExcelWriter( # type: ignore[abstract] + writer = ExcelWriter( writer, engine=engine, storage_options=storage_options, diff --git a/pandas/io/html.py b/pandas/io/html.py index 10701be4f7e0b..68d30fe5ba681 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1033,7 +1033,7 @@ def read_html( io: FilePath | ReadBuffer[str], *, match: str | Pattern = ".+", - flavor: str | None = None, + flavor: str | Sequence[str] | None = None, header: int | Sequence[int] | None = None, index_col: int | Sequence[int] | None = None, skiprows: int | Sequence[int] | slice | None = None, @@ -1074,11 +1074,11 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str, optional - The parsing engine to use. 'bs4' and 'html5lib' are synonymous with - each other, they are both there for backwards compatibility. The - default of ``None`` tries to use ``lxml`` to parse and if that fails it - falls back on ``bs4`` + ``html5lib``. + flavor : str or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 833f4986b6da6..52ea072d1483f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -82,6 +82,7 @@ JSONEngine, JSONSerializable, ReadBuffer, + Self, StorageOptions, WriteBuffer, ) @@ -1056,7 +1057,7 @@ def close(self) -> None: if self.handles is not None: self.handles.close() - def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]: + def __iter__(self) -> Self: return self @overload @@ -1099,7 +1100,7 @@ def __next__(self) -> DataFrame | Series: else: return obj - def __enter__(self) -> JsonReader[FrameSeriesStrT]: + def __enter__(self) -> Self: return self def __exit__( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 6846ea2b196b8..43fb4ec3b55fc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,7 +13,6 @@ import csv from io import StringIO import re -import sys from typing import ( IO, TYPE_CHECKING, @@ -21,6 +20,7 @@ Literal, cast, ) +import warnings import numpy as np @@ -28,8 +28,10 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool_dtype, @@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None: if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) if self.on_bad_lines == self.BadLineHandleMethod.WARN: - base = f"Skipping line {row_num}: " - sys.stderr.write(base + msg + "\n") + warnings.warn( + f"Skipping line {row_num}: {msg}\n", + ParserWarning, + stacklevel=find_stack_level(), + ) def _next_iter_line(self, row_num: int) -> list[Scalar] | None: """ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 10d3ab230cb9d..e826aad478059 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -638,7 +638,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -697,7 +700,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -757,7 +763,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -817,7 +826,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -888,7 +900,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -983,7 +998,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1040,7 +1058,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1097,7 +1118,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1154,7 +1178,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1224,7 +1251,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1307,6 +1337,51 @@ def read_table( return _read(filepath_or_buffer, kwds) +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: bool = ..., + chunksize: int, + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds, +) -> DataFrame: + ... + + def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1314,6 +1389,8 @@ def read_fwf( widths: Sequence[int] | None = None, infer_nrows: int = 100, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + iterator: bool = False, + chunksize: int | None = None, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1412,6 +1489,8 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + kwds["iterator"] = iterator + kwds["chunksize"] = chunksize check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 7669d5aa4cea5..0788d9da06eb9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -138,7 +138,7 @@ def _parse_date_columns(data_frame, parse_dates): if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates: try: fmt = parse_dates[col_name] - except TypeError: + except (KeyError, TypeError): fmt = None data_frame.isetitem(i, _handle_date_column(df_col, format=fmt)) @@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str: adapt_date_iso = lambda val: val.isoformat() adapt_datetime_iso = lambda val: val.isoformat() - adapt_datetime_epoch = lambda val: int(val.timestamp()) sqlite3.register_adapter(time, _adapt_time) sqlite3.register_adapter(date, adapt_date_iso) sqlite3.register_adapter(datetime, adapt_datetime_iso) - sqlite3.register_adapter(datetime, adapt_datetime_epoch) convert_date = lambda val: date.fromisoformat(val.decode()) convert_datetime = lambda val: datetime.fromisoformat(val.decode()) diff --git a/pandas/meson.build b/pandas/meson.build index 1dc9955aa4ff6..f02258c98d46a 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -40,8 +40,9 @@ subdirs_list = [ 'util' ] foreach subdir: subdirs_list - install_subdir(subdir, install_dir: py.get_install_dir(pure: false) / 'pandas') + install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach + top_level_py_list = [ '__init__.py', '_typing.py', @@ -49,8 +50,4 @@ top_level_py_list = [ 'conftest.py', 'testing.py' ] -foreach file: top_level_py_list - py.install_sources(file, - pure: false, - subdir: 'pandas') -endforeach +py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..227b72573f979 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,13 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame): with np.errstate(all="ignore"): # ufunc @@ -38,8 +45,9 @@ def test_apply(float_frame): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) expected = float_frame + 1 tm.assert_frame_equal(result, expected) @@ -234,36 +242,42 @@ def test_apply_broadcast_series_lambda_func(int_frame_const_col): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame(float_frame, axis): +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 - float_frame.apply(_assert_raw, axis=axis, raw=True) + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame_lambda(float_frame, axis): - result = float_frame.apply(np.mean, axis=axis, raw=True) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) tm.assert_series_equal(result, expected) -def test_apply_raw_float_frame_no_reduction(float_frame): +def test_apply_raw_float_frame_no_reduction(float_frame, engine): # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) expected = float_frame * 2 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis): +def test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, raw=True) + mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -300,14 +314,20 @@ def test_apply_mixed_dtype_corner_indexing(): ) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_empty_infer_type(ax, func, raw, axis): +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): df = DataFrame(**{ax: ["a", "b", "c"]}) with np.errstate(all="ignore"): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - result = df.apply(func, axis=axis, raw=raw) + if engine == "numba" and raw is False: + mark = pytest.mark.xfail( + reason="numba engine only supports raw=True at the moment" + ) + request.node.add_marker(mark) + + result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) assert isinstance(result, Series) @@ -607,8 +627,10 @@ def non_reducing_function(row): assert names == list(df.index) -def test_apply_raw_function_runs_once(): +def test_apply_raw_function_runs_once(engine): # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") df = DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to @@ -623,7 +645,7 @@ def non_reducing_function(row): for func in [reducing_function, non_reducing_function]: del values[:] - df.apply(func, raw=True, axis=1) + df.apply(func, engine=engine, raw=True, axis=1) assert values == list(df.a.to_list()) @@ -1449,10 +1471,12 @@ def test_apply_no_suffix_index(): tm.assert_frame_equal(result, expected) -def test_apply_raw_returns_string(): +def test_apply_raw_returns_string(engine): # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") df = DataFrame({"A": ["aa", "bbb"]}) - result = df.apply(lambda x: x[0], axis=1, raw=True) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) @@ -1632,3 +1656,14 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) + + +def test_numba_unsupported(): + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + with pytest.raises( + ValueError, + match="The numba engine in DataFrame.apply can only be used when raw=True", + ): + df.apply(lambda x: x, engine="numba", raw=False) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2c96d874fb3d4..c0ff0a77f33a7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -377,8 +377,16 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "pyarrow_numpy": + err = ValueError + msg = "cannot convert float NaN to integer" + else: + err = TypeError + msg = ( + r"int\(\) argument must be a string, a bytes-like " + r"object or a( real)? number" + ) + with pytest.raises(err, match=msg): arr.astype("int64") diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 1ab628f186b47..c1d424f12bfc4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -12,7 +12,10 @@ StringArray, StringDtype, ) -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under7p0, @@ -166,6 +169,9 @@ def test_pyarrow_not_installed_raises(): with pytest.raises(ImportError, match=msg): ArrowStringArray([]) + with pytest.raises(ImportError, match=msg): + ArrowStringArrayNumpySemantics([]) + with pytest.raises(ImportError, match=msg): ArrowStringArray._from_sequence(["a", None, "b"]) @@ -235,9 +241,10 @@ def test_setitem_invalid_indexer_raises(): @skip_if_no_pyarrow -def test_pickle_roundtrip(): +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): # GH 42600 - expected = pd.Series(range(10), dtype="string[pyarrow]") + expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) sliced_pickled = pickle.dumps(expected_sliced) @@ -249,3 +256,11 @@ def test_pickle_roundtrip(): result_sliced = pickle.loads(sliced_pickled) tm.assert_series_equal(result_sliced, expected_sliced) + + +@skip_if_no_pyarrow +def test_string_dtype_error_message(): + # GH#55051 + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + with pytest.raises(ValueError, match=msg): + StringDtype("bla") diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 489f43729a004..5c21c4f7137a5 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -108,9 +108,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 13d80329f4d51..8968b9a7f25fe 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ import pytest from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, is_ci_environment, @@ -40,6 +41,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) from pandas.core.dtypes.dtypes import ( @@ -917,7 +919,7 @@ def _is_temporal_supported(self, opname, pa_dtype): or ( opname in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__") - and not pa_version_under13p0 + and not pa_version_under14p0 ) ) and pa.types.is_duration(pa_dtype) @@ -1595,6 +1597,19 @@ def test_to_numpy_null_array_no_dtype(): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_without_dtype(): + # GH 54808 + arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]") + result = arr.to_numpy(na_value=False) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]") + result = arr.to_numpy(na_value=0.0) + expected = np.array([1.0, 0.0], dtype=np.float32) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() @@ -2418,7 +2433,7 @@ def test_dt_tz(tz): dtype=ArrowDtype(pa.timestamp("ns", tz=tz)), ) result = ser.dt.tz - assert result == tz + assert result == timezones.maybe_get_tz(tz) def test_dt_isocalendar(): @@ -2979,6 +2994,15 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +def test_fixed_size_list(): + # GH#55000 + ser = pd.Series( + [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2)) + ) + result = ser.dtype.type + assert result == list + + def test_arrowextensiondtype_dataframe_repr(): # GH 54062 df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index 95fcaaa473067..e7901ed363106 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -56,7 +56,7 @@ def test_copy_consolidates(self): } ) - for i in range(0, 10): + for i in range(10): df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55) assert len(df._mgr.blocks) == 11 diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 1a2fbf8a65a55..9d5e6876bb08c 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -137,3 +137,17 @@ def test_filter_regex_non_string(self): result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index d0153da038a75..ede212ae18ae9 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method): index=["a", "b"] * 3, ) tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54481 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..56bdd2fc664cc 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -26,7 +26,7 @@ isna, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype class TestReindexSetIndex: @@ -1082,7 +1082,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(3, dtype="int64"), }, - index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("abc"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # reindexing @@ -1111,13 +1113,13 @@ def test_reindex_with_categoricalindex(self): result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1138,13 +1140,19 @@ def test_reindex_with_categoricalindex(self): # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + { + "A": [0, np.nan], + "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + { + "A": [0, np.nan], + "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1152,7 +1160,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index d99dd36f3a2e3..339e19254fd10 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -788,15 +788,15 @@ def test_errorreset_index_rename(float_frame): def test_reset_index_false_index_name(): - result_series = Series(data=range(5, 10), index=range(0, 5)) + result_series = Series(data=range(5, 10), index=range(5)) result_series.index.name = False result_series.reset_index() - expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_series_equal(result_series, expected_series) # GH 38147 - result_frame = DataFrame(data=range(5, 10), index=range(0, 5)) + result_frame = DataFrame(data=range(5, 10), index=range(5)) result_frame.index.name = False result_frame.reset_index() - expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 228b62a418813..985a9e3602410 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -911,7 +911,7 @@ def test_sort_index_multiindex_sparse_column(self): expected = DataFrame( { i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) - for i in range(0, 4) + for i in range(4) }, index=MultiIndex.from_product([[1, 2], [1, 2]]), ) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 7bb9518f9b0f9..61f0ad30b4519 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -166,6 +166,21 @@ def test_to_dict_not_unique_warning(self): with tm.assert_produces_warning(UserWarning): df.to_dict() + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + # GH#54824: This is to make sure that dataframes with non-unique column + # would have uniform behavior throughout different orients + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert result == expected + # orient - orient argument to to_dict function # item_getter - function for extracting value from # the resulting dict using column name and index diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 19fcddb5dbe2b..ed3fb079d745a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -692,12 +692,12 @@ def test_constructor_error_msgs(self): arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): @@ -2393,7 +2393,7 @@ def test_construct_with_two_categoricalindex_series(self): def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index c90b871d5d66f..b54a795af4fdc 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1767,7 +1767,9 @@ def test_unstack_bug(self, future_stack): } ) - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) @@ -2508,3 +2510,19 @@ def test_unstack_mixed_level_names(self): index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), ) tm.assert_frame_equal(result, expected) + + +def test_stack_tuple_columns(future_stack): + # GH#54948 - test stack when the input has a non-MultiIndex with tuples + df = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)] + ) + result = df.stack(future_stack=future_stack) + expected = Series( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=MultiIndex( + levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c01ca4922a84b..882f42ff18bdd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -515,6 +515,18 @@ def test_groupby_agg_dict_with_getitem(): tm.assert_frame_equal(result, expected) +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "op", [ diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 9d3ebbd3672ae..7ea107f254104 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -499,13 +499,17 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d04ee7cec0db1..abcb9f68e0f5c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -28,7 +28,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - df.groupby("index").apply(store) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) ) @@ -71,9 +73,11 @@ def test_apply_issues(): ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) @@ -179,7 +183,9 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -197,9 +203,11 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -219,8 +227,11 @@ def slow(group): def fast(group): return group.copy() - fast_df = df.groupby("A", group_keys=False).apply(fast) - slow_df = df.groupby("A", group_keys=False).apply(slow) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + fast_df = df.groupby("A", group_keys=False).apply(fast) + with tm.assert_produces_warning(FutureWarning, match=msg): + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -242,7 +253,9 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -285,8 +298,11 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res_as_apply = g_as.apply(lambda x: x.head(2)).index + with tm.assert_produces_warning(FutureWarning, match=msg): + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -299,7 +315,9 @@ def test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -328,13 +346,19 @@ def desc3(group): # weirdo return result - result = grouped.apply(desc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - result2 = grouped.apply(desc2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - result3 = grouped.apply(desc3) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -364,7 +388,9 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(["A", "B"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -375,7 +401,9 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -384,7 +412,9 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -407,7 +437,9 @@ def trans2(group): } ) - result = df.groupby("A").apply(trans) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -436,7 +468,9 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( @@ -457,7 +491,9 @@ def test_apply_no_name_column_conflict(): # it works! #2605 grouped = df.groupby(["name", "name2"]) - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -474,7 +510,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -498,7 +536,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -536,8 +576,11 @@ def filt2(x): else: return x[x.category == "c"] - expected = data.groupby("id_field").apply(filt1) - result = data.groupby("id_field").apply(filt2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = data.groupby("id_field").apply(filt1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -556,7 +599,9 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -601,7 +646,9 @@ def f(g): g["value3"] = g["value1"] * 2 return g - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert "value3" in result @@ -615,9 +662,13 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 @@ -628,7 +679,9 @@ def test_apply_numeric_coercion_when_datetime(): def get_B(g): return g.iloc[0][["B"]] - result = df.groupby("A").apply(get_B)["B"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -653,8 +706,11 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby("Key").apply(predictions).p1 - result = df2.groupby("Key").apply(predictions).p1 + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df1.groupby("Key").apply(predictions).p1 + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -669,11 +725,13 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + ) ) - ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -716,11 +774,15 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] ) @@ -764,7 +826,9 @@ def test_groupby_apply_all_none(): def test_func(x): pass - result = test_df.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -779,8 +843,11 @@ def test_func(x): return None return x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = test_df1.groupby("groups").apply(test_func) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) @@ -793,7 +860,9 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - result = groups.apply(lambda group: group[group.value != 1]["value"]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -820,7 +889,9 @@ def test_apply_with_mixed_types(): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - result = df.groupby("a").apply(lambda g: g.index) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -837,7 +908,9 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame( ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] @@ -876,7 +949,9 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - result = tdf.groupby("day").apply(most_common_values)["userId"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -917,7 +992,9 @@ def test_groupby_apply_datetime_result_dtypes(): ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], @@ -937,7 +1014,9 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -960,7 +1039,9 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - result = df.groupby("groups").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -972,7 +1053,9 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - result = df.groupby("A").apply(fct) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -983,7 +1066,9 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - result = df.groupby("id").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1019,7 +1104,9 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1034,8 +1121,11 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - result = df.groupby("A", group_keys=False).apply(lambda x: x) - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", group_keys=False).apply(lambda x: x) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1047,8 +1137,15 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) - result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df1.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) tm.assert_frame_equal(result1, result2) @@ -1103,7 +1200,9 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() @@ -1151,7 +1250,9 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1176,7 +1277,9 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1186,7 +1289,9 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1206,9 +1311,11 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1251,24 +1358,29 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } + ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) - ) expected = DataFrame( [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], columns=["a", "b", "c"], @@ -1293,9 +1405,11 @@ def test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1343,7 +1457,9 @@ def test_apply_index_key_error_bug(index_values): def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1390,33 +1506,16 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "group_col", - [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])], -) -def test_apply_inconsistent_output(group_col): - # GH 34478 - df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]}) - - result = df.groupby("group_col").value_col.apply( - lambda x: x.value_counts().reindex(index=[1, 2, 3]) - ) - expected = Series( - [np.nan, 3.0, np.nan], - name="value_col", - index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]), - ) - - tm.assert_series_equal(result, expected) - - -def test_apply_array_output_multi_getitem(): - # GH 18930 - df = DataFrame( - {"A": {"a": 1, "b": 2}, "B": {"a": 1, "b": 2}, "C": {"a": 1, "b": 2}} - ) - result = df.groupby("A")[["B", "C"]].apply(lambda x: np.array([0])) - expected = Series( - [np.array([0])] * 2, index=Index([1, 2], name="A"), name=("B", "C") - ) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("include_groups", [True, False]) +def test_include_groups(include_groups): + # GH#7155 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a") + warn = FutureWarning if include_groups else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + result = gb.apply(lambda x: x.sum(), include_groups=include_groups) + expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) + if not include_groups: + expected = expected[["b"]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 9bc07b584e9d1..09d5e06bf6ddd 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,10 +13,16 @@ def test_group_by_copy(): } ).set_index("name") - grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_same_value = df.groupby(["age"], group_keys=False).apply( + lambda group: group + ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -47,8 +53,11 @@ def f_no_copy(x): x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_copy = df.groupby("cat1").apply(f_copy) + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -58,8 +67,11 @@ def test_no_mutate_but_looks_like(): # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) @@ -73,7 +85,9 @@ def fn(x): x.loc[x.index[-1], "col2"] = 0 return x.col2 - result = df.groupby(["col1"], as_index=False).apply(fn) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f2d21c10f7a15..b11240c841420 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -124,7 +124,9 @@ def test_basic(): # TODO: split this test def f(x): return x.drop_duplicates("person_name").iloc[0] - result = g.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") @@ -329,7 +331,9 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - result = grouped.apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -2013,7 +2017,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + warn = FutureWarning if method == "apply" and index_kind == "range" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 25a4fd2550df6..16d7fe61b90ad 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,7 +289,9 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..287310a18c7df 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -95,10 +95,12 @@ def test_builtins_apply(keys, f): assert result.shape == (ngroups, 3), assert_msg npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c0ac94c09e1ea..fdd959f0e8754 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -150,7 +150,9 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): def max_value(group): return group.loc[group["value"].idxmax()] - applied = df.groupby("A").apply(max_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -171,7 +173,9 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - result = df.groupby("A").apply(f_0)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -179,9 +183,10 @@ def f_1(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_1)[["B"]] - # Cast to avoid upcast when setting nan below - e = expected.copy().astype("float64") + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_1)[["B"]] + e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -190,9 +195,10 @@ def f_2(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_2)[["B"]] - # Explicit cast to float to avoid implicit cast when setting nan - e = expected.copy().astype({"B": "float"}) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_2)[["B"]] + e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -202,7 +208,9 @@ def f_3(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_3)[["C"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -213,7 +221,9 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - result = df.groupby("A").apply(f_4) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None @@ -392,8 +402,11 @@ def f3(x): depr_msg = "The behavior of array concatenation with empty entries is deprecated" # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1322,11 +1335,15 @@ def summarize_random_name(df): # inconsistent. return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1619,7 +1636,9 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1693,7 +1712,9 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1714,7 +1735,9 @@ def f(group): names.append(group.name) return group.copy() - df.groupby("a", sort=False, group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1920,7 +1943,9 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - g.apply(test_sort) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -1928,7 +1953,7 @@ def test_pivot_table_values_key_error(): df = DataFrame( { "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(), - "thename": range(0, 20), + "thename": range(20), } ) @@ -2102,7 +2127,9 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - res = gb.apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() @@ -3187,3 +3214,34 @@ def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): else: expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a")) tm.assert_equal(result, expected) + + +def test_groupby_ngroup_with_nan(): + # GH#50100 + df = DataFrame({"a": Categorical([np.nan]), "b": [1]}) + result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup() + expected = Series([0]) + tm.assert_series_equal(result, expected) + + +def test_get_group_axis_1(): + # GH#54858 + df = DataFrame( + { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + "col3": [3, 8, 2, 10], + "col4": [1, 13, 6, 15], + "col5": [-4, 5, 6, -7], + } + ) + with tm.assert_produces_warning(FutureWarning, match="deprecated"): + grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) + result = grouped.get_group(1) + expected = DataFrame( + { + "col1": [0, 3, 2, 3], + "col5": [-4, 5, 6, -7], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 099e7bc3890d0..d82278c277d48 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -324,7 +324,9 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..601e67bbca5e3 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -63,7 +63,9 @@ def func(group): assert hasattr(group, "testattr") return group.testattr - result = custom_df.groupby("c").apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) @@ -101,5 +103,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e0793ada679c2..d05b60fd56b5f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -224,7 +224,9 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c9fe011f7063b..1a26559ef4447 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -470,8 +470,12 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -487,8 +491,11 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -842,7 +849,7 @@ def test_grouper_period_index(self): result = period_series.groupby(period_series.index.month).sum() expected = Series( - range(0, periods), index=Index(range(1, periods + 1), name=index.name) + range(periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) @@ -895,7 +902,9 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 # function that returns a Series - res = gb.apply(lambda x: x["Quantity"] * 2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x["Quantity"] * 2) expected = DataFrame( [[36, 6, 6, 10, 2]], diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 7c50124e57e29..944dda8977882 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -327,9 +327,12 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + warn = FutureWarning if groupby == "column" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 062dfe3931423..acb4b93ba1af3 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -636,7 +636,9 @@ def f(group): return group[:1] grouped = df.groupby("c") - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -790,7 +792,13 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - expected = gb.apply(targop) + if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): + warn = None + else: + warn = FutureWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 64cbe657a8aff..87facbf529411 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -228,6 +228,13 @@ def test_isin(self): expected = np.array([False] * 5 + [True]) tm.assert_numpy_array_equal(result, expected) + def test_isin_overlapping_intervals(self): + # GH 34974 + idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + result = CategoricalIndex(idx).isin(idx) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + def test_identical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 2e7b38abf4212..b56bad7f2e833 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -343,9 +343,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 47efc43d5eae0..66163dad3deae 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -31,7 +31,7 @@ def df(): dr = date_range("2016-01-01", "2016-01-03", freq="12H") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) - frame = DataFrame({"c1": range(0, 15)}, index=mi) + frame = DataFrame({"c1": range(15)}, index=mi) return frame diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index da9838d4a2ed3..06dbb33aadf97 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -169,6 +169,28 @@ def test_append_names_dont_match(): tm.assert_index_equal(result, expected) +def test_append_overlapping_interval_levels(): + # GH 54934 + ivl1 = pd.IntervalIndex.from_breaks([0.0, 1.0, 2.0]) + ivl2 = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5]) + mi1 = MultiIndex.from_product([ivl1, ivl1]) + mi2 = MultiIndex.from_product([ivl2, ivl2]) + result = mi1.append(mi2) + expected = MultiIndex.from_tuples( + [ + (pd.Interval(0.0, 1.0), pd.Interval(0.0, 1.0)), + (pd.Interval(0.0, 1.0), pd.Interval(1.0, 2.0)), + (pd.Interval(1.0, 2.0), pd.Interval(0.0, 1.0)), + (pd.Interval(1.0, 2.0), pd.Interval(1.0, 2.0)), + (pd.Interval(0.5, 1.5), pd.Interval(0.5, 1.5)), + (pd.Interval(0.5, 1.5), pd.Interval(1.5, 2.5)), + (pd.Interval(1.5, 2.5), pd.Interval(0.5, 1.5)), + (pd.Interval(1.5, 2.5), pd.Interval(1.5, 2.5)), + ] + ) + tm.assert_index_equal(result, expected) + + def test_repeat(): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 5f137df281fa3..132704434829e 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -10,9 +10,6 @@ ) import pandas._testing as tm -# aliases to make some tests easier to read -RI = RangeIndex - class TestRangeIndex: @pytest.fixture @@ -507,25 +504,31 @@ def test_len_specialised(self, step): @pytest.mark.parametrize( "indices, expected", [ - ([RI(1, 12, 5)], RI(1, 12, 5)), - ([RI(0, 6, 4)], RI(0, 6, 4)), - ([RI(1, 3), RI(3, 7)], RI(1, 7)), - ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), - ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), - ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), - ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), - ([RI(-4, -8), RI(3, -4)], RI(0, 0)), - ([RI(-4, -8), RI(3, 5)], RI(3, 5)), - ([RI(-4, -2), RI(3, 5)], Index([-4, -3, 3, 4])), - ([RI(-2), RI(3, 5)], RI(3, 5)), - ([RI(2), RI(2)], Index([0, 1, 0, 1])), - ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), - ([RI(2), RI(3, 5), RI(5, 8, 4)], Index([0, 1, 3, 4, 5])), - ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), - ([RI(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), - ([RI(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), - ([RI(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), - ([RI(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), + ([RangeIndex(1, 12, 5)], RangeIndex(1, 12, 5)), + ([RangeIndex(0, 6, 4)], RangeIndex(0, 6, 4)), + ([RangeIndex(1, 3), RangeIndex(3, 7)], RangeIndex(1, 7)), + ([RangeIndex(1, 5, 2), RangeIndex(5, 6)], RangeIndex(1, 6, 2)), + ([RangeIndex(1, 3, 2), RangeIndex(4, 7, 3)], RangeIndex(1, 7, 3)), + ([RangeIndex(-4, 3, 2), RangeIndex(4, 7, 2)], RangeIndex(-4, 7, 2)), + ([RangeIndex(-4, -8), RangeIndex(-8, -12)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, -4)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(-4, -2), RangeIndex(3, 5)], Index([-4, -3, 3, 4])), + ([RangeIndex(-2), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(2), RangeIndex(2)], Index([0, 1, 0, 1])), + ([RangeIndex(2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], RangeIndex(0, 6)), + ( + [RangeIndex(2), RangeIndex(3, 5), RangeIndex(5, 8, 4)], + Index([0, 1, 3, 4, 5]), + ), + ( + [RangeIndex(-2, 2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], + RangeIndex(-2, 6), + ), + ([RangeIndex(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), + ([RangeIndex(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), + ([RangeIndex(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), + ([RangeIndex(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), ], ) def test_append(self, indices, expected): @@ -567,7 +570,7 @@ def test_format_empty(self): assert empty_idx.format(name=True) == [""] @pytest.mark.parametrize( - "RI", + "ri", [ RangeIndex(0, -1, -1), RangeIndex(0, 1, 1), @@ -576,10 +579,10 @@ def test_format_empty(self): RangeIndex(-3, -5, -2), ], ) - def test_append_len_one(self, RI): + def test_append_len_one(self, ri): # GH39401 - result = RI.append([]) - tm.assert_index_equal(result, RI, exact=True) + result = ri.append([]) + tm.assert_index_equal(result, ri, exact=True) @pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])]) def test_isin_range(self, base): diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index cb6dce1e7ad80..6cdd6944e90ea 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -219,9 +219,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 9d11827e2923e..b86e233110e88 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -148,7 +148,7 @@ def test_frame_getitem_simple_key_error( def test_tuple_string_column_names(): # GH#50372 mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]) - df = DataFrame([range(0, 4), range(1, 5), range(2, 6)], columns=mi) + df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi) df["single_index"] = 0 df_flat = df.copy() diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index b45d197af332e..d3a6d4bf7cebf 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -16,7 +16,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT @pytest.fixture @@ -25,7 +24,9 @@ def df(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B" + ), ) @@ -35,13 +36,15 @@ def df2(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) class TestCategoricalIndex: def test_loc_scalar(self, df): - dtype = CDT(list("cab")) + dtype = CategoricalDtype(list("cab")) result = df.loc["a"] bidx = Series(list("aaa"), name="B").astype(dtype) assert bidx.dtype == dtype diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index f36fdf0d36ea9..7353b5ef76ba3 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -1,4 +1,4 @@ -from string import ascii_letters as letters +from string import ascii_letters import numpy as np import pytest @@ -24,9 +24,9 @@ def random_text(nobs=100): # Construct a DataFrame where each row is a random slice from 'letters' - idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2)) + idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2)) idxs.sort(axis=1) - strings = [letters[x[0] : x[1]] for x in idxs] + strings = [ascii_letters[x[0] : x[1]] for x in idxs] return DataFrame(strings, columns=["letters"]) diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 21d31ec8a7fb5..ecee58362f8a9 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -1,7 +1,12 @@ +from datetime import ( + date, + datetime, +) import re import pytest +import pandas as pd import pandas._testing as tm from pandas.io.excel import ExcelWriter @@ -47,3 +52,47 @@ def test_book_and_sheets_consistent(ext): table = odf.table.Table(name="test_name") writer.book.spreadsheet.addElement(table) assert writer.sheets == {"test_name": table} + + +@pytest.mark.parametrize( + ["value", "cell_value_type", "cell_value_attribute", "cell_value"], + argvalues=[ + (True, "boolean", "boolean-value", "true"), + ("test string", "string", "string-value", "test string"), + (1, "float", "value", "1"), + (1.5, "float", "value", "1.5"), + ( + datetime(2010, 10, 10, 10, 10, 10), + "date", + "date-value", + "2010-10-10T10:10:10", + ), + (date(2010, 10, 10), "date", "date-value", "2010-10-10"), + ], +) +def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): + # GH#54994 ODS: cell attributes should follow specification + # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 + from odf.namespaces import OFFICENS + from odf.table import ( + TableCell, + TableRow, + ) + + table_cell_name = TableCell().qname + + with tm.ensure_clean(ext) as f: + pd.DataFrame([[value]]).to_excel(f, header=False, index=False) + + with pd.ExcelFile(f) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 6db70c894f692..8dd9f96a05a90 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -54,6 +54,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -67,11 +68,11 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -160,9 +161,9 @@ def test_engine_kwargs(self, read_ext, engine): "ods": {"foo": "abcd"}, } - if read_ext[1:] in {"xls", "xlsb"}: + if engine in {"xlrd", "pyxlsb"}: msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") - elif read_ext[1:] == "ods": + elif engine == "odf": msg = re.escape(r"load() got an unexpected keyword argument 'foo'") else: msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") @@ -194,8 +195,8 @@ def test_usecols_int(self, read_ext): usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_list(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -218,8 +219,8 @@ def test_usecols_list(self, request, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -275,9 +276,9 @@ def test_usecols_str(self, request, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -298,8 +299,8 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -310,8 +311,8 @@ def test_read_excel_without_slicing(self, request, read_ext, df_ref): result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -398,20 +399,26 @@ def test_excel_stop_iterator(self, read_ext): expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": + def test_excel_cell_error_na(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/355 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine can't extract error from ods files") + ) + parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -431,8 +438,8 @@ def test_excel_table(self, request, read_ext, df_ref): ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": + def test_reader_special_dtypes(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -800,8 +807,8 @@ def test_date_conversion_overflow(self, request, engine, read_ext): result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, read_ext, engine, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -869,6 +876,11 @@ def test_corrupt_bytes_raises(self, engine): "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = "File is not a zip file" @@ -969,6 +981,14 @@ def test_reader_seconds(self, request, engine, read_ext): ) ) + # GH 55045 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="ODS file contains bad datetime (seconds as text)" + ) + ) + # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( { @@ -994,15 +1014,21 @@ def test_reader_seconds(self, request, engine, read_ext): actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/354 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Last test fails in calamine") + ) + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1088,10 +1114,10 @@ def test_read_excel_multiindex(self, request, read_ext): ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb (GH4679" @@ -1212,9 +1238,9 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1267,9 +1293,9 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1397,7 +1423,7 @@ def test_trailing_blanks(self, read_ext): def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1410,7 +1436,7 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1540,8 +1566,8 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1569,8 +1595,8 @@ def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1639,7 +1665,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) @@ -1691,7 +1717,7 @@ def test_engine_invalid_option(self, read_ext): def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1711,6 +1737,10 @@ def test_corrupt_files_closed(self, engine, read_ext): import xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt", encoding="utf-8") diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 73de2b068b699..6c3bf01cb1857 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -1,6 +1,6 @@ from io import StringIO import re -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import sys import textwrap @@ -452,9 +452,9 @@ def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 - M = len(uppercase) + M = len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) df = DataFrame( diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py index 02827ee25042a..29dd704f6efa9 100644 --- a/pandas/tests/io/formats/test_series_info.py +++ b/pandas/tests/io/formats/test_series_info.py @@ -1,5 +1,5 @@ from io import StringIO -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import textwrap import numpy as np @@ -165,9 +165,9 @@ def test_info_memory_usage_bug_on_multiindex(): # GH 14308 # memory usage introspection should not materialize .values N = 100 - M = len(uppercase) + M = len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) s = Series(np.random.default_rng(2).standard_normal(N * M), index=index) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ca3ce6ba34515..b3c2e67f7c318 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2044,7 +2044,7 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): ) if orient == "values": - expected.columns = list(range(0, 8)) + expected.columns = list(range(8)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 5bb7097770820..d5f8c5200c4a3 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1033,7 +1033,7 @@ def test_decode_floating_point(self, sign, float_number): def test_encode_big_set(self): s = set() - for x in range(0, 100000): + for x in range(100000): s.add(x) # Make sure no Exception is raised. diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 492b4d5ec058e..0c5a2e0d04e5a 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -15,6 +15,7 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import DataFrame @@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +def test_suppress_error_output(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert captured.err == "" - def test_error_bad_lines(all_parsers): # see gh-15925 @@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers): parser.read_csv(StringIO(data), on_bad_lines="error") -def test_warn_bad_lines(all_parsers, capsys): +def test_warn_bad_lines(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. @@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") -def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): +def test_on_bad_lines_warn_correct_formatting(all_parsers): # see gh-15925 parser = all_parsers data = """1,2 @@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 -Skipping line 4: expected 2 fields, saw 3 - -""" - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 -Skipping line 4: Expected 2 fields in line 4, saw 3 -""" - assert captured.err == warn diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 32a010b3aeb34..18eee01f87621 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -19,7 +19,10 @@ from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 -from pandas.errors import ParserError +from pandas.errors import ( + ParserError, + ParserWarning, +) import pandas.util._test_decorators as td from pandas import ( @@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only): tm.assert_frame_equal(result, expected) -def test_comment_whitespace_delimited(c_parser_only, capsys): +def test_comment_whitespace_delimited(c_parser_only): parser = c_parser_only test_input = """\ 1 2 @@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = parser.read_csv( - StringIO(test_input), - comment="#", - header=None, - delimiter="\\s+", - skiprows=0, - on_bad_lines="warn", - ) - captured = capsys.readouterr() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert f"Skipping line {line_num}" in captured.err + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + on_bad_lines="warn", + ) expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 6cadff511d95c..efab9a049a83c 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -662,3 +662,29 @@ def test_header_missing_rows(all_parsers): msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=[0, 1, 2]) + + +@skip_pyarrow +def test_header_multiple_whitespaces(all_parsers): + # GH#54931 + parser = all_parsers + data = """aa bb(1,1) cc(1,1) + 0 2 3.5""" + + result = parser.read_csv(StringIO(data), sep=r"\s+") + expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_header_delim_whitespace(all_parsers): + # GH#54918 + parser = all_parsers + data = """a,b +1,2 +3,4 + """ + + result = parser.read_csv(StringIO(data), delim_whitespace=True) + expected = DataFrame({"a,b": ["1,2", "3,4"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 959b988e208c1..dbd474c6ae0b9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting): parser.read_csv(StringIO(data), quoting=quoting, **kwargs) -def test_none_delimiter(python_parser_only, capsys): +def test_none_delimiter(python_parser_only): # see gh-13374 and gh-17465 parser = python_parser_only data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" @@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. - result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line 3", check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), header=0, sep=None, on_bad_lines="warn" + ) tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index f150ed3903443..e2d785a38eb51 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,6 +12,7 @@ import pandas._libs.parsers as parser from pandas._libs.parsers import TextReader +from pandas.errors import ParserWarning from pandas import DataFrame import pandas._testing as tm @@ -125,7 +126,7 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) - def test_skip_bad_lines(self, capsys): + def test_skip_bad_lines(self): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" @@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys): } assert_array_dicts_equal(result, expected) - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn - ) - reader.read() - captured = capsys.readouterr() - - assert "Skipping line 4" in captured.err - assert "Skipping line 6" in captured.err + with tm.assert_produces_warning(ParserWarning, match="Skipping line"): + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + ) + reader.read() def test_header_not_enough_lines(self): data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 408348f555f58..1c280f98aee0a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1015,7 +1015,7 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") - df = pd.DataFrame({"a": list(range(0, 3))}) + df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: df.to_parquet(path, engine=pa) result = read_parquet( @@ -1142,6 +1142,25 @@ def test_roundtrip_decimal(self, tmp_path, pa): expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") tm.assert_frame_equal(result, expected) + def test_infer_string_large_string_type(self, tmp_path, pa): + # GH#54798 + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "large_string.p" + + table = pa.table({"a": pa.array([None, "b", "c"], pa.large_string())}) + pq.write_table(table, path) + + with pd.option_context("future.infer_string", True): + result = read_parquet(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): @@ -1203,7 +1222,7 @@ def test_categorical(self, fp): check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {"a": list(range(0, 3))} + d = {"a": list(range(3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9ec0ba0b12a76..1abe0ad55a864 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -413,6 +413,8 @@ def mysql_pymysql_engine(iris_path, types_data): for entry in types_data: entry.pop("DateColWithTz") create_and_load_types(engine, types_data, "mysql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine with engine.connect() as conn: with conn.begin(): @@ -422,7 +424,7 @@ def mysql_pymysql_engine(iris_path, types_data): @pytest.fixture -def mysql_pymysql_conn(mysql_pymysql_engine): +def mysql_pymysql_conn(iris_path, mysql_pymysql_engine): with mysql_pymysql_engine.connect() as conn: yield conn @@ -440,6 +442,8 @@ def postgresql_psycopg2_engine(iris_path, types_data): create_and_load_iris(engine, iris_path, "postgresql") if not insp.has_table("types"): create_and_load_types(engine, types_data, "postgresql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine with engine.connect() as conn: with conn.begin(): @@ -462,9 +466,20 @@ def sqlite_str(): @pytest.fixture -def sqlite_engine(sqlite_str): +def sqlite_engine(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool) + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") + yield engine engine.dispose() @@ -476,17 +491,25 @@ def sqlite_conn(sqlite_engine): @pytest.fixture -def sqlite_iris_str(sqlite_str, iris_path): +def sqlite_iris_str(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str) - create_and_load_iris(engine, iris_path, "sqlite") + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") engine.dispose() return sqlite_str @pytest.fixture def sqlite_iris_engine(sqlite_engine, iris_path): - create_and_load_iris(sqlite_engine, iris_path, "sqlite") return sqlite_engine @@ -499,6 +522,7 @@ def sqlite_iris_conn(sqlite_iris_engine): @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: + create_and_load_iris_view(closing_conn) with closing_conn as conn: yield conn @@ -1097,6 +1121,7 @@ class PandasSQLTest: """ def load_iris_data(self, iris_path): + self.drop_view("iris_view", self.conn) self.drop_table("iris", self.conn) if isinstance(self.conn, sqlite3.Connection): create_and_load_iris_sqlite3(self.conn, iris_path) @@ -1116,18 +1141,21 @@ def load_types_data(self, types_data): def _read_sql_iris_parameter(self, sql_strings): query = sql_strings["read_parameters"][self.flavor] params = ("Iris-setosa", 5.1) - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_named_parameter(self, sql_strings): query = sql_strings["read_named_parameters"][self.flavor] params = {"name": "Iris-setosa", "length": 5.1} - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_no_parameter_with_percent(self, sql_strings): query = sql_strings["read_no_parameters_with_percent"][self.flavor] - iris_frame = self.pandasSQL.read_query(query, params=None) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=None) check_iris_frame(iris_frame) def _to_sql_empty(self, test_frame1): @@ -1157,7 +1185,8 @@ def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): def _roundtrip(self, test_frame1): self.drop_table("test_frame_roundtrip", self.conn) assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + with self.pandasSQL.run_transaction(): + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -1207,13 +1236,14 @@ class DummyException(Exception): except DummyException: # ignore raised exception pass - res = self.pandasSQL.read_query("SELECT * FROM test_trans") + with self.pandasSQL.run_transaction(): + res = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res) == 0 # Make sure when transaction is committed, rows do get inserted with self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res2) == 1 @@ -1221,470 +1251,695 @@ class DummyException(Exception): # -- Testing the public API -class _TestSQLApi(PandasSQLTest): - """ - Base class to test the public API. +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_view(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn) + check_iris_frame(iris_frame) - From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode - (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific - tests for the different sql flavours are included in `_TestSQLAlchemy`. - Notes: - flavor can always be passed even in SQLAlchemy mode, - should be correctly ignored. +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_with_chunksize_no_result(conn, request): + conn = request.getfixturevalue(conn) + query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' + with_batch = sql.read_sql_query(query, conn, chunksize=5) + without_batch = sql.read_sql_query(query, conn) + tm.assert_frame_equal(concat(with_batch), without_batch) - we don't use drop_table because that isn't part of the public api - """ +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame1", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame1") - flavor = "sqlite" - mode: str + sql.to_sql(test_frame1, "test_frame1", conn) + assert sql.has_table("test_frame1", conn) - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.load_test_data_and_sql() - def load_test_data_and_sql(self): - create_and_load_iris_view(self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_fail(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame2", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame2") - def test_read_sql_view(self): - iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) - check_iris_frame(iris_frame) + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") + assert sql.has_table("test_frame2", conn) - def test_read_sql_with_chunksize_no_result(self): - query = "SELECT * FROM iris_view WHERE SepalLength < 0.0" - with_batch = sql.read_sql_query(query, self.conn, chunksize=5) - without_batch = sql.read_sql_query(query, self.conn) - tm.assert_frame_equal(concat(with_batch), without_batch) + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") - def test_to_sql(self, test_frame1): - sql.to_sql(test_frame1, "test_frame1", self.conn) - assert sql.has_table("test_frame1", self.conn) - def test_to_sql_fail(self, test_frame1): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - assert sql.has_table("test_frame2", self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_replace(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame3", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame3") - msg = "Table 'test_frame2' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail") + # Add to table again + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace") + assert sql.has_table("test_frame3", conn) - def test_to_sql_replace(self, test_frame1): - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail") - # Add to table again - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace") - assert sql.has_table("test_frame3", self.conn) + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame3") - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame3") + assert num_rows == num_entries - assert num_rows == num_entries - def test_to_sql_append(self, test_frame1): - assert sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") == 4 +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_append(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame4", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame4") - # Add to table again - assert ( - sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") == 4 - ) - assert sql.has_table("test_frame4", self.conn) + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4 - num_entries = 2 * len(test_frame1) - num_rows = count_rows(self.conn, "test_frame4") + # Add to table again + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4 + assert sql.has_table("test_frame4", conn) - assert num_rows == num_entries + num_entries = 2 * len(test_frame1) + num_rows = count_rows(conn, "test_frame4") - def test_to_sql_type_mapping(self, test_frame3): - sql.to_sql(test_frame3, "test_frame5", self.conn, index=False) - result = sql.read_sql("SELECT * FROM test_frame5", self.conn) + assert num_rows == num_entries - tm.assert_frame_equal(test_frame3, result) - def test_to_sql_series(self): - s = Series(np.arange(5, dtype="int64"), name="series") - sql.to_sql(s, "test_series", self.conn, index=False) - s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) - tm.assert_frame_equal(s.to_frame(), s2) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_type_mapping(conn, request, test_frame3): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame5", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame5") - def test_roundtrip(self, test_frame1): - sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) + sql.to_sql(test_frame3, "test_frame5", conn, index=False) + result = sql.read_sql("SELECT * FROM test_frame5", conn) - # HACK! - result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None - tm.assert_frame_equal(result, test_frame1) + tm.assert_frame_equal(test_frame3, result) - def test_roundtrip_chunksize(self, test_frame1): - sql.to_sql( - test_frame1, - "test_frame_roundtrip", - con=self.conn, - index=False, - chunksize=2, - ) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - tm.assert_frame_equal(result, test_frame1) - def test_execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - with sql.pandasSQL_builder(self.conn) as pandas_sql: - iris_results = pandas_sql.execute("SELECT * FROM iris") +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_series(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_series", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_series") + + s = Series(np.arange(5, dtype="int64"), name="series") + sql.to_sql(s, "test_series", conn, index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", conn) + tm.assert_frame_equal(s.to_frame(), s2) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + + # HACK! + result.index = test_frame1.index + result.set_index("level_0", inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, test_frame1) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip_chunksize(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql( + test_frame1, + "test_frame_roundtrip", + con=conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + tm.assert_frame_equal(result, test_frame1) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_execute_sql(conn, request): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + conn = request.getfixturevalue(conn) + with sql.pandasSQL_builder(conn) as pandas_sql: + iris_results = pandas_sql.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def test_date_parsing(self): - # Test date parsing in read_sql - # No Parsing - df = sql.read_sql_query("SELECT * FROM types", self.conn) + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_date_parsing(conn, request): + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") + + conn = request.getfixturevalue(conn) + # Test date parsing in read_sql + # No Parsing + df = sql.read_sql_query("SELECT * FROM types", conn) + if not ("mysql" in conn_name or "postgres" in conn_name): assert not issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["DateCol"] - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["IntDateCol"] - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates={"IntDateCol": "s"} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + df = sql.read_sql_query( + "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - df = sql.read_sql_query( + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + Timestamp("2010-10-10"), + Timestamp("2010-12-12"), + ] + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) +@pytest.mark.parametrize( + "read_sql, text, mode", + [ + (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), + (sql.read_sql, "types", ("sqlalchemy")), + ( + sql.read_sql_query, "SELECT * FROM types", - self.conn, - parse_dates={"IntDateOnlyCol": "%Y%m%d"}, - ) - assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) - assert df.IntDateOnlyCol.tolist() == [ - Timestamp("2010-10-10"), - Timestamp("2010-12-12"), - ] + ("sqlalchemy", "fallback"), + ), + (sql.read_sql_table, "types", ("sqlalchemy")), + ], +) +def test_api_custom_dateparsing_error( + conn, request, read_sql, text, mode, error, types_data_frame +): + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) - @pytest.mark.parametrize( - "read_sql, text, mode", - [ - (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), - (sql.read_sql, "types", ("sqlalchemy")), - ( - sql.read_sql_query, - "SELECT * FROM types", - ("sqlalchemy", "fallback"), - ), - (sql.read_sql_table, "types", ("sqlalchemy")), - ], + conn = request.getfixturevalue(conn) + + expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + + result = read_sql( + text, + con=conn, + parse_dates={ + "DateCol": {"errors": error}, + }, ) - def test_custom_dateparsing_error( - self, read_sql, text, mode, error, types_data_frame - ): - if self.mode in mode: - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + if "postgres" in conn_name: + # TODO: clean up types_data_frame fixture + result = result.drop(columns=["DateColWithTz"]) + result["BoolCol"] = result["BoolCol"].astype(int) + result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) - result = read_sql( - text, - con=self.conn, - parse_dates={ - "DateCol": {"errors": error}, - }, - ) + tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, expected) - def test_date_and_index(self): - # Test case where same column appears in parse_date and index_col +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_date_and_index(conn, request): + # Test case where same column appears in parse_date and index_col + conn_name = conn + if conn_name in {"sqlite_buildin", "sqlite_str"}: + pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - index_col="DateCol", - parse_dates=["DateCol", "IntDateCol"], - ) + conn = request.getfixturevalue(conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) - assert issubclass(df.index.dtype.type, np.datetime64) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert issubclass(df.index.dtype.type, np.datetime64) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - def test_timedelta(self): - # see #6921 - df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): - result_count = df.to_sql(name="test_timedelta", con=self.conn) - assert result_count == 2 - result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) - - def test_complex_raises(self): - df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" - with pytest.raises(ValueError, match=msg): - assert df.to_sql("test_complex", con=self.conn) is None - @pytest.mark.parametrize( - "index_name,index_label,expected", - [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ], - ) - def test_to_sql_index_label(self, index_name, index_label, expected): - temp_frame = DataFrame({"col1": range(4)}) - temp_frame.index.name = index_name - query = "SELECT * FROM test_index_label" - sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) - frame = sql.read_sql_query(query, self.conn) - assert frame.columns[0] == expected - - def test_to_sql_index_label_multiindex(self): - expected_row_count = 4 - temp_frame = DataFrame( - {"col1": range(4)}, - index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), - ) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_timedelta(conn, request): + # see #6921 + conn = request.getfixturevalue(conn) + if sql.has_table("test_timedelta", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_timedelta") - # no index name, defaults to 'level_0' and 'level_1' - result = sql.to_sql(temp_frame, "test_index_label", self.conn) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[0] == "level_0" - assert frame.columns[1] == "level_1" + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + with tm.assert_produces_warning(UserWarning): + result_count = df.to_sql(name="test_timedelta", con=conn) + assert result_count == 2 + result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) + tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_complex_raises(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame({"a": [1 + 1j, 2j]}) + msg = "Complex datatypes not supported" + with pytest.raises(ValueError, match=msg): + assert df.to_sql("test_complex", con=conn) is None - # specifying index_label - result = sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label=["A", "B"], - ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), # using the index name - temp_frame.index.names = ["A", "B"] - result = sql.to_sql( - temp_frame, "test_index_label", self.conn, if_exists="replace" + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], +) +def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + temp_frame = DataFrame({"col1": range(4)}) + temp_frame.index.name = index_name + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label) + frame = sql.read_sql_query(query, conn) + assert frame.columns[0] == expected + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_index_label_multiindex(conn, request): + conn_name = conn + if "mysql" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="MySQL can fail using TEXT without length as key") ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] - # has index name, but specifying index_label - result = sql.to_sql( + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + expected_row_count = 4 + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) + + # no index name, defaults to 'level_0' and 'level_1' + result = sql.to_sql(temp_frame, "test_index_label", conn) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" + + # specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["A", "B"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # using the index name + temp_frame.index.names = ["A", "B"] + result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace") + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # has index name, but specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["C", "D"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["C", "D"] + + msg = "Length of 'index_label' should match number of levels, which is 2" + with pytest.raises(ValueError, match=msg): + sql.to_sql( temp_frame, "test_index_label", - self.conn, + conn, if_exists="replace", - index_label=["C", "D"], + index_label="C", ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["C", "D"] - msg = "Length of 'index_label' should match number of levels, which is 2" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label="C", - ) - def test_multiindex_roundtrip(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], - columns=["A", "B", "C"], - index=["A", "B"], - ) - - df.to_sql(name="test_multiindex_roundtrip", con=self.conn) - result = sql.read_sql_query( - "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] - ) - tm.assert_frame_equal(df, result, check_index_type=True) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_multiindex_roundtrip(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_multiindex_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_multiindex_roundtrip") + + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) - @pytest.mark.parametrize( - "dtype", - [ - None, - int, - float, - {"A": int, "B": float}, - ], + df.to_sql(name="test_multiindex_roundtrip", con=conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"] ) - def test_dtype_argument(self, dtype): - # GH10285 Add dtype argument to read_sql_query - df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) - assert df.to_sql(name="test_dtype_argument", con=self.conn) == 2 - - expected = df.astype(dtype) - result = sql.read_sql_query( - "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype - ) + tm.assert_frame_equal(df, result, check_index_type=True) - tm.assert_frame_equal(result, expected) - def test_integer_col_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "dtype", + [ + None, + int, + float, + {"A": int, "B": float}, + ], +) +def test_api_dtype_argument(conn, request, dtype): + # GH10285 Add dtype argument to read_sql_query + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_dtype_argument", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_dtype_argument") - def test_get_schema(self, test_frame1): - create_sql = sql.get_schema(test_frame1, "test", con=self.conn) - assert "CREATE" in create_sql + df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) + assert df.to_sql(name="test_dtype_argument", con=conn) == 2 - def test_get_schema_with_schema(self, test_frame1): - # GH28486 - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi") - assert "CREATE TABLE pypi." in create_sql + expected = df.astype(dtype) - def test_get_schema_dtypes(self): - if self.mode == "sqlalchemy": - from sqlalchemy import Integer + if "postgres" in conn_name: + query = 'SELECT "A", "B" FROM test_dtype_argument' + else: + query = "SELECT A, B FROM test_dtype_argument" + result = sql.read_sql_query(query, con=conn, dtype=dtype) - dtype = Integer - else: - dtype = "INTEGER" + tm.assert_frame_equal(result, expected) - float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - create_sql = sql.get_schema( - float_frame, "test", con=self.conn, dtype={"b": dtype} - ) - assert "CREATE" in create_sql - assert "INTEGER" in create_sql - def test_get_schema_keys(self, test_frame1): - frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) - create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_integer_col_names(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn) + assert "CREATE" in create_sql + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_with_schema(conn, request, test_frame1): + # GH28486 + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") + assert "CREATE TABLE pypi." in create_sql + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_dtypes(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) + + if conn_name == "sqlite_buildin": + dtype = "INTEGER" + else: + from sqlalchemy import Integer + + dtype = Integer + create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype}) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_keys(conn, request, test_frame1): + conn_name = conn + conn = request.getfixturevalue(conn) + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1") + + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - # multiple columns as key (GH10385) - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"]) + # multiple columns as key (GH10385) + create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"]) + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - def test_chunksize_read(self): - df = DataFrame( - np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") - ) - df.to_sql(name="test_chunksize", con=self.conn, index=False) - # reading the query in one time - res1 = sql.read_sql_query("select * from test_chunksize", self.conn) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_chunksize_read(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_chunksize", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_chunksize") + + df = DataFrame( + np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") + ) + df.to_sql(name="test_chunksize", con=conn, index=False) + + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", conn) + + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5): + res2 = concat([res2, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 + + tm.assert_frame_equal(res1, res2) - # reading the query in chunks with read_sql_query - res2 = DataFrame() + # reading the query in chunks with read_sql_query + if conn_name == "sqlite_buildin": + with pytest.raises(NotImplementedError, match=""): + sql.read_sql_table("test_chunksize", conn, chunksize=5) + else: + res3 = DataFrame() i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_query( - "select * from test_chunksize", self.conn, chunksize=5 - ): - res2 = concat([res2, chunk], ignore_index=True) + for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 - tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res1, res3) - # reading the query in chunks with read_sql_query - if self.mode == "sqlalchemy": - res3 = DataFrame() - i = 0 - sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): - res3 = concat([res3, chunk], ignore_index=True) - assert len(chunk) == sizes[i] - i += 1 +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_categorical(conn, request): + # GH8624 + # test that categorical gets written correctly as dense column + conn = request.getfixturevalue(conn) + if sql.has_table("test_categorical", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_categorical") - tm.assert_frame_equal(res1, res3) + df = DataFrame( + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"], + } + ) + df2 = df.copy() + df2["person_name"] = df2["person_name"].astype("category") - def test_categorical(self): - # GH8624 - # test that categorical gets written correctly as dense column - df = DataFrame( - { - "person_id": [1, 2, 3], - "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"], - } - ) - df2 = df.copy() - df2["person_name"] = df2["person_name"].astype("category") + df2.to_sql(name="test_categorical", con=conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", conn) - df2.to_sql(name="test_categorical", con=self.conn, index=False) - res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) + tm.assert_frame_equal(res, df) - tm.assert_frame_equal(res, df) - def test_unicode_column_name(self): - # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) - df.to_sql(name="test_unicode", con=self.conn, index=False) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_unicode_column_name(conn, request): + # GH 11431 + conn = request.getfixturevalue(conn) + if sql.has_table("test_unicode", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_unicode") - def test_escaped_table_name(self): - # GH 13206 - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=self.conn, index=False) + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql(name="test_unicode", con=conn, index=False) - res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) - tm.assert_frame_equal(res, df) +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_escaped_table_name(conn, request): + # GH 13206 + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("d1187b08-4943-4c8d-a7f6", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6") - def test_read_sql_duplicate_columns(self): - # GH#53117 - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) - df.to_sql(name="test_table", con=self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False) - result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn) - expected = DataFrame( - [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], - columns=["a", "b", "a", "c"], - ) - tm.assert_frame_equal(result, expected) + if "postgres" in conn_name: + query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"' + else: + query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`" + res = sql.read_sql_query(query, conn) + + tm.assert_frame_equal(res, df) + + +@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_api_read_sql_duplicate_columns(conn, request): + # GH#53117 + conn = request.getfixturevalue(conn) + if sql.has_table("test_table", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_table") + + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) + df.to_sql(name="test_table", con=conn, index=False) + + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn) + expected = DataFrame( + [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], + columns=["a", "b", "a", "c"], + ) + tm.assert_frame_equal(result, expected) + + +class _TestSQLApi(PandasSQLTest): + """ + Base class to test the public API. + + From this two classes are derived to run these tests for both the + sqlalchemy mode (`TestSQLApi`) and the fallback mode + (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific + tests for the different sql flavours are included in `_TestSQLAlchemy`. + + Notes: + flavor can always be passed even in SQLAlchemy mode, + should be correctly ignored. + + we don't use drop_table because that isn't part of the public api + + """ + + flavor = "sqlite" + mode: str + + @pytest.fixture(autouse=True) + def setup_method(self, iris_path, types_data): + self.conn = self.connect() + self.load_iris_data(iris_path) + self.load_types_data(types_data) + self.load_test_data_and_sql() + + def load_test_data_and_sql(self): + create_and_load_iris_view(self.conn) @pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") @@ -2962,6 +3217,13 @@ def test_read_sql_string_inference(self): tm.assert_frame_equal(result, expected) + def test_roundtripping_datetimes(self): + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", self.conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", self.conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 7459aa1df8f3e..cd504616b6c5d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -798,7 +798,7 @@ def test_missing_value_generator(self): expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] - for i in range(0, 27): + for i in range(27): val = StataMissingValue(offset + 1 + i) assert val.string == expected_values[i] diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 66ecb93385a87..a955fa0b096f0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1077,8 +1077,12 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - result = df.groupby("ID").resample("5min").sum() - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("ID").resample("5min").sum() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1097,7 +1101,9 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1823,8 +1829,12 @@ def f(data, add_arg): # Testing dataframe df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 1b20a7b99d1d7..f331851596317 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -77,7 +77,9 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 6f4f1154907dc..d47a8132f26bb 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -68,8 +68,12 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - expected = df.groupby("id").apply(f_0) - result = df.set_index("date").groupby("id").resample("D").asfreq() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("id").apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -83,8 +87,12 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - expected = df.groupby("group").apply(f_1) - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("group").apply(f_1) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -99,7 +107,9 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - result = g.resample("2s").mean().B + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -230,8 +240,12 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -248,8 +262,12 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)(ddof=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -258,18 +276,24 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - expected = g.resample("2s").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - result = r.apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - result = g.apply(f_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -337,7 +361,9 @@ def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - result = df.groupby("col0").resample("1W", label="left").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -357,7 +383,9 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - result = df.groupby("A").resample("2s").mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -455,7 +483,9 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) @@ -478,7 +508,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - result = df.groupby(["key"]).resample("W", on="date").min() + with tm.assert_produces_warning(FutureWarning): + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -530,7 +561,9 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) expected = expected.set_index("date", append=True, drop=True) @@ -577,7 +610,9 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="H", periods=12), ) - result = df.groupby("A").resample("D").size() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").size() expected = Series( 3, index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index d7fdbc4fe5f08..8b1eab552c97d 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -323,12 +323,14 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") - ) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) expected_ind = pd.MultiIndex.from_tuples( [ diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 3efcd930af581..5dde863f246d1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category(): ) expected = expected.set_index(["c1", "c2"]) tm.assert_frame_equal(result, expected) + + +def test_concat_ea_upcast(): + # GH#54848 + df1 = DataFrame(["a"], dtype="string") + df2 = DataFrame([1], dtype="Int64") + result = concat([df1, df2]) + expected = DataFrame(["a", 1], index=[0, 0]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 2f50a19189987..12d28c388d508 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -77,23 +77,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 15:00:00+00:00", - "2010-12-31 16:00:00+00:00", - "2010-12-31 17:00:00+00:00", "2010-12-31 23:00:00+00:00", "2011-01-01 00:00:00+00:00", "2011-01-01 01:00:00+00:00", + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", ] ) expected = DataFrame( [ - [np.nan, 1], - [np.nan, 2], - [np.nan, 3], [1, np.nan], [2, np.nan], [3, np.nan], + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], ], index=exp_idx, columns=["a", "b"], diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 02d7e2059e8e1..d889ae2e4806b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -26,7 +26,6 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import ( MergeError, @@ -582,11 +581,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): df_empty = df[:0] expected = DataFrame( { - "value_x": Series(dtype=df.dtypes["value"]), "key": Series(dtype=df.dtypes["key"]), + "value_x": Series(dtype=df.dtypes["value"]), "value_y": Series(dtype=df.dtypes["value"]), }, - columns=["value_x", "key", "value_y"], + columns=["key", "value_x", "value_y"], ) actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) @@ -889,13 +888,13 @@ def test_merge_on_datetime64tz_empty(self): result = left.merge(right, on="date") expected = DataFrame( { + "date": Series(dtype=dtz), "value_x": Series(dtype=float), "date2_x": Series(dtype=dtz), - "date": Series(dtype=dtz), "value_y": Series(dtype=float), "date2_y": Series(dtype=dtz), }, - columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + columns=["date", "value_x", "date2_x", "value_y", "date2_y"], ) tm.assert_frame_equal(result, expected) @@ -1827,11 +1826,9 @@ def test_merge_empty(self, left_empty, how, exp): if exp == "left": expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) elif exp == "right": - expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]}) elif exp == "empty": expected = DataFrame(columns=["A", "B", "C"], dtype="int64") - if left_empty: - expected = expected[["B", "A", "C"]] elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") @@ -1844,7 +1841,7 @@ def left(): { "X": Series( np.random.default_rng(2).choice(["foo", "bar"], size=(10,)) - ).astype(CDT(["foo", "bar"])), + ).astype(CategoricalDtype(["foo", "bar"])), "Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)), } ) @@ -1853,7 +1850,10 @@ def left(): @pytest.fixture def right(): return DataFrame( - {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + { + "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } ) @@ -2004,8 +2004,8 @@ def test_other_columns(self, left, right): "change", [ lambda x: x, - lambda x: x.astype(CDT(["foo", "bar", "bah"])), - lambda x: x.astype(CDT(ordered=True)), + lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])), + lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) def test_dtype_on_merged_different(self, change, join_type, left, right): @@ -2112,11 +2112,13 @@ def test_merging_with_bool_or_int_cateorical_column( # GH 17187 # merging with a boolean/int categorical column df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) - df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered)) df2 = DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]}) - expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) + expected["cat"] = expected["cat"].astype( + CategoricalDtype(categories, ordered=ordered) + ) tm.assert_frame_equal(expected, result) def test_merge_on_int_array(self): @@ -2481,14 +2483,12 @@ def test_merge_multiindex_columns(): result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) # Constructing the expected results - expected_labels = [letter + l_suf for letter in letters] + [ - letter + r_suf for letter in letters - ] - expected_index = MultiIndex.from_product( - [expected_labels, numbers], names=["outer", "inner"] - ) + tuples = [(letter + l_suf, num) for letter in letters for num in numbers] + tuples += [("id", "")] + tuples += [(letter + r_suf, num) for letter in letters for num in numbers] + + expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - expected["id"] = "" tm.assert_frame_equal(result, expected) @@ -2947,3 +2947,38 @@ def test_merge_ea_int_and_float_numpy(): result = df2.merge(df1) tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(any_string_dtype): + # GH#54894 + pytest.importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("left_empty", [True, False]) +@pytest.mark.parametrize("right_empty", [True, False]) +def test_merge_empty_frames_column_order(left_empty, right_empty): + # GH 51929 + df1 = DataFrame(1, index=[0], columns=["A", "B"]) + df2 = DataFrame(1, index=[0], columns=["A", "C", "D"]) + + if left_empty: + df1 = df1.iloc[:0] + if right_empty: + df2 = df2.iloc[:0] + + result = merge(df1, df2, on=["A"], how="outer") + expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + if left_empty and right_empty: + expected = expected.iloc[:0] + elif left_empty: + expected.loc[:, "B"] = np.nan + elif right_empty: + expected.loc[:, ["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index b2a6ac49fdff2..3a284f7732ac1 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -21,7 +21,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype import pandas.core.reshape.tile as tmod @@ -359,7 +359,7 @@ def test_cut_return_intervals(): IntervalIndex.from_breaks(exp_bins, closed="right").take( [0, 0, 0, 1, 1, 1, 2, 2, 2] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -370,7 +370,7 @@ def test_series_ret_bins(): expected = Series( IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -445,7 +445,7 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) @@ -491,7 +491,7 @@ def test_datetime_cut(data): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(Series(result), expected) @@ -534,7 +534,7 @@ def test_datetime_tz_cut(bins, box): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -700,7 +700,7 @@ def test_cut_with_duplicated_index_lowest_included(): def test_cut_with_nonexact_categorical_indices(): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46da18445e135..28ad133a0c8d6 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -23,7 +23,7 @@ date_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.core.reshape import reshape as reshape_lib from pandas.core.reshape.pivot import pivot_table @@ -33,7 +33,7 @@ def dropna(request): return request.param -@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) +@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))]) def interval_values(request, closed): left, right = request.param return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) @@ -215,14 +215,16 @@ def test_pivot_table_dropna_categoricals(self, dropna): { "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], - "C": range(0, 9), + "C": range(9), } ) - df["A"] = df["A"].astype(CDT(categories, ordered=False)) + df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = Series(["a", "b", "c"], name="A") - expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_columns = expected_columns.astype( + CategoricalDtype(categories, ordered=False) + ) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 907eeca6e9b5e..bcfbe5ed1aa20 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -20,7 +20,7 @@ timedelta_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.tseries.offsets import ( Day, @@ -129,7 +129,9 @@ def test_qcut_return_intervals(): exp_levels = np.array( [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] ) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + CategoricalDtype(ordered=True) + ) tm.assert_series_equal(res, exp) @@ -199,7 +201,7 @@ def test_single_quantile(data, start, end, length, labels): if labels is None: intervals = IntervalIndex([Interval(start, end)] * length, closed="right") - expected = Series(intervals).astype(CDT(ordered=True)) + expected = Series(intervals).astype(CategoricalDtype(ordered=True)) else: expected = Series([0] * length, dtype=np.intp) @@ -249,7 +251,7 @@ def test_datetime_tz_qcut(bins): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index f1d8acf47b29a..cb797a4168088 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -927,7 +927,6 @@ def test_timedelta_hash_equality(self): @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", - raises=AssertionError, ) @given( st.integers( diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index a388f0f80fa94..dbaba146e600e 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -486,3 +486,12 @@ def test_getitem_str_second_with_datetimeindex(): msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] + + +def test_compare_datetime_with_all_none(): + # GH#54870 + ser = Series(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]") + ser2 = Series([None, None]) + result = ser > ser2 + expected = Series([False, False]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 324ab1204e16e..10b2e98586365 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, Series, @@ -256,3 +257,11 @@ def test_duplicated_arrow_dtype(self): result = ser.drop_duplicates() expected = Series([True, False, None], dtype="bool[pyarrow]") tm.assert_series_equal(result, expected) + + def test_drop_duplicates_arrow_strings(self): + # GH#54904 + pa = pytest.importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expecetd = Series(["a"], dtype=pd.ArrowDtype(pa.string())) + tm.assert_series_equal(result, expecetd) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 619690f400d98..549f429f09d35 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self): with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg2): ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 4dabf7b87e2cd..6740b8756853e 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method): expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index bce7d2d554004..016208f2d2026 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -159,9 +159,9 @@ def test_reindex_inference(): def test_reindex_downcasting(): # GH4618 shifted series downcasting - s = Series(False, index=range(0, 5)) + s = Series(False, index=range(5)) result = s.shift(1).bfill() - expected = Series(False, index=range(0, 5)) + expected = Series(False, index=range(5)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 856c31b9ccb06..661290fb00d13 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -17,7 +17,7 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype as CDT +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -1182,7 +1182,7 @@ def test_value_counts(self): with tm.assert_produces_warning(FutureWarning, match=msg): result = algos.value_counts(factor) breaks = [-1.606, -1.018, -0.431, 0.155, 0.741] - index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True)) expected = Series([1, 0, 2, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -1412,6 +1412,19 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) + def test_value_counts_series(self): + # GH#54857 + values = np.array([3, 1, 2, 3, 4, np.nan]) + result = Series(values).value_counts(bins=3) + expected = Series( + [2, 2, 1], + index=IntervalIndex.from_tuples( + [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]" + ), + name="count", + ) + tm.assert_series_equal(result, expected) + class TestDuplicated: def test_duplicated_with_nas(self): diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ab00e18fc4812..b8e0173ee131f 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -99,7 +99,9 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -113,7 +115,9 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -129,9 +133,11 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -174,7 +180,9 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -190,7 +198,9 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -235,7 +245,9 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -466,20 +478,23 @@ def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +505,14 @@ def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) @@ -503,10 +522,9 @@ def test_groupby_subset_rolling_subset_with_closed(self): .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -778,9 +796,13 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - expected = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - result = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -954,11 +976,13 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -977,9 +1001,13 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("B") + .groupby("A") + .apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1009,7 +1037,9 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1023,7 +1053,9 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1039,9 +1071,11 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1059,7 +1093,9 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - expected = g.apply(func_0) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1074,7 +1110,9 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1083,7 +1121,11 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f4d903dc19fb7..3fe922539780d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -100,9 +100,9 @@ def test_freq_window_not_implemented(window): index=date_range("2015-12-24", periods=10, freq="D"), ) with pytest.raises( - NotImplementedError, match="step is not supported with frequency windows" + NotImplementedError, match="^step (not implemented|is not supported)" ): - df.rolling("3D", step=3) + df.rolling(window, step=3).sum() @pytest.mark.parametrize("agg", ["cov", "corr"]) @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), + ], +) +def test_variable_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + result = df.rolling("2D", closed=closed).sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 940f0845befa2..51f801ab3761b 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -388,7 +388,7 @@ def test_rolling_max_resample(step): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -425,7 +425,7 @@ def test_rolling_min_resample(step): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -445,7 +445,7 @@ def test_rolling_median_resample(): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically diff --git a/pyproject.toml b/pyproject.toml index 845c2a63e84f0..9e579036c128b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,8 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.22.4; python_version>='3.12'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "versioneer[toml]" ] @@ -30,7 +31,9 @@ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version>='3.11'", + "numpy>=1.23.2; python_version=='3.11'", + # TODO: This needs to be updated when the official numpy 1.26 comes out + "numpy>=1.26.0b1; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" @@ -66,7 +69,7 @@ computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] aws = ['s3fs>=2022.05.0'] gcp = ['gcsfs>=2022.05.0', 'pandas-gbq>=0.17.5'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -109,6 +112,7 @@ all = ['beautifulsoup4>=4.11.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', diff --git a/requirements-dev.txt b/requirements-dev.txt index be02007a36333..01e0701bc39a7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,6 +36,7 @@ pyarrow>=7.0.0 pymysql>=1.0.2 pyreadstat>=1.1.5 tables>=3.7.0 +python-calamine>=0.1.6 pyxlsb>=1.0.9 s3fs>=2022.05.0 scipy>=1.8.1 @@ -76,7 +77,6 @@ ipywidgets nbformat notebook>=6.0.3 ipykernel -jinja2 markdown feedparser pyyaml diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..1ede20f5cc0d8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..501ec4f061f17 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -103,6 +103,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 503eb3c7c7734..14bedd1025bf8 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 47534226f972f..0931dd209ee05 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -33,6 +33,7 @@ "_agg_template_series", "_agg_template_frame", "_pipe_template", + "_apply_groupings_depr", "__main__", "_transform_template", "_use_inf_as_na",