Merge branch 'main' into 54984

phofl · Sep 6, 2023 · a5b0716 · a5b0716
2 parents b3b3fdf + 88683e9
commit a5b0716
Show file tree

Hide file tree

Showing 60 changed files with 678 additions and 257 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
     hooks:
       - id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.285
+    rev: v0.0.287
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -34,7 +34,7 @@ repos:
         alias: ruff-selected-autofixes
         args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix]
 -   repo: https://github.com/jendrikseipp/vulture
-    rev: 'v2.7'
+    rev: 'v2.9.1'
     hooks:
       - id: vulture
         entry: python scripts/run_vulture.py
@@ -84,7 +84,7 @@ repos:
             '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
         ]
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.0.0a6
+    rev: v3.0.0a7
     hooks:
     -   id: pylint
         stages: [manual]
@@ -124,7 +124,7 @@ repos:
         types: [text]  # overwrite types: [rst]
         types_or: [python, rst]
 -   repo: https://github.com/sphinx-contrib/sphinx-lint
-    rev: v0.6.7
+    rev: v0.6.8
     hooks:
     - id: sphinx-lint
 -   repo: local

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks):
             self.array[i] = "foo"
 
     def time_setitem_list(self, multiple_chunks):
-        indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
+        indexer = list(range(50)) + list(range(-1000, 0, 50))
         self.array[indexer] = ["foo"] * len(indexer)
 
     def time_setitem_slice(self, multiple_chunks):

diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -360,14 +360,14 @@ class MergeCategoricals:
     def setup(self):
         self.left_object = DataFrame(
             {
-                "X": np.random.choice(range(0, 10), size=(10000,)),
+                "X": np.random.choice(range(10), size=(10000,)),
                 "Y": np.random.choice(["one", "two", "three"], size=(10000,)),
             }
         )
 
         self.right_object = DataFrame(
             {
-                "X": np.random.choice(range(0, 10), size=(10000,)),
+                "X": np.random.choice(range(10), size=(10000,)),
                 "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
             }
         )

diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
@@ -0,0 +1,22 @@
+# Pandas Cheat Sheet
+
+The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013.
+To create the PDF version, within PowerPoint, simply do a "Save As"
+and pick "PDF" as the format.
+
+This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).
+
+| Topic                  | PDF                                                                                                                                                                                                                                     | PPT                                                                                                                                                                                                                                               |
+|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Pandas_Cheat_Sheet     | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a>    | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pptx" target="_parent"><img src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a>     |
+| Pandas_Cheat_Sheet_JA  | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx" target="_parent"><img  src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |
+
+
+**Alternative**
+
+Alternatively, if you want to complement your learning, you can use the Pandas Cheat Sheets
+developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats.
+
+| Topic       | PDF                                                                                                                                                                                                                                  | Streamlit                                                                                                                                                        | Google Colab                                                                                                                                                                                                                                   |
+|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Pandas      | <a href="https://github.com/fralfaro/DS-Cheat-Sheets/blob/main/docs/files/pandas_cs.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a>       | <a href="https://ds-cheat-sheets-pandas.streamlit.app/" target="_parent"><img src="https://static.streamlit.io/badges/streamlit_badge_black_white.svg"/></a>     | <a href="https://colab.research.google.com/github/fralfaro/DS-Cheat-Sheets/blob/main/docs/examples/pandas/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>         |
diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt
diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -106,9 +106,9 @@ between square brackets ``[]``.
     </ul>
 
 .. note::
-    If you are familiar to Python
+    If you are familiar with Python
     :ref:`dictionaries <python:tut-dictionaries>`, the selection of a
-    single column is very similar to selection of dictionary values based on
+    single column is very similar to the selection of dictionary values based on
     the key.
 
 You can create a ``Series`` from scratch as well:

diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
@@ -24,25 +24,61 @@ API changes
 - Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
   a lexically sorted index will have better performance. (:issue:`2646`)
 
-  .. ipython:: python
-    :okexcept:
-    :okwarning:
+  .. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1],
+       ...:                    'joe':['x', 'x', 'z', 'y'],
+       ...:                    'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+       ...:
 
-    df = pd.DataFrame({'jim':[0, 0, 1, 1],
-                       'joe':['x', 'x', 'z', 'y'],
-                       'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
-    df
-    df.index.lexsort_depth
+    In [2]: df
+    Out[2]:
+                jolie
+    jim joe
+    0   x    0.126970
+        x    0.966718
+    1   z    0.260476
+        y    0.897237
+
+    [4 rows x 1 columns]
+
+    In [3]: df.index.lexsort_depth
+    Out[3]: 1
 
     # in prior versions this would raise a KeyError
     # will now show a PerformanceWarning
-    df.loc[(1, 'z')]
+    In [4]: df.loc[(1, 'z')]
+    Out[4]:
+                jolie
+    jim joe
+    1   z    0.260476
+
+    [1 rows x 1 columns]
 
     # lexically sorting
-    df2 = df.sort_index()
-    df2
-    df2.index.lexsort_depth
-    df2.loc[(1,'z')]
+    In [5]: df2 = df.sort_index()
+
+    In [6]: df2
+    Out[6]:
+                jolie
+    jim joe
+    0   x    0.126970
+        x    0.966718
+    1   y    0.897237
+        z    0.260476
+
+    [4 rows x 1 columns]
+
+    In [7]: df2.index.lexsort_depth
+    Out[7]: 2
+
+    In [8]: df2.loc[(1,'z')]
+    Out[8]:
+                jolie
+    jim joe
+    1   z    0.260476
+
+    [1 rows x 1 columns]
 
 - Bug in unique of Series with ``category`` dtype, which returned all categories regardless
   of whether they were "used" or not (see :issue:`8559` for the discussion).

diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
@@ -13,13 +13,17 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`concat` when the :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`)
 - Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
 - Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
 - Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
 - Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
 - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
+- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``items`` (:issue:`54980`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
 - Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
 - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
+- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
 - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
 - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)
 
@@ -28,7 +32,9 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
 - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)
+- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_211.other:

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -168,6 +168,7 @@ Performance improvements
 Bug fixes
 ~~~~~~~~~
 - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
+- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
 
 Categorical
 ^^^^^^^^^^^
@@ -245,7 +246,7 @@ Groupby/resample/rolling
 
 Reshaping
 ^^^^^^^^^
--
+- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
 -
 
 Sparse

diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx
@@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
                         break
             # end bound is previous end
             # or current index
+            elif index[end[i - 1]] == end_bound and not right_closed:
+                end[i] = end[i - 1] + 1
             elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
                 end[i] = i + 1
             else:

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -71,6 +71,7 @@
     Index,
     MultiIndex,
 )
+from pandas.util.version import Version
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
             item.add_marker(pytest.mark.arraymanager)
 
 
+# Health checks to suppress in the "ci" Hypothesis profile registered below.
+hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
+# ``differing_executors`` is only referenced on hypothesis >= 6.83.2;
+# presumably older releases do not define it (TODO confirm).
+if Version(hypothesis.__version__) >= Version("6.83.2"):
+    hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)
+
 # Hypothesis
 hypothesis.settings.register_profile(
     "ci",
@@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
     # 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
     # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
     deadline=None,
-    suppress_health_check=(hypothesis.HealthCheck.too_slow,),
+    suppress_health_check=tuple(hypothesis_health_checks),
 )
 hypothesis.settings.load_profile("ci")
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2192,11 +2192,11 @@ def _str_rstrip(self, to_strip=None):
         return type(self)(result)
 
     def _str_removeprefix(self, prefix: str):
-        # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
-        # starts_with = pc.starts_with(self._pa_array, pattern=prefix)
-        # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
-        # result = pc.if_else(starts_with, removed, self._pa_array)
-        # return type(self)(result)
+        if not pa_version_under13p0:
+            starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+            removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+            result = pc.if_else(starts_with, removed, self._pa_array)
+            return type(self)(result)
         predicate = lambda val: val.removeprefix(prefix)
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -15,7 +15,10 @@
     lib,
     missing as libmissing,
 )
-from pandas.compat import pa_version_under7p0
+from pandas.compat import (
+    pa_version_under7p0,
+    pa_version_under13p0,
+)
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
@@ -417,7 +420,7 @@ def _str_isupper(self):
 
     def _str_len(self):
         result = pc.utf8_length(self._pa_array)
-        return Int64Dtype().__from_arrow__(result)
+        return self._convert_int_dtype(result)
 
     def _str_lower(self):
         return type(self)(pc.utf8_lower(self._pa_array))
@@ -446,6 +449,43 @@ def _str_rstrip(self, to_strip=None):
             result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
 
+    def _str_removeprefix(self, prefix: str):
+        if not pa_version_under13p0:
+            starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+            removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+            result = pc.if_else(starts_with, removed, self._pa_array)
+            return type(self)(result)
+        return super()._str_removeprefix(prefix)
+
+    def _str_removesuffix(self, suffix: str):
+        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
+        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
+        result = pc.if_else(ends_with, removed, self._pa_array)
+        return type(self)(result)
+
+    def _str_count(self, pat: str, flags: int = 0):
+        if flags:
+            return super()._str_count(pat, flags)
+        result = pc.count_substring_regex(self._pa_array, pat)
+        return self._convert_int_dtype(result)
+
+    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+        if start != 0 and end is not None:
+            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+            result = pc.find_substring(slices, sub)
+            not_found = pc.equal(result, -1)
+            offset_result = pc.add(result, end - start)
+            result = pc.if_else(not_found, result, offset_result)
+        elif start == 0 and end is None:
+            slices = self._pa_array
+            result = pc.find_substring(slices, sub)
+        else:
+            return super()._str_find(sub, start, end)
+        return self._convert_int_dtype(result)
+
+    def _convert_int_dtype(self, result):
+        return Int64Dtype().__from_arrow__(result)
+
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _storage = "pyarrow_numpy"
@@ -526,34 +566,11 @@ def _str_map(
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
     def _convert_int_dtype(self, result):
+        """
+        Convert ``result`` with ``to_numpy()``, widening int32 to int64.
+        """
+        result = result.to_numpy()
         if result.dtype == np.int32:
             result = result.astype(np.int64)
         return result
 
-    def _str_count(self, pat: str, flags: int = 0):
-        if flags:
-            return super()._str_count(pat, flags)
-        result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
-        return self._convert_int_dtype(result)
-
-    def _str_len(self):
-        result = pc.utf8_length(self._pa_array).to_numpy()
-        return self._convert_int_dtype(result)
-
-    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
-        if start != 0 and end is not None:
-            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
-            result = pc.find_substring(slices, sub)
-            not_found = pc.equal(result, -1)
-            offset_result = pc.add(result, end - start)
-            result = pc.if_else(not_found, result, offset_result)
-        elif start == 0 and end is None:
-            slices = self._pa_array
-            result = pc.find_substring(slices, sub)
-        else:
-            return super()._str_find(sub, start, end)
-        return self._convert_int_dtype(result.to_numpy())
-
     def _cmp_method(self, other, op):
         result = super()._cmp_method(other, op)
         return result.to_numpy(np.bool_, na_value=False)

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -485,8 +485,8 @@ def array(self) -> ExtensionArray:
             types, this is the actual array. For NumPy native types, this
             is a thin (no copy) wrapper around :class:`numpy.ndarray`.
 
-            ``.array`` differs ``.values`` which may require converting the
-            data to a different form.
+            ``.array`` differs from ``.values``, which may require converting
+            the data to a different form.
 
         See Also
         --------