Skip to content

Commit

Permalink
Merge branch 'main' into 54352
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Sep 5, 2023
2 parents 3bc7d5d + e681413 commit 5d112af
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ between square brackets ``[]``.
</ul>

.. note::
If you are familiar to Python
If you are familiar with Python
:ref:`dictionaries <python:tut-dictionaries>`, the selection of a
single column is very similar to selection of dictionary values based on
single column is very similar to the selection of dictionary values based on
the key.

You can create a ``Series`` from scratch as well:
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :func:`concat` when :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`)
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
Expand All @@ -21,6 +22,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)

Expand All @@ -29,6 +31,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)

.. ---------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)

Categorical
^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/window/indexers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
break
# end bound is previous end
# or current index
elif index[end[i - 1]] == end_bound and not right_closed:
end[i] = end[i - 1] + 1
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
end[i] = i + 1
else:
Expand Down
7 changes: 6 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
Index,
MultiIndex,
)
from pandas.util.version import Version

if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
item.add_marker(pytest.mark.arraymanager)


hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
if Version(hypothesis.__version__) >= Version("6.83.2"):
hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)

# Hypothesis
hypothesis.settings.register_profile(
"ci",
Expand All @@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
deadline=None,
suppress_health_check=(hypothesis.HealthCheck.too_slow,),
suppress_health_check=tuple(hypothesis_health_checks),
)
hypothesis.settings.load_profile("ci")

Expand Down
50 changes: 25 additions & 25 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _str_isupper(self):

def _str_len(self):
result = pc.utf8_length(self._pa_array)
return Int64Dtype().__from_arrow__(result)
return self._convert_int_dtype(result)

def _str_lower(self):
return type(self)(pc.utf8_lower(self._pa_array))
Expand Down Expand Up @@ -446,6 +446,29 @@ def _str_rstrip(self, to_strip=None):
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)

def _str_count(self, pat: str, flags: int = 0):
    """Count non-overlapping matches of regex ``pat`` in each string.

    Dispatches to the pyarrow kernel when possible; ``flags`` are not
    supported by pyarrow, so any truthy value falls back to the
    object-dtype implementation.
    """
    if not flags:
        counts = pc.count_substring_regex(self._pa_array, pat)
        return self._convert_int_dtype(counts)
    # regex flags are unsupported by pyarrow -> slow object-dtype path
    return super()._str_count(pat, flags)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
    """Return the lowest index of ``sub`` in each string, or -1 if absent.

    Mirrors ``str.find(sub, start, end)``. Only the combinations
    (start != 0 and end given) and (start == 0 and end None) are handled
    with pyarrow kernels; anything else falls back to the object-dtype
    implementation.
    """
    if start != 0 and end is not None:
        slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
        result = pc.find_substring(slices, sub)
        not_found = pc.equal(result, -1)
        # find_substring returns positions relative to the slice; shift by
        # ``start`` to recover indices into the original string.  The
        # previous offset of ``end - start`` was wrong: for
        # "abcdef".find("c", 1, 4) it reported 4 instead of 2.
        offset_result = pc.add(result, start)
        result = pc.if_else(not_found, result, offset_result)
    elif start == 0 and end is None:
        slices = self._pa_array
        result = pc.find_substring(slices, sub)
    else:
        return super()._str_find(sub, start, end)
    return self._convert_int_dtype(result)

def _convert_int_dtype(self, result):
    # Hook shared by the ``_str_*`` kernels above: wrap a pyarrow integer
    # array in the masked ``Int64`` extension dtype.  Subclasses (see
    # ArrowStringArrayNumpySemantics below) override this to return a
    # plain numpy array instead.
    return Int64Dtype().__from_arrow__(result)


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
Expand Down Expand Up @@ -526,34 +549,11 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))

def _convert_int_dtype(self, result):
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result

def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
return self._convert_int_dtype(result)

def _str_len(self):
result = pc.utf8_length(self._pa_array).to_numpy()
return self._convert_int_dtype(result)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if start != 0 and end is not None:
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
result = pc.find_substring(slices, sub)
not_found = pc.equal(result, -1)
offset_result = pc.add(result, end - start)
result = pc.if_else(not_found, result, offset_result)
elif start == 0 and end is None:
slices = self._pa_array
result = pc.find_substring(slices, sub)
else:
return super()._str_find(sub, start, end)
return self._convert_int_dtype(result.to_numpy())

def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
return result.to_numpy(np.bool_, na_value=False)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2148,6 +2148,8 @@ def type(self):
return CategoricalDtypeType
elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
return list
elif pa.types.is_fixed_size_list(pa_type):
return list
elif pa.types.is_map(pa_type):
return list
elif pa.types.is_struct(pa_type):
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2847,7 +2847,7 @@ def to_sql(
index : bool, default True
Write DataFrame index as a column. Uses `index_label` as the column
name in the table.
name in the table. Creates a table index for this column.
index_label : str or sequence, default None
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
Expand Down Expand Up @@ -8225,10 +8225,11 @@ def interpolate(
stacklevel=find_stack_level(),
)

if "fill_value" in kwargs:
if method in fillna_methods and "fill_value" in kwargs:
raise ValueError(
"'fill_value' is not a valid keyword for "
f"{type(self).__name__}.interpolate"
f"{type(self).__name__}.interpolate with method from "
f"{fillna_methods}"
)

if isinstance(obj.index, MultiIndex) and method != "linear":
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/indexers/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ def get_window_bounds(
# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
if end_diff <= zero:
if end_diff == zero and not right_closed:
end[i] = end[i - 1] + 1
elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def concatenate_managers(
values = np.concatenate(vals, axis=1) # type: ignore[arg-type]
elif is_1d_only_ea_dtype(blk.dtype):
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals, axis=1, ea_compat_axis=True)
values = concat_compat(vals, axis=0, ea_compat_axis=True)
values = ensure_block_shape(values, ndim=2)
else:
values = concat_compat(vals, axis=1)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2992,6 +2992,15 @@ def test_groupby_count_return_arrow_dtype(data_missing):
tm.assert_frame_equal(result, expected)


def test_fixed_size_list():
    # GH#55000: ArrowDtype.type for a fixed-size list must map to ``list``
    # instead of raising NotImplementedError
    dtype = ArrowDtype(pa.list_(pa.int64(), list_size=2))
    ser = pd.Series([[1, 2], [3, 4]], dtype=dtype)
    assert ser.dtype.type == list


def test_arrowextensiondtype_dataframe_repr():
# GH 54062
df = pd.DataFrame(
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category():
)
expected = expected.set_index(["c1", "c2"])
tm.assert_frame_equal(result, expected)


def test_concat_ea_upcast():
    # GH#54848: concatenating frames with two different extension dtypes
    # must upcast to object instead of raising
    left = DataFrame(["a"], dtype="string")
    right = DataFrame([1], dtype="Int64")
    combined = concat([left, right])
    tm.assert_frame_equal(combined, DataFrame(["a", 1], index=[0, 0]))
8 changes: 8 additions & 0 deletions pandas/tests/series/methods/test_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self):
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
ser.interpolate(method="asfreq")

def test_interpolate_fill_value(self):
    # GH#54920: fill_value must be accepted together with scipy-backed
    # interpolation methods instead of raising ValueError
    pytest.importorskip("scipy")
    ser = Series([np.nan, 0, 1, np.nan, 3, np.nan])
    out = ser.interpolate(method="nearest", fill_value=0)
    tm.assert_series_equal(out, Series([np.nan, 0, 1, 1, 3, 0]))
38 changes: 22 additions & 16 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
"column1": range(6),
"column2": range(6),
"group": 3 * ["A", "B"],
"date": [Timestamp("2019-01-01")] * 6,
"column1": range(8),
"column2": range(8),
"group": ["A"] * 4 + ["B"] * 4,
"date": [
Timestamp(date)
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
]
* 2,
}
)
result = (
df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
)
expected = Series(
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
index=MultiIndex.from_tuples(
[("A", Timestamp("2019-01-01"))] * 3
+ [("B", Timestamp("2019-01-01"))] * 3,
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
index=MultiIndex.from_frame(
df[["group", "date"]],
names=["group", "date"],
),
name="column1",
Expand All @@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
"column1": range(6),
"column2": range(6),
"group": 3 * ["A", "B"],
"date": [Timestamp("2019-01-01")] * 6,
"column1": range(8),
"column2": range(8),
"group": ["A"] * 4 + ["B"] * 4,
"date": [
Timestamp(date)
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
]
* 2,
}
)

Expand All @@ -503,10 +510,9 @@ def test_groupby_subset_rolling_subset_with_closed(self):
.sum()
)
expected = Series(
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
index=MultiIndex.from_tuples(
[("A", Timestamp("2019-01-01"))] * 3
+ [("B", Timestamp("2019-01-01"))] * 3,
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
index=MultiIndex.from_frame(
df[["group", "date"]],
names=["group", "date"],
),
name="column1",
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering(
tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "closed,expected",
    [
        ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
        ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
        ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
        ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
    ],
)
def test_variable_window_nonunique(closed, expected, frame_or_series):
    # GH 20712: duplicate datetimelike index values must be treated as
    # equal (same window bound), not consecutive
    stamps = [
        "2011-01-01",
        "2011-01-01",
        "2011-01-02",
        "2011-01-02",
        "2011-01-02",
        "2011-01-03",
        "2011-01-04",
        "2011-01-04",
        "2011-01-05",
        "2011-01-06",
    ]
    index = DatetimeIndex(stamps)

    obj = frame_or_series(range(10), index=index, dtype=float)
    expected = frame_or_series(expected, index=index, dtype=float)

    result = obj.rolling("2D", closed=closed).sum()

    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "closed,expected",
    [
        ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
        ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
        ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
        ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
    ],
)
def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
    # GH 20712: equal timestamps must also share window bounds when the
    # window is defined by an offset-based VariableOffsetWindowIndexer
    stamps = [
        "2011-01-01",
        "2011-01-01",
        "2011-01-02",
        "2011-01-02",
        "2011-01-02",
        "2011-01-03",
        "2011-01-04",
        "2011-01-04",
        "2011-01-05",
        "2011-01-06",
    ]
    index = DatetimeIndex(stamps)

    obj = frame_or_series(range(10), index=index, dtype=float)
    expected = frame_or_series(expected, index=index, dtype=float)

    indexer = VariableOffsetWindowIndexer(index=index, offset=BusinessDay(2))
    result = obj.rolling(indexer, closed=closed, min_periods=1).sum()

    tm.assert_equal(result, expected)


def test_even_number_window_alignment():
# see discussion in GH 38780
s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))
Expand Down

0 comments on commit 5d112af

Please sign in to comment.