Skip to content

Commit

Permalink
Merge branch 'main' into 54352
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Sep 5, 2023
2 parents 3bc7d5d + e681413 commit 5d112af
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ between square brackets ``[]``.
</ul>

.. note::
If you are familiar to Python
If you are familiar with Python
:ref:`dictionaries <python:tut-dictionaries>`, the selection of a
single column is very similar to selection of dictionary values based on
single column is very similar to the selection of dictionary values based on
the key.

You can create a ``Series`` from scratch as well:
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :func:`concat` when :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`)
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
Expand All @@ -21,6 +22,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)

Expand All @@ -29,6 +31,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)

.. ---------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)

Categorical
^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/window/indexers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
break
# end bound is previous end
# or current index
elif index[end[i - 1]] == end_bound and not right_closed:
end[i] = end[i - 1] + 1
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
end[i] = i + 1
else:
Expand Down
7 changes: 6 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
Index,
MultiIndex,
)
from pandas.util.version import Version

if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
item.add_marker(pytest.mark.arraymanager)


hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
if Version(hypothesis.__version__) >= Version("6.83.2"):
hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)

# Hypothesis
hypothesis.settings.register_profile(
"ci",
Expand All @@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
deadline=None,
suppress_health_check=(hypothesis.HealthCheck.too_slow,),
suppress_health_check=tuple(hypothesis_health_checks),
)
hypothesis.settings.load_profile("ci")

Expand Down
50 changes: 25 additions & 25 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _str_isupper(self):

def _str_len(self):
result = pc.utf8_length(self._pa_array)
return Int64Dtype().__from_arrow__(result)
return self._convert_int_dtype(result)

def _str_lower(self):
return type(self)(pc.utf8_lower(self._pa_array))
Expand Down Expand Up @@ -446,6 +446,29 @@ def _str_rstrip(self, to_strip=None):
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)

def _str_count(self, pat: str, flags: int = 0):
    """Count non-overlapping matches of regex ``pat`` in each string.

    Dispatches to the pyarrow kernel when possible; ``flags`` are not
    supported by pyarrow, so any truthy value falls back to the
    object-dtype implementation.
    """
    if not flags:
        counts = pc.count_substring_regex(self._pa_array, pat)
        return self._convert_int_dtype(counts)
    # regex flags are unsupported by pyarrow -> slow object-dtype path
    return super()._str_count(pat, flags)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
    """Return the lowest index of ``sub`` in each string, or -1 if absent.

    Mirrors ``str.find(sub, start, end)``. Only the combinations
    (start != 0 and end given) and (start == 0 and end None) are handled
    with pyarrow kernels; anything else falls back to the object-dtype
    implementation.
    """
    if start != 0 and end is not None:
        slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
        result = pc.find_substring(slices, sub)
        not_found = pc.equal(result, -1)
        # find_substring returns positions relative to the slice; shift by
        # ``start`` to recover indices into the original string.  The
        # previous offset of ``end - start`` was wrong: for
        # "abcdef".find("c", 1, 4) it reported 4 instead of 2.
        offset_result = pc.add(result, start)
        result = pc.if_else(not_found, result, offset_result)
    elif start == 0 and end is None:
        slices = self._pa_array
        result = pc.find_substring(slices, sub)
    else:
        return super()._str_find(sub, start, end)
    return self._convert_int_dtype(result)

def _convert_int_dtype(self, result):
    # Hook shared by the ``_str_*`` kernels above: wrap a pyarrow integer
    # array in the masked ``Int64`` extension dtype.  Subclasses (see
    # ArrowStringArrayNumpySemantics below) override this to return a
    # plain numpy array instead.
    return Int64Dtype().__from_arrow__(result)


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
Expand Down Expand Up @@ -526,34 +549,11 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))

def _convert_int_dtype(self, result):
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result

def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
return self._convert_int_dtype(result)

def _str_len(self):
result = pc.utf8_length(self._pa_array).to_numpy()
return self._convert_int_dtype(result)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if start != 0 and end is not None:
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
result = pc.find_substring(slices, sub)
not_found = pc.equal(result, -1)
offset_result = pc.add(result, end - start)
result = pc.if_else(not_found, result, offset_result)
elif start == 0 and end is None:
slices = self._pa_array
result = pc.find_substring(slices, sub)
else:
return super()._str_find(sub, start, end)
return self._convert_int_dtype(result.to_numpy())

def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
return result.to_numpy(np.bool_, na_value=False)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2148,6 +2148,8 @@ def type(self):
return CategoricalDtypeType
elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
return list
elif pa.types.is_fixed_size_list(pa_type):
return list
elif pa.types.is_map(pa_type):
return list
elif pa.types.is_struct(pa_type):
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2847,7 +2847,7 @@ def to_sql(
index : bool, default True
Write DataFrame index as a column. Uses `index_label` as the column
name in the table.
name in the table. Creates a table index for this column.
index_label : str or sequence, default None
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
Expand Down Expand Up @@ -8225,10 +8225,11 @@ def interpolate(
stacklevel=find_stack_level(),
)

if "fill_value" in kwargs:
if method in fillna_methods and "fill_value" in kwargs:
raise ValueError(
"'fill_value' is not a valid keyword for "
f"{type(self).__name__}.interpolate"
f"{type(self).__name__}.interpolate with method from "
f"{fillna_methods}"
)

if isinstance(obj.index, MultiIndex) and method != "linear":
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/indexers/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ def get_window_bounds(
# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
if end_diff <= zero:
if end_diff == zero and not right_closed:
end[i] = end[i - 1] + 1
elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def concatenate_managers(
values = np.concatenate(vals, axis=1) # type: ignore[arg-type]
elif is_1d_only_ea_dtype(blk.dtype):
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals, axis=1, ea_compat_axis=True)
values = concat_compat(vals, axis=0, ea_compat_axis=True)
values = ensure_block_shape(values, ndim=2)
else:
values = concat_compat(vals, axis=1)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2992,6 +2992,15 @@ def test_groupby_count_return_arrow_dtype(data_missing):
tm.assert_frame_equal(result, expected)


def test_fixed_size_list():
    # GH#55000: ArrowDtype.type for a fixed-size list must map to ``list``
    # instead of raising NotImplementedError
    dtype = ArrowDtype(pa.list_(pa.int64(), list_size=2))
    ser = pd.Series([[1, 2], [3, 4]], dtype=dtype)
    assert ser.dtype.type == list


def test_arrowextensiondtype_dataframe_repr():
# GH 54062
df = pd.DataFrame(
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category():
)
expected = expected.set_index(["c1", "c2"])
tm.assert_frame_equal(result, expected)


def test_concat_ea_upcast():
    # GH#54848: concatenating frames with two different extension dtypes
    # must upcast to object instead of raising
    left = DataFrame(["a"], dtype="string")
    right = DataFrame([1], dtype="Int64")
    combined = concat([left, right])
    tm.assert_frame_equal(combined, DataFrame(["a", 1], index=[0, 0]))
8 changes: 8 additions & 0 deletions pandas/tests/series/methods/test_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self):
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
ser.interpolate(method="asfreq")

def test_interpolate_fill_value(self):
    # GH#54920: fill_value must be accepted together with scipy-backed
    # interpolation methods instead of raising ValueError
    pytest.importorskip("scipy")
    ser = Series([np.nan, 0, 1, np.nan, 3, np.nan])
    out = ser.interpolate(method="nearest", fill_value=0)
    tm.assert_series_equal(out, Series([np.nan, 0, 1, 1, 3, 0]))
38 changes: 22 additions & 16 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
"column1": range(6),
"column2": range(6),
"group": 3 * ["A", "B"],
"date": [Timestamp("2019-01-01")] * 6,
"column1": range(8),
"column2": range(8),
"group": ["A"] * 4 + ["B"] * 4,
"date": [
Timestamp(date)
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
]
* 2,
}
)
result = (
df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
)
expected = Series(
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
index=MultiIndex.from_tuples(
[("A", Timestamp("2019-01-01"))] * 3
+ [("B", Timestamp("2019-01-01"))] * 3,
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
index=MultiIndex.from_frame(
df[["group", "date"]],
names=["group", "date"],
),
name="column1",
Expand All @@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
"column1": range(6),
"column2": range(6),
"group": 3 * ["A", "B"],
"date": [Timestamp("2019-01-01")] * 6,
"column1": range(8),
"column2": range(8),
"group": ["A"] * 4 + ["B"] * 4,
"date": [
Timestamp(date)
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
]
* 2,
}
)

Expand All @@ -503,10 +510,9 @@ def test_groupby_subset_rolling_subset_with_closed(self):
.sum()
)
expected = Series(
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
index=MultiIndex.from_tuples(
[("A", Timestamp("2019-01-01"))] * 3
+ [("B", Timestamp("2019-01-01"))] * 3,
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
index=MultiIndex.from_frame(
df[["group", "date"]],
names=["group", "date"],
),
name="column1",
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering(
tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "closed,expected",
    [
        ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
        ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
        ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
        ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
    ],
)
def test_variable_window_nonunique(closed, expected, frame_or_series):
    # GH 20712: duplicate datetimelike index values must be treated as
    # equal (same window bound), not consecutive
    stamps = [
        "2011-01-01",
        "2011-01-01",
        "2011-01-02",
        "2011-01-02",
        "2011-01-02",
        "2011-01-03",
        "2011-01-04",
        "2011-01-04",
        "2011-01-05",
        "2011-01-06",
    ]
    index = DatetimeIndex(stamps)

    obj = frame_or_series(range(10), index=index, dtype=float)
    expected = frame_or_series(expected, index=index, dtype=float)

    result = obj.rolling("2D", closed=closed).sum()

    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "closed,expected",
    [
        ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
        ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
        ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
        ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
    ],
)
def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
    # GH 20712: equal timestamps must also share window bounds when the
    # window is defined by an offset-based VariableOffsetWindowIndexer
    stamps = [
        "2011-01-01",
        "2011-01-01",
        "2011-01-02",
        "2011-01-02",
        "2011-01-02",
        "2011-01-03",
        "2011-01-04",
        "2011-01-04",
        "2011-01-05",
        "2011-01-06",
    ]
    index = DatetimeIndex(stamps)

    obj = frame_or_series(range(10), index=index, dtype=float)
    expected = frame_or_series(expected, index=index, dtype=float)

    indexer = VariableOffsetWindowIndexer(index=index, offset=BusinessDay(2))
    result = obj.rolling(indexer, closed=closed, min_periods=1).sum()

    tm.assert_equal(result, expected)


def test_even_number_window_alignment():
# see discussion in GH 38780
s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))
Expand Down

0 comments on commit 5d112af

Please sign in to comment.