diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9f9bcd78c07b0..c01bf65818167 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.0.285
+ rev: v0.0.287
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -34,7 +34,7 @@ repos:
alias: ruff-selected-autofixes
args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix]
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.7'
+ rev: 'v2.9.1'
hooks:
- id: vulture
entry: python scripts/run_vulture.py
@@ -84,7 +84,7 @@ repos:
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
]
- repo: https://github.com/pylint-dev/pylint
- rev: v3.0.0a6
+ rev: v3.0.0a7
hooks:
- id: pylint
stages: [manual]
@@ -124,7 +124,7 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
- rev: v0.6.7
+ rev: v0.6.8
hooks:
- id: sphinx-lint
- repo: local
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 09c4acc0ab309..0229cf15fbfb8 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks):
self.array[i] = "foo"
def time_setitem_list(self, multiple_chunks):
- indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
+ indexer = list(range(50)) + list(range(-1000, 0, 50))
self.array[indexer] = ["foo"] * len(indexer)
def time_setitem_slice(self, multiple_chunks):
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 54bcdb0fa2843..04ac47a892a22 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -360,14 +360,14 @@ class MergeCategoricals:
def setup(self):
self.left_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Y": np.random.choice(["one", "two", "three"], size=(10000,)),
}
)
self.right_object = DataFrame(
{
- "X": np.random.choice(range(0, 10), size=(10000,)),
+ "X": np.random.choice(range(10), size=(10000,)),
"Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
}
)
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
new file mode 100644
index 0000000000000..6c33de104ed90
--- /dev/null
+++ b/doc/cheatsheet/README.md
@@ -0,0 +1,22 @@
+# Pandas Cheat Sheet
+
+The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013.
+To create the PDF version, within PowerPoint, simply do a "Save As"
+and pick "PDF" as the format.
+
+This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).
+
+| Topic | PDF | PPT |
+|------------------------|-----|-----|
+| Pandas_Cheat_Sheet | | |
+| Pandas_Cheat_Sheet_JA | | |
+
+
+**Alternative**
+
+Alternatively, if you want to complement your learning, you can use the Pandas Cheat Sheets
+developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab", and "Streamlit" formats.
+
+| Topic | PDF | Streamlit | Google Colab |
+|-------------|-----|-----------|--------------|
+| Pandas | | | |
diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt
deleted file mode 100644
index c57da38b31777..0000000000000
--- a/doc/cheatsheet/README.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013.
-To create the PDF version, within Powerpoint, simply do a "Save As"
-and pick "PDF" as the format.
-
-This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2].
-
-[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
-[2]: https://www.princetonoptimization.com/
diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
index 2dcc8b0abe3b8..caaff3557ae40 100644
--- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
+++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -106,9 +106,9 @@ between square brackets ``[]``.
.. note::
- If you are familiar to Python
+ If you are familiar with Python
:ref:`dictionaries `, the selection of a
- single column is very similar to selection of dictionary values based on
+ single column is very similar to the selection of dictionary values based on
the key.
You can create a ``Series`` from scratch as well:
diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst
index bb7beef449d93..acc5409b86d09 100644
--- a/doc/source/whatsnew/v0.15.2.rst
+++ b/doc/source/whatsnew/v0.15.2.rst
@@ -24,25 +24,61 @@ API changes
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
a lexically sorted index will have better performance. (:issue:`2646`)
- .. ipython:: python
- :okexcept:
- :okwarning:
+ .. code-block:: ipython
+
+ In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1],
+ ...: 'joe':['x', 'x', 'z', 'y'],
+ ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
+ ...:
- df = pd.DataFrame({'jim':[0, 0, 1, 1],
- 'joe':['x', 'x', 'z', 'y'],
- 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
- df
- df.index.lexsort_depth
+ In [2]: df
+ Out[2]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 z 0.260476
+ y 0.897237
+
+ [4 rows x 1 columns]
+
+ In [3]: df.index.lexsort_depth
+ Out[3]: 1
# in prior versions this would raise a KeyError
# will now show a PerformanceWarning
- df.loc[(1, 'z')]
+ In [4]: df.loc[(1, 'z')]
+ Out[4]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
# lexically sorting
- df2 = df.sort_index()
- df2
- df2.index.lexsort_depth
- df2.loc[(1,'z')]
+ In [5]: df2 = df.sort_index()
+
+ In [6]: df2
+ Out[6]:
+ jolie
+ jim joe
+ 0 x 0.126970
+ x 0.966718
+ 1 y 0.897237
+ z 0.260476
+
+ [4 rows x 1 columns]
+
+ In [7]: df2.index.lexsort_depth
+ Out[7]: 2
+
+ In [8]: df2.loc[(1,'z')]
+ Out[8]:
+ jolie
+ jim joe
+ 1 z 0.260476
+
+ [1 rows x 1 columns]
- Bug in unique of Series with ``category`` dtype, which returned all categories regardless
of whether they were "used" or not (see :issue:`8559` for the discussion).
diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst
index a6848dad6e3cd..fe511b5cdec67 100644
--- a/doc/source/whatsnew/v2.1.1.rst
+++ b/doc/source/whatsnew/v2.1.1.rst
@@ -13,13 +13,17 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`concat` when :class:`DataFrame` objects have two different extension dtypes (:issue:`54848`)
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
+- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements in ``items`` (:issue:`54980`)
+- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for SQLite (:issue:`54877`)
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
+- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)
@@ -28,7 +32,9 @@ Fixed regressions
Bug fixes
~~~~~~~~~
+- Fixed bug in :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size lists (:issue:`55000`)
- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)
+- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`)
.. ---------------------------------------------------------------------------
.. _whatsnew_211.other:
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 8a5fd043dba99..2c450003872f7 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -168,6 +168,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
+- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes were treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
Categorical
^^^^^^^^^^^
@@ -245,7 +246,7 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^
--
+- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
-
Sparse
diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx
index 02934346130a5..7b306c5e681e0 100644
--- a/pandas/_libs/window/indexers.pyx
+++ b/pandas/_libs/window/indexers.pyx
@@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
break
# end bound is previous end
# or current index
+ elif index[end[i - 1]] == end_bound and not right_closed:
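+            # GH 20712: duplicates of the bound value with an open right edge
+            # extend the previous end instead of being treated as consecutive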
+ end[i] = end[i - 1] + 1
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
end[i] = i + 1
else:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index a4f58e99d8bcc..ac0275bf695d4 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -71,6 +71,7 @@
Index,
MultiIndex,
)
+from pandas.util.version import Version
if TYPE_CHECKING:
from collections.abc import (
@@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
item.add_marker(pytest.mark.arraymanager)
+hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
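+# HealthCheck.differing_executors is only available from hypothesis 6.83.2, hence the version gate below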
+if Version(hypothesis.__version__) >= Version("6.83.2"):
+ hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)
+
# Hypothesis
hypothesis.settings.register_profile(
"ci",
@@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
deadline=None,
- suppress_health_check=(hypothesis.HealthCheck.too_slow,),
+ suppress_health_check=tuple(hypothesis_health_checks),
)
hypothesis.settings.load_profile("ci")
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 4d887ecd1510f..83ed54c42a23c 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2192,11 +2192,11 @@ def _str_rstrip(self, to_strip=None):
return type(self)(result)
def _str_removeprefix(self, prefix: str):
- # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
- # starts_with = pc.starts_with(self._pa_array, pattern=prefix)
- # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
- # result = pc.if_else(starts_with, removed, self._pa_array)
- # return type(self)(result)
+ if not pa_version_under13p0:
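+            # pyarrow>=13 ships the fix for apache/arrow#14991, so the kernel path below is safe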
+ starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+ result = pc.if_else(starts_with, removed, self._pa_array)
+ return type(self)(result)
predicate = lambda val: val.removeprefix(prefix)
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index aaa515ac459bd..338724d405ad8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -15,7 +15,10 @@
lib,
missing as libmissing,
)
-from pandas.compat import pa_version_under7p0
+from pandas.compat import (
+ pa_version_under7p0,
+ pa_version_under13p0,
+)
from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.common import (
@@ -417,7 +420,7 @@ def _str_isupper(self):
def _str_len(self):
result = pc.utf8_length(self._pa_array)
- return Int64Dtype().__from_arrow__(result)
+ return self._convert_int_dtype(result)
def _str_lower(self):
return type(self)(pc.utf8_lower(self._pa_array))
@@ -446,6 +449,43 @@ def _str_rstrip(self, to_strip=None):
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)
+ def _str_removeprefix(self, prefix: str):
+ if not pa_version_under13p0:
+ starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+ result = pc.if_else(starts_with, removed, self._pa_array)
+ return type(self)(result)
+ return super()._str_removeprefix(prefix)
+
+ def _str_removesuffix(self, suffix: str):
+ ends_with = pc.ends_with(self._pa_array, pattern=suffix)
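+        # slice off the final len(suffix) codeunits, keeping rows unchanged where the suffix is absent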
+ removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
+ result = pc.if_else(ends_with, removed, self._pa_array)
+ return type(self)(result)
+
+ def _str_count(self, pat: str, flags: int = 0):
+ if flags:
+ return super()._str_count(pat, flags)
+ result = pc.count_substring_regex(self._pa_array, pat)
+ return self._convert_int_dtype(result)
+
+ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+ if start != 0 and end is not None:
+ slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+ result = pc.find_substring(slices, sub)
+ not_found = pc.equal(result, -1)
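+            # find_substring returns -1 for misses; only found positions get the offset applied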
+ offset_result = pc.add(result, end - start)
+ result = pc.if_else(not_found, result, offset_result)
+ elif start == 0 and end is None:
+ slices = self._pa_array
+ result = pc.find_substring(slices, sub)
+ else:
+ return super()._str_find(sub, start, end)
+ return self._convert_int_dtype(result)
+
+ def _convert_int_dtype(self, result):
+ return Int64Dtype().__from_arrow__(result)
+
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
@@ -526,34 +566,11 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))
def _convert_int_dtype(self, result):
+ result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result
- def _str_count(self, pat: str, flags: int = 0):
- if flags:
- return super()._str_count(pat, flags)
- result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
- return self._convert_int_dtype(result)
-
- def _str_len(self):
- result = pc.utf8_length(self._pa_array).to_numpy()
- return self._convert_int_dtype(result)
-
- def _str_find(self, sub: str, start: int = 0, end: int | None = None):
- if start != 0 and end is not None:
- slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
- result = pc.find_substring(slices, sub)
- not_found = pc.equal(result, -1)
- offset_result = pc.add(result, end - start)
- result = pc.if_else(not_found, result, offset_result)
- elif start == 0 and end is None:
- slices = self._pa_array
- result = pc.find_substring(slices, sub)
- else:
- return super()._str_find(sub, start, end)
- return self._convert_int_dtype(result.to_numpy())
-
def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
return result.to_numpy(np.bool_, na_value=False)
diff --git a/pandas/core/base.py b/pandas/core/base.py
index d973f8f5fe35a..3026189e747bb 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -485,8 +485,8 @@ def array(self) -> ExtensionArray:
types, this is the actual array. For NumPy native types, this
is a thin (no copy) wrapper around :class:`numpy.ndarray`.
- ``.array`` differs ``.values`` which may require converting the
- data to a different form.
+ ``.array`` differs from ``.values``, which may require converting
+ the data to a different form.
See Also
--------
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index f76163cbbd0a1..12de63967c78f 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -70,7 +70,7 @@
from collections.abc import MutableMapping
from datetime import tzinfo
- import pyarrow as pa # noqa: F811, TCH004
+ import pyarrow as pa # noqa: TCH004
from pandas._typing import (
Dtype,
@@ -2148,6 +2148,8 @@ def type(self):
return CategoricalDtypeType
elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
return list
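+        # GH#55000: fixed-size lists box to plain Python lists as well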
+ elif pa.types.is_fixed_size_list(pa_type):
+ return list
elif pa.types.is_map(pa_type):
return list
elif pa.types.is_struct(pa_type):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4bfa8a4415785..a731cdbf99b0e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1926,11 +1926,17 @@ def to_dict(
self,
orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
into: type[dict] = ...,
+ index: bool = ...,
) -> dict:
...
@overload
- def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
+ def to_dict(
+ self,
+ orient: Literal["records"],
+ into: type[dict] = ...,
+ index: bool = ...,
+ ) -> list[dict]:
...
@deprecate_nonkeyword_arguments(
@@ -11297,7 +11303,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
def any( # type: ignore[override]
self,
*,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool = False,
skipna: bool = True,
**kwargs,
@@ -11312,7 +11318,7 @@ def any( # type: ignore[override]
@doc(make_doc("all", ndim=2))
def all(
self,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool = False,
skipna: bool = True,
**kwargs,
@@ -11711,6 +11717,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series:
...
@@ -11721,6 +11728,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series | DataFrame:
...
@@ -11731,6 +11739,7 @@ def quantile(
axis: Axis = ...,
numeric_only: bool = ...,
interpolation: QuantileInterpolation = ...,
+ method: Literal["single", "table"] = ...,
) -> Series | DataFrame:
...
@@ -11830,11 +11839,10 @@ def quantile(
if not is_list_like(q):
# BlockManager.quantile expects listlike, so we wrap and unwrap here
- # error: List item 0 has incompatible type "Union[float, Union[Union[
- # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
- # expected "float"
- res_df = self.quantile( # type: ignore[call-overload]
- [q],
+ # error: List item 0 has incompatible type "float | ExtensionArray |
+ # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
+ res_df = self.quantile(
+ [q], # type: ignore[list-item]
axis=axis,
numeric_only=numeric_only,
interpolation=interpolation,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b9407ebe6624a..5c303e2a73bd7 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2847,7 +2847,7 @@ def to_sql(
index : bool, default True
Write DataFrame index as a column. Uses `index_label` as the column
- name in the table.
+ name in the table. Creates a table index for this column.
index_label : str or sequence, default None
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
@@ -5718,10 +5718,12 @@ def filter(
if items is not None:
name = self._get_axis_name(axis)
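+            # GH#54980: intersect in this order so the result follows the order of "items"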
+ items = Index(items).intersection(labels)
+ if len(items) == 0:
+ # Keep the dtype of labels when we are empty
+ items = items.astype(labels.dtype)
# error: Keywords must be strings
- return self.reindex( # type: ignore[misc]
- **{name: labels.intersection(items)}
- )
+ return self.reindex(**{name: items}) # type: ignore[misc]
elif like:
def f(x) -> bool_t:
@@ -7938,6 +7940,51 @@ def replace(
else:
return result.__finalize__(self, method="replace")
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: Literal[False] = ...,
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> Self:
+ ...
+
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: Literal[True],
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> None:
+ ...
+
+ @overload
+ def interpolate(
+ self,
+ method: InterpolateOptions = ...,
+ *,
+ axis: Axis = ...,
+ limit: int | None = ...,
+ inplace: bool_t = ...,
+ limit_direction: Literal["forward", "backward", "both"] | None = ...,
+ limit_area: Literal["inside", "outside"] | None = ...,
+ downcast: Literal["infer"] | None | lib.NoDefault = ...,
+ **kwargs,
+ ) -> Self | None:
+ ...
+
@final
def interpolate(
self,
@@ -8180,10 +8227,11 @@ def interpolate(
stacklevel=find_stack_level(),
)
- if "fill_value" in kwargs:
+ if method in fillna_methods and "fill_value" in kwargs:
raise ValueError(
"'fill_value' is not a valid keyword for "
- f"{type(self).__name__}.interpolate"
+ f"{type(self).__name__}.interpolate with method from "
+ f"{fillna_methods}"
)
if isinstance(obj.index, MultiIndex) and method != "linear":
@@ -8607,6 +8655,42 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
# GH 40420
return self.where(subset, threshold, axis=axis, inplace=inplace)
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: Literal[False] = ...,
+ **kwargs,
+ ) -> Self:
+ ...
+
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: Literal[True],
+ **kwargs,
+ ) -> None:
+ ...
+
+ @overload
+ def clip(
+ self,
+ lower=...,
+ upper=...,
+ *,
+ axis: Axis | None = ...,
+ inplace: bool_t = ...,
+ **kwargs,
+ ) -> Self | None:
+ ...
+
@final
def clip(
self,
@@ -11709,15 +11793,21 @@ def pct_change(
stacklevel=find_stack_level(),
)
if fill_method is lib.no_default:
- if self.isna().values.any():
- warnings.warn(
- "The default fill_method='pad' in "
- f"{type(self).__name__}.pct_change is deprecated and will be "
- "removed in a future version. Call ffill before calling "
- "pct_change to retain current behavior and silence this warning.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
+ cols = self.items() if self.ndim == 2 else [(None, self)]
+ for _, col in cols:
+ mask = col.isna().values
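+                # GH#54981: np.argmax(~mask) locates the first non-NA value; leading
+                # NAs are never padded, so only NAs after it should trigger the warning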
+ mask = mask[np.argmax(~mask) :]
+ if mask.any():
+ warnings.warn(
+ "The default fill_method='pad' in "
+ f"{type(self).__name__}.pct_change is deprecated and will be "
+ "removed in a future version. Call ffill before calling "
+ "pct_change to retain current behavior and silence this "
+ "warning.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ break
fill_method = "pad"
if limit is lib.no_default:
limit = None
@@ -11743,7 +11833,7 @@ def _logical_func(
self,
name: str,
func,
- axis: Axis = 0,
+ axis: Axis | None = 0,
bool_only: bool_t = False,
skipna: bool_t = True,
**kwargs,
@@ -11756,7 +11846,10 @@ def _logical_func(
res = self._logical_func(
name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
)
- return res._logical_func(name, func, skipna=skipna, **kwargs)
+ # error: Item "bool" of "Series | bool" has no attribute "_logical_func"
+ return res._logical_func( # type: ignore[union-attr]
+ name, func, skipna=skipna, **kwargs
+ )
elif axis is None:
axis = 0
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 694a420ad2494..c13ec51ff3851 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -262,7 +262,9 @@ def get_window_bounds(
# end bound is previous end
# or current index
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
- if end_diff <= zero:
+ if end_diff == zero and not right_closed:
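+                # GH 20712: duplicate index values share the bound, so extend the previous end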
+ end[i] = end[i - 1] + 1
+ elif end_diff <= zero:
end[i] = i + 1
else:
end[i] = end[i - 1]
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 781dfae7fef64..a8ef0e034ba9b 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -377,5 +377,5 @@ def all_indexes_same(indexes) -> bool:
def default_index(n: int) -> RangeIndex:
- rng = range(0, n)
+ rng = range(n)
return RangeIndex._simple_new(rng, name=None)
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 4d33f0137d3c4..b2d463a8c6c26 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -177,7 +177,7 @@ def concatenate_managers(
values = np.concatenate(vals, axis=1) # type: ignore[arg-type]
elif is_1d_only_ea_dtype(blk.dtype):
# TODO(EA2D): special-casing not needed with 2D EAs
- values = concat_compat(vals, axis=1, ea_compat_axis=True)
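+            # GH#54848: vals are 1D arrays here, so concatenate along axis 0 and let
+            # ensure_block_shape restore 2D below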
+ values = concat_compat(vals, axis=0, ea_compat_axis=True)
values = ensure_block_shape(values, ndim=2)
else:
values = concat_compat(vals, axis=1)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8ef3943ab0d8d..6d1ff07e07c76 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1272,12 +1272,7 @@ def _get_merge_keys(
# work-around for merge_asof(right_index=True)
right_keys.append(right.index._values)
if lk is not None and lk == rk: # FIXME: what about other NAs?
- # avoid key upcast in corner case (length-0)
- lk = cast(Hashable, lk)
- if len(left) > 0:
- right_drop.append(rk)
- else:
- left_drop.append(lk)
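+            # GH#51929: always drop the right key; the old length-0 special case
+            # reordered result columns for empty frames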
+ right_drop.append(rk)
else:
rk = cast(ArrayLike, rk)
right_keys.append(rk)
@@ -2421,7 +2416,8 @@ def _factorize_keys(
elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
- isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
+ isinstance(lk.dtype, StringDtype)
+ and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
):
import pyarrow as pa
import pyarrow.compute as pc
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 9ffbfb9f1149f..b4b0f29019c31 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,6 +1,5 @@
from __future__ import annotations
-import abc
from collections.abc import (
Hashable,
Iterable,
@@ -549,7 +548,7 @@ def read_excel(
_WorkbookT = TypeVar("_WorkbookT")
-class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta):
+class BaseExcelReader(Generic[_WorkbookT]):
book: _WorkbookT
def __init__(
@@ -589,13 +588,11 @@ def __init__(
)
@property
- @abc.abstractmethod
def _workbook_class(self) -> type[_WorkbookT]:
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT:
- pass
+ raise NotImplementedError
def close(self) -> None:
if hasattr(self, "book"):
@@ -611,21 +608,17 @@ def close(self) -> None:
self.handles.close()
@property
- @abc.abstractmethod
def sheet_names(self) -> list[str]:
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_by_name(self, name: str):
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_by_index(self, index: int):
- pass
+ raise NotImplementedError
- @abc.abstractmethod
def get_sheet_data(self, sheet, rows: int | None = None):
- pass
+ raise NotImplementedError
def raise_if_bad_sheet_by_index(self, index: int) -> None:
n_sheets = len(self.sheet_names)
@@ -940,7 +933,7 @@ def parse(
@doc(storage_options=_shared_docs["storage_options"])
-class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta):
+class ExcelWriter(Generic[_WorkbookT]):
"""
    Class for writing DataFrame objects into Excel sheets.
@@ -1178,20 +1171,19 @@ def engine(self) -> str:
return self._engine
@property
- @abc.abstractmethod
def sheets(self) -> dict[str, Any]:
"""Mapping of sheet names to sheet objects."""
+ raise NotImplementedError
@property
- @abc.abstractmethod
def book(self) -> _WorkbookT:
"""
Book instance. Class type will depend on the engine used.
This attribute can be used to access engine-specific features.
"""
+ raise NotImplementedError
- @abc.abstractmethod
def _write_cells(
self,
cells,
@@ -1214,12 +1206,13 @@ def _write_cells(
freeze_panes: int tuple of length 2
contains the bottom-most row and right-most column to freeze
"""
+ raise NotImplementedError
- @abc.abstractmethod
def _save(self) -> None:
"""
Save workbook to disk.
"""
+ raise NotImplementedError
def __init__(
self,
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 9970d465ced9d..b344d9849f16c 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -941,9 +941,7 @@ def write(
if isinstance(writer, ExcelWriter):
need_save = False
else:
- # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
- # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
- writer = ExcelWriter( # type: ignore[abstract]
+ writer = ExcelWriter(
writer,
engine=engine,
storage_options=storage_options,
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 833f4986b6da6..52ea072d1483f 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -82,6 +82,7 @@
JSONEngine,
JSONSerializable,
ReadBuffer,
+ Self,
StorageOptions,
WriteBuffer,
)
@@ -1056,7 +1057,7 @@ def close(self) -> None:
if self.handles is not None:
self.handles.close()
- def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
+ def __iter__(self) -> Self:
return self
@overload
@@ -1099,7 +1100,7 @@ def __next__(self) -> DataFrame | Series:
else:
return obj
- def __enter__(self) -> JsonReader[FrameSeriesStrT]:
+ def __enter__(self) -> Self:
return self
def __exit__(
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 10d3ab230cb9d..e0f171035e89e 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -1307,6 +1307,51 @@ def read_table(
return _read(filepath_or_buffer, kwds)
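+# read_fwf returns a TextFileReader when iterator=True or chunksize is set and a
+# DataFrame otherwise; the overloads below encode this for type checkers.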
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: Literal[True],
+ chunksize: int | None = ...,
+ **kwds,
+) -> TextFileReader:
+ ...
+
+
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: bool = ...,
+ chunksize: int,
+ **kwds,
+) -> TextFileReader:
+ ...
+
+
+@overload
+def read_fwf(
+ filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
+ *,
+ colspecs: Sequence[tuple[int, int]] | str | None = ...,
+ widths: Sequence[int] | None = ...,
+ infer_nrows: int = ...,
+ dtype_backend: DtypeBackend | lib.NoDefault = ...,
+ iterator: Literal[False] = ...,
+ chunksize: None = ...,
+ **kwds,
+) -> DataFrame:
+ ...
+
+
def read_fwf(
filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
*,
@@ -1314,6 +1359,8 @@ def read_fwf(
widths: Sequence[int] | None = None,
infer_nrows: int = 100,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+ iterator: bool = False,
+ chunksize: int | None = None,
**kwds,
) -> DataFrame | TextFileReader:
r"""
@@ -1412,6 +1459,8 @@ def read_fwf(
kwds["colspecs"] = colspecs
kwds["infer_nrows"] = infer_nrows
kwds["engine"] = "python-fwf"
+ kwds["iterator"] = iterator
+ kwds["chunksize"] = chunksize
check_dtype_backend(dtype_backend)
kwds["dtype_backend"] = dtype_backend
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 7669d5aa4cea5..2b139f8ca527c 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str:
adapt_date_iso = lambda val: val.isoformat()
adapt_datetime_iso = lambda val: val.isoformat()
- adapt_datetime_epoch = lambda val: int(val.timestamp())
sqlite3.register_adapter(time, _adapt_time)
sqlite3.register_adapter(date, adapt_date_iso)
sqlite3.register_adapter(datetime, adapt_datetime_iso)
- sqlite3.register_adapter(datetime, adapt_datetime_epoch)
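+    # GH#54877: registering a second adapter for datetime replaced the ISO adapter,
+    # so datetimes round-tripped as epoch integers instead of ISO strings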
convert_date = lambda val: date.fromisoformat(val.decode())
convert_datetime = lambda val: datetime.fromisoformat(val.decode())
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 5f1b16a44b8e9..fa6e85ba204d2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2992,6 +2992,15 @@ def test_groupby_count_return_arrow_dtype(data_missing):
tm.assert_frame_equal(result, expected)
+def test_fixed_size_list():
+ # GH#55000
+ ser = pd.Series(
+ [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2))
+ )
+ result = ser.dtype.type
+ assert result == list
+
+
def test_arrowextensiondtype_dataframe_repr():
# GH 54062
df = pd.DataFrame(
diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py
index 95fcaaa473067..e7901ed363106 100644
--- a/pandas/tests/frame/methods/test_copy.py
+++ b/pandas/tests/frame/methods/test_copy.py
@@ -56,7 +56,7 @@ def test_copy_consolidates(self):
}
)
- for i in range(0, 10):
+ for i in range(10):
df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55)
assert len(df._mgr.blocks) == 11
diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py
index 1a2fbf8a65a55..9d5e6876bb08c 100644
--- a/pandas/tests/frame/methods/test_filter.py
+++ b/pandas/tests/frame/methods/test_filter.py
@@ -137,3 +137,17 @@ def test_filter_regex_non_string(self):
result = df.filter(regex="STRING")
expected = df[["STRING"]]
tm.assert_frame_equal(result, expected)
+
+ def test_filter_keep_order(self):
+ # GH#54980
+ df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+ result = df.filter(items=["B", "A"])
+ expected = df[["B", "A"]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_filter_different_dtype(self):
+ # GH#54980
+ df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]})
+ result = df.filter(items=["B", "A"])
+ expected = df[[]]
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py
index d0153da038a75..ede212ae18ae9 100644
--- a/pandas/tests/frame/methods/test_pct_change.py
+++ b/pandas/tests/frame/methods/test_pct_change.py
@@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method):
index=["a", "b"] * 3,
)
tm.assert_frame_equal(result, expected)
+
+
+def test_pct_change_none_beginning_no_warning():
+ # GH#54481
+ df = DataFrame(
+ [
+ [1, None],
+ [2, 1],
+ [3, 2],
+ [4, 3],
+ [5, 4],
+ ]
+ )
+ result = df.pct_change()
+ expected = DataFrame(
+ {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]}
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index 0858e33a989b7..56bdd2fc664cc 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -26,7 +26,7 @@
isna,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
class TestReindexSetIndex:
@@ -1082,7 +1082,9 @@ def test_reindex_with_categoricalindex(self):
{
"A": np.arange(3, dtype="int64"),
},
- index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("abc"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
# reindexing
@@ -1111,13 +1113,13 @@ def test_reindex_with_categoricalindex(self):
result = df.reindex(Categorical(["a", "e"], categories=cats))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
+ {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a"], categories=cats))
expected = DataFrame(
- {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
+ {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
@@ -1138,13 +1140,19 @@ def test_reindex_with_categoricalindex(self):
# give back the type of categorical that we received
result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
+ {
+ "A": [0, np.nan],
+ "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)),
+ }
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
expected = DataFrame(
- {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
+ {
+ "A": [0, np.nan],
+ "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])),
+ }
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
@@ -1152,7 +1160,9 @@ def test_reindex_with_categoricalindex(self):
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
# passed duplicate indexers are not allowed
msg = "cannot reindex on an axis with duplicate labels"
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
index d99dd36f3a2e3..339e19254fd10 100644
--- a/pandas/tests/frame/methods/test_reset_index.py
+++ b/pandas/tests/frame/methods/test_reset_index.py
@@ -788,15 +788,15 @@ def test_errorreset_index_rename(float_frame):
def test_reset_index_false_index_name():
- result_series = Series(data=range(5, 10), index=range(0, 5))
+ result_series = Series(data=range(5, 10), index=range(5))
result_series.index.name = False
result_series.reset_index()
- expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False))
+ expected_series = Series(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_series_equal(result_series, expected_series)
# GH 38147
- result_frame = DataFrame(data=range(5, 10), index=range(0, 5))
+ result_frame = DataFrame(data=range(5, 10), index=range(5))
result_frame.index.name = False
result_frame.reset_index()
- expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False))
+ expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
index 228b62a418813..985a9e3602410 100644
--- a/pandas/tests/frame/methods/test_sort_index.py
+++ b/pandas/tests/frame/methods/test_sort_index.py
@@ -911,7 +911,7 @@ def test_sort_index_multiindex_sparse_column(self):
expected = DataFrame(
{
i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0))
- for i in range(0, 4)
+ for i in range(4)
},
index=MultiIndex.from_product([[1, 2], [1, 2]]),
)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 3e2cde37c30eb..fd851ab244cb8 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -692,12 +692,12 @@ def test_constructor_error_msgs(self):
arr = np.array([[4, 5, 6]])
msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
+ DataFrame(index=[0], columns=range(4), data=arr)
arr = np.array([4, 5, 6])
msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
with pytest.raises(ValueError, match=msg):
- DataFrame(index=[0], columns=range(0, 4), data=arr)
+ DataFrame(index=[0], columns=range(4), data=arr)
# higher dim raise exception
with pytest.raises(ValueError, match="Must pass 2-d input"):
@@ -2391,7 +2391,7 @@ def test_construct_with_two_categoricalindex_series(self):
def test_constructor_series_nonexact_categoricalindex(self):
# GH 42424
- ser = Series(range(0, 100))
+ ser = Series(range(100))
ser1 = cut(ser, 10).value_counts().head(5)
ser2 = cut(ser, 10).value_counts().tail(5)
result = DataFrame({"1": ser1, "2": ser2})
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index be226b4466f98..999a03d18644d 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1928,7 +1928,7 @@ def test_pivot_table_values_key_error():
df = DataFrame(
{
"eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(),
- "thename": range(0, 20),
+ "thename": range(20),
}
)
@@ -3189,6 +3189,14 @@ def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn):
tm.assert_equal(result, expected)
+def test_groupby_ngroup_with_nan():
+ # GH#50100
+ df = DataFrame({"a": Categorical([np.nan]), "b": [1]})
+ result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup()
+ expected = Series([0])
+ tm.assert_series_equal(result, expected)
+
+
def test_get_group_axis_1():
# GH#54858
df = DataFrame(
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index c9fe011f7063b..55f96bd1443de 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -842,7 +842,7 @@ def test_grouper_period_index(self):
result = period_series.groupby(period_series.index.month).sum()
expected = Series(
- range(0, periods), index=Index(range(1, periods + 1), name=index.name)
+ range(periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py
index 47efc43d5eae0..66163dad3deae 100644
--- a/pandas/tests/indexes/multi/test_partial_indexing.py
+++ b/pandas/tests/indexes/multi/test_partial_indexing.py
@@ -31,7 +31,7 @@ def df():
dr = date_range("2016-01-01", "2016-01-03", freq="12H")
abc = ["a", "b", "c"]
mi = MultiIndex.from_product([dr, abc])
- frame = DataFrame({"c1": range(0, 15)}, index=mi)
+ frame = DataFrame({"c1": range(15)}, index=mi)
return frame
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 5f137df281fa3..132704434829e 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -10,9 +10,6 @@
)
import pandas._testing as tm
-# aliases to make some tests easier to read
-RI = RangeIndex
-
class TestRangeIndex:
@pytest.fixture
@@ -507,25 +504,31 @@ def test_len_specialised(self, step):
@pytest.mark.parametrize(
"indices, expected",
[
- ([RI(1, 12, 5)], RI(1, 12, 5)),
- ([RI(0, 6, 4)], RI(0, 6, 4)),
- ([RI(1, 3), RI(3, 7)], RI(1, 7)),
- ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)),
- ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)),
- ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)),
- ([RI(-4, -8), RI(-8, -12)], RI(0, 0)),
- ([RI(-4, -8), RI(3, -4)], RI(0, 0)),
- ([RI(-4, -8), RI(3, 5)], RI(3, 5)),
- ([RI(-4, -2), RI(3, 5)], Index([-4, -3, 3, 4])),
- ([RI(-2), RI(3, 5)], RI(3, 5)),
- ([RI(2), RI(2)], Index([0, 1, 0, 1])),
- ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)),
- ([RI(2), RI(3, 5), RI(5, 8, 4)], Index([0, 1, 3, 4, 5])),
- ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)),
- ([RI(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])),
- ([RI(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])),
- ([RI(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])),
- ([RI(3, 1), Index(["a", None, 14])], Index(["a", None, 14])),
+ ([RangeIndex(1, 12, 5)], RangeIndex(1, 12, 5)),
+ ([RangeIndex(0, 6, 4)], RangeIndex(0, 6, 4)),
+ ([RangeIndex(1, 3), RangeIndex(3, 7)], RangeIndex(1, 7)),
+ ([RangeIndex(1, 5, 2), RangeIndex(5, 6)], RangeIndex(1, 6, 2)),
+ ([RangeIndex(1, 3, 2), RangeIndex(4, 7, 3)], RangeIndex(1, 7, 3)),
+ ([RangeIndex(-4, 3, 2), RangeIndex(4, 7, 2)], RangeIndex(-4, 7, 2)),
+ ([RangeIndex(-4, -8), RangeIndex(-8, -12)], RangeIndex(0, 0)),
+ ([RangeIndex(-4, -8), RangeIndex(3, -4)], RangeIndex(0, 0)),
+ ([RangeIndex(-4, -8), RangeIndex(3, 5)], RangeIndex(3, 5)),
+ ([RangeIndex(-4, -2), RangeIndex(3, 5)], Index([-4, -3, 3, 4])),
+ ([RangeIndex(-2), RangeIndex(3, 5)], RangeIndex(3, 5)),
+ ([RangeIndex(2), RangeIndex(2)], Index([0, 1, 0, 1])),
+ ([RangeIndex(2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], RangeIndex(0, 6)),
+ (
+ [RangeIndex(2), RangeIndex(3, 5), RangeIndex(5, 8, 4)],
+ Index([0, 1, 3, 4, 5]),
+ ),
+ (
+ [RangeIndex(-2, 2), RangeIndex(2, 5), RangeIndex(5, 8, 4)],
+ RangeIndex(-2, 6),
+ ),
+ ([RangeIndex(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])),
+ ([RangeIndex(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])),
+ ([RangeIndex(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])),
+ ([RangeIndex(3, 1), Index(["a", None, 14])], Index(["a", None, 14])),
],
)
def test_append(self, indices, expected):
@@ -567,7 +570,7 @@ def test_format_empty(self):
assert empty_idx.format(name=True) == [""]
@pytest.mark.parametrize(
- "RI",
+ "ri",
[
RangeIndex(0, -1, -1),
RangeIndex(0, 1, 1),
@@ -576,10 +579,10 @@ def test_format_empty(self):
RangeIndex(-3, -5, -2),
],
)
- def test_append_len_one(self, RI):
+ def test_append_len_one(self, ri):
# GH39401
- result = RI.append([])
- tm.assert_index_equal(result, RI, exact=True)
+ result = ri.append([])
+ tm.assert_index_equal(result, ri, exact=True)
@pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])])
def test_isin_range(self, base):
diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py
index 9d11827e2923e..b86e233110e88 100644
--- a/pandas/tests/indexing/multiindex/test_getitem.py
+++ b/pandas/tests/indexing/multiindex/test_getitem.py
@@ -148,7 +148,7 @@ def test_frame_getitem_simple_key_error(
def test_tuple_string_column_names():
# GH#50372
mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")])
- df = DataFrame([range(0, 4), range(1, 5), range(2, 6)], columns=mi)
+ df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi)
df["single_index"] = 0
df_flat = df.copy()
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
index b45d197af332e..d3a6d4bf7cebf 100644
--- a/pandas/tests/indexing/test_categorical.py
+++ b/pandas/tests/indexing/test_categorical.py
@@ -16,7 +16,6 @@
Timestamp,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
@pytest.fixture
@@ -25,7 +24,9 @@ def df():
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B"
+ ),
)
@@ -35,13 +36,15 @@ def df2():
{
"A": np.arange(6, dtype="int64"),
},
- index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
+ index=CategoricalIndex(
+ list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B"
+ ),
)
class TestCategoricalIndex:
def test_loc_scalar(self, df):
- dtype = CDT(list("cab"))
+ dtype = CategoricalDtype(list("cab"))
result = df.loc["a"]
bidx = Series(list("aaa"), name="B").astype(dtype)
assert bidx.dtype == dtype
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
index f36fdf0d36ea9..7353b5ef76ba3 100644
--- a/pandas/tests/indexing/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -1,4 +1,4 @@
-from string import ascii_letters as letters
+from string import ascii_letters
import numpy as np
import pytest
@@ -24,9 +24,9 @@
def random_text(nobs=100):
# Construct a DataFrame where each row is a random slice from 'letters'
- idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2))
+ idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2))
idxs.sort(axis=1)
- strings = [letters[x[0] : x[1]] for x in idxs]
+ strings = [ascii_letters[x[0] : x[1]] for x in idxs]
return DataFrame(strings, columns=["letters"])
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
index 73de2b068b699..6c3bf01cb1857 100644
--- a/pandas/tests/io/formats/test_info.py
+++ b/pandas/tests/io/formats/test_info.py
@@ -1,6 +1,6 @@
from io import StringIO
import re
-from string import ascii_uppercase as uppercase
+from string import ascii_uppercase
import sys
import textwrap
@@ -452,9 +452,9 @@ def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
- M = len(uppercase)
+ M = len(ascii_uppercase)
index = MultiIndex.from_product(
- [list(uppercase), date_range("20160101", periods=N)],
+ [list(ascii_uppercase), date_range("20160101", periods=N)],
names=["id", "date"],
)
df = DataFrame(
diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py
index 02827ee25042a..29dd704f6efa9 100644
--- a/pandas/tests/io/formats/test_series_info.py
+++ b/pandas/tests/io/formats/test_series_info.py
@@ -1,5 +1,5 @@
from io import StringIO
-from string import ascii_uppercase as uppercase
+from string import ascii_uppercase
import textwrap
import numpy as np
@@ -165,9 +165,9 @@ def test_info_memory_usage_bug_on_multiindex():
# GH 14308
# memory usage introspection should not materialize .values
N = 100
- M = len(uppercase)
+ M = len(ascii_uppercase)
index = MultiIndex.from_product(
- [list(uppercase), date_range("20160101", periods=N)],
+ [list(ascii_uppercase), date_range("20160101", periods=N)],
names=["id", "date"],
)
s = Series(np.random.default_rng(2).standard_normal(N * M), index=index)
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index ca3ce6ba34515..b3c2e67f7c318 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -2044,7 +2044,7 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
)
if orient == "values":
- expected.columns = list(range(0, 8))
+ expected.columns = list(range(8))
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 5bb7097770820..d5f8c5200c4a3 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -1033,7 +1033,7 @@ def test_decode_floating_point(self, sign, float_number):
def test_encode_big_set(self):
s = set()
- for x in range(0, 100000):
+ for x in range(100000):
s.add(x)
# Make sure no Exception is raised.
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index db3909c147ad3..55445e44b9366 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1012,7 +1012,7 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
def test_filter_row_groups(self, pa):
# https://github.com/pandas-dev/pandas/issues/26551
pytest.importorskip("pyarrow")
- df = pd.DataFrame({"a": list(range(0, 3))})
+ df = pd.DataFrame({"a": list(range(3))})
with tm.ensure_clean() as path:
df.to_parquet(path, engine=pa)
result = read_parquet(
@@ -1219,7 +1219,7 @@ def test_categorical(self, fp):
check_round_trip(df, fp)
def test_filter_row_groups(self, fp):
- d = {"a": list(range(0, 3))}
+ d = {"a": list(range(3))}
df = pd.DataFrame(d)
with tm.ensure_clean() as path:
df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 9ec0ba0b12a76..bfa93a4ff910e 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -2962,6 +2962,13 @@ def test_read_sql_string_inference(self):
tm.assert_frame_equal(result, expected)
+ def test_roundtripping_datetimes(self):
+ # GH#54877
+ df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
+ df.to_sql("test", self.conn, if_exists="replace", index=False)
+ result = pd.read_sql("select * from test", self.conn).iloc[0, 0]
+ assert result == "2020-12-31 12:00:00.000000"
+
@pytest.mark.db
class TestMySQLAlchemy(_TestSQLAlchemy):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 7459aa1df8f3e..cd504616b6c5d 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -798,7 +798,7 @@ def test_missing_value_generator(self):
expected_values.insert(0, ".")
for t in types:
offset = valid_range[t][1]
- for i in range(0, 27):
+ for i in range(27):
val = StataMissingValue(offset + 1 + i)
assert val.string == expected_values[i]
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 3efcd930af581..5dde863f246d1 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -858,3 +858,12 @@ def test_concat_multiindex_with_category():
)
expected = expected.set_index(["c1", "c2"])
tm.assert_frame_equal(result, expected)
+
+
+def test_concat_ea_upcast():
+ # GH#54848
+ df1 = DataFrame(["a"], dtype="string")
+ df2 = DataFrame([1], dtype="Int64")
+ result = concat([df1, df2])
+ expected = DataFrame(["a", 1], index=[0, 0])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 9cada6964c094..d889ae2e4806b 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -26,7 +26,6 @@
TimedeltaIndex,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import (
MergeError,
@@ -582,11 +581,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
df_empty = df[:0]
expected = DataFrame(
{
- "value_x": Series(dtype=df.dtypes["value"]),
"key": Series(dtype=df.dtypes["key"]),
+ "value_x": Series(dtype=df.dtypes["value"]),
"value_y": Series(dtype=df.dtypes["value"]),
},
- columns=["value_x", "key", "value_y"],
+ columns=["key", "value_x", "value_y"],
)
actual = df_empty.merge(df, on="key")
tm.assert_frame_equal(actual, expected)
@@ -889,13 +888,13 @@ def test_merge_on_datetime64tz_empty(self):
result = left.merge(right, on="date")
expected = DataFrame(
{
+ "date": Series(dtype=dtz),
"value_x": Series(dtype=float),
"date2_x": Series(dtype=dtz),
- "date": Series(dtype=dtz),
"value_y": Series(dtype=float),
"date2_y": Series(dtype=dtz),
},
- columns=["value_x", "date2_x", "date", "value_y", "date2_y"],
+ columns=["date", "value_x", "date2_x", "value_y", "date2_y"],
)
tm.assert_frame_equal(result, expected)
@@ -1827,11 +1826,9 @@ def test_merge_empty(self, left_empty, how, exp):
if exp == "left":
expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
elif exp == "right":
- expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
+ expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]})
elif exp == "empty":
expected = DataFrame(columns=["A", "B", "C"], dtype="int64")
- if left_empty:
- expected = expected[["B", "A", "C"]]
elif exp == "empty_cross":
expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64")
@@ -1844,7 +1841,7 @@ def left():
{
"X": Series(
np.random.default_rng(2).choice(["foo", "bar"], size=(10,))
- ).astype(CDT(["foo", "bar"])),
+ ).astype(CategoricalDtype(["foo", "bar"])),
"Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)),
}
)
@@ -1853,7 +1850,10 @@ def left():
@pytest.fixture
def right():
return DataFrame(
- {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]}
+ {
+ "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])),
+ "Z": [1, 2],
+ }
)
@@ -2004,8 +2004,8 @@ def test_other_columns(self, left, right):
"change",
[
lambda x: x,
- lambda x: x.astype(CDT(["foo", "bar", "bah"])),
- lambda x: x.astype(CDT(ordered=True)),
+ lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])),
+ lambda x: x.astype(CategoricalDtype(ordered=True)),
],
)
def test_dtype_on_merged_different(self, change, join_type, left, right):
@@ -2112,11 +2112,13 @@ def test_merging_with_bool_or_int_cateorical_column(
# GH 17187
# merging with a boolean/int categorical column
df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column})
- df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered))
+ df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered))
df2 = DataFrame({"id": [2, 4], "num": [1, 9]})
result = df1.merge(df2)
expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]})
- expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered))
+ expected["cat"] = expected["cat"].astype(
+ CategoricalDtype(categories, ordered=ordered)
+ )
tm.assert_frame_equal(expected, result)
def test_merge_on_int_array(self):
@@ -2481,14 +2483,12 @@ def test_merge_multiindex_columns():
result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf)))
# Constructing the expected results
- expected_labels = [letter + l_suf for letter in letters] + [
- letter + r_suf for letter in letters
- ]
- expected_index = MultiIndex.from_product(
- [expected_labels, numbers], names=["outer", "inner"]
- )
+ tuples = [(letter + l_suf, num) for letter in letters for num in numbers]
+ tuples += [("id", "")]
+ tuples += [(letter + r_suf, num) for letter in letters for num in numbers]
+
+ expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"])
expected = DataFrame(columns=expected_index)
- expected["id"] = ""
tm.assert_frame_equal(result, expected)
@@ -2949,13 +2949,36 @@ def test_merge_ea_int_and_float_numpy():
tm.assert_frame_equal(result, expected.astype("float64"))
-def test_merge_arrow_string_index():
+def test_merge_arrow_string_index(any_string_dtype):
# GH#54894
pytest.importorskip("pyarrow")
- left = DataFrame({"a": ["a", "b"]}, dtype="string[pyarrow]")
- right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype="string[pyarrow]"))
+ left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype)
+ right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype))
result = left.merge(right, left_on="a", right_index=True, how="left")
expected = DataFrame(
- {"a": Series(["a", "b"], dtype="string[pyarrow]"), "b": [1, np.nan]}
+ {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]}
)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("left_empty", [True, False])
+@pytest.mark.parametrize("right_empty", [True, False])
+def test_merge_empty_frames_column_order(left_empty, right_empty):
+ # GH 51929
+ df1 = DataFrame(1, index=[0], columns=["A", "B"])
+ df2 = DataFrame(1, index=[0], columns=["A", "C", "D"])
+
+ if left_empty:
+ df1 = df1.iloc[:0]
+ if right_empty:
+ df2 = df2.iloc[:0]
+
+ result = merge(df1, df2, on=["A"], how="outer")
+ expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"])
+ if left_empty and right_empty:
+ expected = expected.iloc[:0]
+ elif left_empty:
+ expected.loc[:, "B"] = np.nan
+ elif right_empty:
+ expected.loc[:, ["C", "D"]] = np.nan
+ tm.assert_frame_equal(result, expected)
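Note: most of the expected-frame churn in this file follows from GH 51929: merging with an empty frame now yields the same column order as the non-empty case (the left frame's columns in order, then the right frame's remaining columns), which is why the hand-reordered expectations and the `expected[["B", "A", "C"]]` workaround could be dropped. The rest is mechanical: the `CDT` import alias is spelled out as `CategoricalDtype`, and test_merge_arrow_string_index is parametrized over the any_string_dtype fixture. A minimal sketch of the new column-order invariant:

    import pandas as pd

    df1 = pd.DataFrame(1, index=[0], columns=["A", "B"])
    df2 = pd.DataFrame(1, index=[0], columns=["A", "C", "D"])

    # Emptying one side no longer reshuffles the result's columns.
    out = df1.iloc[:0].merge(df2, on="A", how="outer")
    assert list(out.columns) == ["A", "B", "C", "D"]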
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
index b2a6ac49fdff2..3a284f7732ac1 100644
--- a/pandas/tests/reshape/test_cut.py
+++ b/pandas/tests/reshape/test_cut.py
@@ -21,7 +21,7 @@
to_datetime,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
import pandas.core.reshape.tile as tmod
@@ -359,7 +359,7 @@ def test_cut_return_intervals():
IntervalIndex.from_breaks(exp_bins, closed="right").take(
[0, 0, 0, 1, 1, 1, 2, 2, 2]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -370,7 +370,7 @@ def test_series_ret_bins():
expected = Series(
IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -445,7 +445,7 @@ def test_datetime_bin(conv):
Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
bins = [conv(v) for v in bin_data]
result = Series(cut(data, bins=bins))
@@ -491,7 +491,7 @@ def test_datetime_cut(data):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(Series(result), expected)
@@ -534,7 +534,7 @@ def test_datetime_tz_cut(bins, box):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
@@ -700,7 +700,7 @@ def test_cut_with_duplicated_index_lowest_included():
def test_cut_with_nonexact_categorical_indices():
# GH 42424
- ser = Series(range(0, 100))
+ ser = Series(range(100))
ser1 = cut(ser, 10).value_counts().head(5)
ser2 = cut(ser, 10).value_counts().tail(5)
result = DataFrame({"1": ser1, "2": ser2})
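Note: the edits in this file are mechanical (`CDT` alias spelled out, `range(0, 100)` simplified); no behavior changes. For reference, the property all the `CategoricalDtype(ordered=True)` expectations encode is that cut returns an ordered categorical of intervals, e.g.:

    import pandas as pd

    binned = pd.cut(pd.Series(range(100)), 10)
    assert isinstance(binned.dtype, pd.CategoricalDtype)
    assert binned.dtype.ordered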
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 46da18445e135..28ad133a0c8d6 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -23,7 +23,7 @@
date_range,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
from pandas.core.reshape import reshape as reshape_lib
from pandas.core.reshape.pivot import pivot_table
@@ -33,7 +33,7 @@ def dropna(request):
return request.param
-@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))])
+@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))])
def interval_values(request, closed):
left, right = request.param
return Categorical(pd.IntervalIndex.from_arrays(left, right, closed))
@@ -215,14 +215,16 @@ def test_pivot_table_dropna_categoricals(self, dropna):
{
"A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
"B": [1, 2, 3, 1, 2, 3, 1, 2, 3],
- "C": range(0, 9),
+ "C": range(9),
}
)
- df["A"] = df["A"].astype(CDT(categories, ordered=False))
+ df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False))
result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna)
expected_columns = Series(["a", "b", "c"], name="A")
- expected_columns = expected_columns.astype(CDT(categories, ordered=False))
+ expected_columns = expected_columns.astype(
+ CategoricalDtype(categories, ordered=False)
+ )
expected_index = Series([1, 2, 3], name="B")
expected = DataFrame(
[[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]],
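Note: same mechanical alias removal here. The behavior test_pivot_table_dropna_categoricals relies on is that a categorical column key propagates its dtype into the result's column index. A small sketch under an illustrative frame (not the fixture data):

    import pandas as pd

    df = pd.DataFrame({"B": [1, 2, 1], "C": range(3)})
    df["A"] = pd.Series(["a", "a", "b"]).astype(
        pd.CategoricalDtype(["a", "b", "c"])
    )

    table = df.pivot_table(index="B", columns="A", values="C", dropna=True)
    # The key's CategoricalDtype carries over to the result's columns.
    assert isinstance(table.columns.dtype, pd.CategoricalDtype)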
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
index 907eeca6e9b5e..bcfbe5ed1aa20 100644
--- a/pandas/tests/reshape/test_qcut.py
+++ b/pandas/tests/reshape/test_qcut.py
@@ -20,7 +20,7 @@
timedelta_range,
)
import pandas._testing as tm
-from pandas.api.types import CategoricalDtype as CDT
+from pandas.api.types import CategoricalDtype
from pandas.tseries.offsets import (
Day,
@@ -129,7 +129,9 @@ def test_qcut_return_intervals():
exp_levels = np.array(
[Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
)
- exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
+ exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
+ CategoricalDtype(ordered=True)
+ )
tm.assert_series_equal(res, exp)
@@ -199,7 +201,7 @@ def test_single_quantile(data, start, end, length, labels):
if labels is None:
intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
- expected = Series(intervals).astype(CDT(ordered=True))
+ expected = Series(intervals).astype(CategoricalDtype(ordered=True))
else:
expected = Series([0] * length, dtype=np.intp)
@@ -249,7 +251,7 @@ def test_datetime_tz_qcut(bins):
),
]
)
- ).astype(CDT(ordered=True))
+ ).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
index 619690f400d98..549f429f09d35 100644
--- a/pandas/tests/series/methods/test_interpolate.py
+++ b/pandas/tests/series/methods/test_interpolate.py
@@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self):
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=msg2):
ser.interpolate(method="asfreq")
+
+ def test_interpolate_fill_value(self):
+ # GH#54920
+ pytest.importorskip("scipy")
+ ser = Series([np.nan, 0, 1, np.nan, 3, np.nan])
+ result = ser.interpolate(method="nearest", fill_value=0)
+ expected = Series([np.nan, 0, 1, 1, 3, 0])
+ tm.assert_series_equal(result, expected)
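Note: GH#54920 lets fill_value reach the scipy-backed methods; it is applied to NaNs outside the interpolated range, while the leading NaN stays untouched because the default limit direction is forward. A sketch of the test's expectation (requires scipy):

    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, 0, 1, np.nan, 3, np.nan])
    out = ser.interpolate(method="nearest", fill_value=0)

    # Interior NaN -> nearest neighbour; trailing NaN -> fill_value;
    # leading NaN untouched under the default forward limit direction.
    assert np.isnan(out.iloc[0])
    assert out.iloc[1:].tolist() == [0.0, 1.0, 1.0, 3.0, 0.0]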
diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py
index 4dabf7b87e2cd..6740b8756853e 100644
--- a/pandas/tests/series/methods/test_pct_change.py
+++ b/pandas/tests/series/methods/test_pct_change.py
@@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method):
expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3)
tm.assert_series_equal(result, expected)
+
+
+def test_pct_change_no_warning_na_beginning():
+ # GH#54981
+ ser = Series([None, None, 1, 2, 3])
+ result = ser.pct_change()
+ expected = Series([np.nan, np.nan, np.nan, 1, 0.5])
+ tm.assert_series_equal(result, expected)
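Note: GH#54981 stops the FutureWarning about the default fill_method from firing when the only NAs precede the first valid value, since padding cannot fill those anyway. A minimal sketch:

    import numpy as np
    import pandas as pd

    ser = pd.Series([None, None, 1, 2, 3])
    out = ser.pct_change()  # no FutureWarning when only leading NAs exist

    # The first valid value has no predecessor, so its change is NaN too.
    expected = pd.Series([np.nan, np.nan, np.nan, 1.0, 0.5])
    pd.testing.assert_series_equal(out, expected)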
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
index bce7d2d554004..016208f2d2026 100644
--- a/pandas/tests/series/methods/test_reindex.py
+++ b/pandas/tests/series/methods/test_reindex.py
@@ -159,9 +159,9 @@ def test_reindex_inference():
def test_reindex_downcasting():
# GH4618 shifted series downcasting
- s = Series(False, index=range(0, 5))
+ s = Series(False, index=range(5))
result = s.shift(1).bfill()
- expected = Series(False, index=range(0, 5))
+ expected = Series(False, index=range(5))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cb703d3439d44..661290fb00d13 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -17,7 +17,7 @@
is_integer_dtype,
is_object_dtype,
)
-from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
+from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
@@ -1182,7 +1182,7 @@ def test_value_counts(self):
with tm.assert_produces_warning(FutureWarning, match=msg):
result = algos.value_counts(factor)
breaks = [-1.606, -1.018, -0.431, 0.155, 0.741]
- index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
+ index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True))
expected = Series([1, 0, 2, 1], index=index, name="count")
tm.assert_series_equal(result.sort_index(), expected.sort_index())
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
index ab00e18fc4812..46ab00c3e2284 100644
--- a/pandas/tests/window/test_groupby.py
+++ b/pandas/tests/window/test_groupby.py
@@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
- "column1": range(6),
- "column2": range(6),
- "group": 3 * ["A", "B"],
- "date": [Timestamp("2019-01-01")] * 6,
+ "column1": range(8),
+ "column2": range(8),
+ "group": ["A"] * 4 + ["B"] * 4,
+ "date": [
+ Timestamp(date)
+ for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+ ]
+ * 2,
}
)
result = (
df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
)
expected = Series(
- [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
- index=MultiIndex.from_tuples(
- [("A", Timestamp("2019-01-01"))] * 3
- + [("B", Timestamp("2019-01-01"))] * 3,
+ [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+ index=MultiIndex.from_frame(
+ df[["group", "date"]],
names=["group", "date"],
),
name="column1",
@@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self):
# GH 35549
df = DataFrame(
{
- "column1": range(6),
- "column2": range(6),
- "group": 3 * ["A", "B"],
- "date": [Timestamp("2019-01-01")] * 6,
+ "column1": range(8),
+ "column2": range(8),
+ "group": ["A"] * 4 + ["B"] * 4,
+ "date": [
+ Timestamp(date)
+ for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+ ]
+ * 2,
}
)
@@ -503,10 +510,9 @@ def test_groupby_subset_rolling_subset_with_closed(self):
.sum()
)
expected = Series(
- [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
- index=MultiIndex.from_tuples(
- [("A", Timestamp("2019-01-01"))] * 3
- + [("B", Timestamp("2019-01-01"))] * 3,
+ [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+ index=MultiIndex.from_frame(
+ df[["group", "date"]],
names=["group", "date"],
),
name="column1",
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index f4d903dc19fb7..3fe922539780d 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -100,9 +100,9 @@ def test_freq_window_not_implemented(window):
index=date_range("2015-12-24", periods=10, freq="D"),
)
with pytest.raises(
- NotImplementedError, match="step is not supported with frequency windows"
+ NotImplementedError, match="^step (not implemented|is not supported)"
):
- df.rolling("3D", step=3)
+ df.rolling(window, step=3).sum()
@pytest.mark.parametrize("agg", ["cov", "corr"])
@@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering(
tm.assert_equal(result, expected)
+@pytest.mark.parametrize(
+ "closed,expected",
+ [
+ ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
+ ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
+ ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
+ ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
+ ],
+)
+def test_variable_window_nonunique(closed, expected, frame_or_series):
+ # GH 20712
+ index = DatetimeIndex(
+ [
+ "2011-01-01",
+ "2011-01-01",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-03",
+ "2011-01-04",
+ "2011-01-04",
+ "2011-01-05",
+ "2011-01-06",
+ ]
+ )
+
+ df = frame_or_series(range(10), index=index, dtype=float)
+ expected = frame_or_series(expected, index=index, dtype=float)
+
+ result = df.rolling("2D", closed=closed).sum()
+
+ tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "closed,expected",
+ [
+ ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
+ ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
+ ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
+ ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
+ ],
+)
+def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
+ # GH 20712
+ index = DatetimeIndex(
+ [
+ "2011-01-01",
+ "2011-01-01",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-02",
+ "2011-01-03",
+ "2011-01-04",
+ "2011-01-04",
+ "2011-01-05",
+ "2011-01-06",
+ ]
+ )
+
+ df = frame_or_series(range(10), index=index, dtype=float)
+ expected = frame_or_series(expected, index=index, dtype=float)
+
+ offset = BusinessDay(2)
+ indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
+ result = df.rolling(indexer, closed=closed, min_periods=1).sum()
+
+ tm.assert_equal(result, expected)
+
+
def test_even_number_window_alignment():
# see discussion in GH 38780
s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))
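Note: the two new tests pin down offset-window sums over duplicated timestamps for every closed option; the second one routes through pandas.api.indexers.VariableOffsetWindowIndexer with a BusinessDay(2) offset. The key point is that duplicate timestamps still grow the window row by row. A truncated sketch of the closed="right" case, using only the first three index entries from the tests:

    import pandas as pd

    index = pd.DatetimeIndex(["2011-01-01", "2011-01-01", "2011-01-02"])
    ser = pd.Series(range(3), index=index, dtype=float)

    # The two 2011-01-01 rows see cumulative sums 0 and 1; the 2011-01-02
    # row's (t - 2D, t] window covers everything so far.
    out = ser.rolling("2D", closed="right").sum()
    assert out.tolist() == [0.0, 1.0, 3.0]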
diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py
index 940f0845befa2..51f801ab3761b 100644
--- a/pandas/tests/window/test_rolling_functions.py
+++ b/pandas/tests/window/test_rolling_functions.py
@@ -388,7 +388,7 @@ def test_rolling_max_resample(step):
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
@@ -425,7 +425,7 @@ def test_rolling_min_resample(step):
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
@@ -445,7 +445,7 @@ def test_rolling_median_resample():
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
- series = Series(list(range(0, 5)) + [10, 20], index=indices)
+ series = Series(list(range(5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically