Commit 6b23901
Merge branch 'main' into depr-fastpath
jbrockmendel committed Sep 1, 2023
2 parents 604d716 + 80a1a8b
Showing 63 changed files with 472 additions and 210 deletions.
16 changes: 11 additions & 5 deletions asv_bench/benchmarks/frame_methods.py
@@ -694,22 +694,28 @@ def time_frame_sort_values(self, ascending):


 class SortMultiKey:
-    def setup(self):
+    params = [True, False]
+    param_names = ["monotonic"]
+
+    def setup(self, monotonic):
         N = 10000
         K = 10
-        self.df_by_columns = DataFrame(
+        df = DataFrame(
             {
                 "key1": tm.makeStringIndex(N).values.repeat(K),
                 "key2": tm.makeStringIndex(N).values.repeat(K),
                 "value": np.random.randn(N * K),
             }
         )
-        self.df_by_index = self.df_by_columns.set_index(["key1", "key2"])
+        if monotonic:
+            df = df.sort_values(["key1", "key2"])
+        self.df_by_columns = df
+        self.df_by_index = df.set_index(["key1", "key2"])

-    def time_sort_values(self):
+    def time_sort_values(self, monotonic):
         self.df_by_columns.sort_values(by=["key1", "key2"])

-    def time_sort_index(self):
+    def time_sort_index(self, monotonic):
         self.df_by_index.sort_index()

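A note on the benchmark above: ``params`` and ``param_names`` are asv's parametrization hooks, so each value in ``params`` is passed as ``monotonic`` to ``setup`` and to every ``time_*`` method, and the suite now measures both pre-sorted and unsorted input. A rough standalone sketch of one full parameter sweep, with ``tm.makeStringIndex`` replaced by hypothetical plain string keys:

import numpy as np
import pandas as pd

def setup_frames(monotonic: bool) -> tuple[pd.DataFrame, pd.DataFrame]:
    # Mirrors SortMultiKey.setup: N unique string keys, each repeated K times.
    N, K = 10_000, 10
    keys = np.array([f"k{i:05d}" for i in range(N)])  # stand-in for makeStringIndex
    df = pd.DataFrame(
        {
            "key1": np.repeat(keys, K),
            "key2": np.repeat(keys, K),
            "value": np.random.randn(N * K),
        }
    )
    if monotonic:
        # Pre-sorted input exercises the monotonic fast path being benchmarked.
        df = df.sort_values(["key1", "key2"])
    return df, df.set_index(["key1", "key2"])

for monotonic in (True, False):  # asv would iterate this via params
    df_by_columns, df_by_index = setup_frames(monotonic)
    df_by_columns.sort_values(by=["key1", "key2"])  # time_sort_values
    df_by_index.sort_index()                        # time_sort_index
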
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.1.rst
@@ -13,6 +13,7 @@ including other versions of pandas.

 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
 - Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)

 .. ---------------------------------------------------------------------------

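For reference, the :func:`read_csv` regression above concerns reads that combine ``usecols`` with a per-column dtype mapping under the Python engine. A sketch of the affected call shape, using made-up inline data:

from io import StringIO

import pandas as pd

data = StringIO("a,b,c\n1,2,3\n4,5,6")
df = pd.read_csv(
    data,
    usecols=["a", "c"],                    # select a subset of columns
    dtype={"a": "int64", "c": "float64"},  # per-column dtypes as a dict
    engine="python",                       # the pure-Python parser hit the bug
)
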
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -146,6 +146,7 @@ Deprecations
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`)
 - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`)
+- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.downcasting", True)`` (:issue:`53656`)
 - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
 - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
 - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
@@ -158,7 +159,7 @@ Deprecations

 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
-- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
+- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`, :issue:`54883`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 -

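A sketch of the migration the downcasting entry above describes, assuming only what the note itself states (the option name and the ``infer_objects`` call are taken verbatim from the entry):

import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0], dtype=object)

# where/mask/clip previously tried to downcast the object-dtype result
# automatically; during the deprecation, request the inference explicitly:
result = ser.where(ser > 1.5, 0.0).infer_objects(copy=False)

# or opt in to the future (non-downcasting) behavior globally:
pd.set_option("future.downcasting", True)
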
2 changes: 1 addition & 1 deletion pandas/_testing/_io.py
@@ -118,7 +118,7 @@ def round_trip_localpath(writer, reader, path: str | None = None):
     return obj


-def write_to_compressed(compression, path, data, dest: str = "test"):
+def write_to_compressed(compression, path, data, dest: str = "test") -> None:
     """
     Write data to a compressed file.

2 changes: 1 addition & 1 deletion pandas/arrays/__init__.py
@@ -36,7 +36,7 @@
 ]


-def __getattr__(name: str):
+def __getattr__(name: str) -> type[NumpyExtensionArray]:
     if name == "PandasArray":
         # GH#53694
         import warnings

2 changes: 1 addition & 1 deletion pandas/compat/pickle_compat.py
@@ -26,7 +26,7 @@
     from collections.abc import Generator


-def load_reduce(self):
+def load_reduce(self) -> None:
     stack = self.stack
     args = stack.pop()
     func = stack[-1]

5 changes: 3 additions & 2 deletions pandas/conftest.py
@@ -142,6 +142,7 @@ def pytest_collection_modifyitems(items, config) -> None:
         ("is_sparse", "is_sparse is deprecated"),
         ("NDFrame.replace", "The 'method' keyword"),
         ("NDFrame.replace", "Series.replace without 'value'"),
+        ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"),
         ("Series.idxmin", "The behavior of Series.idxmin"),
         ("Series.idxmax", "The behavior of Series.idxmax"),
         ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"),
@@ -492,7 +493,7 @@ def box_with_array(request):


 @pytest.fixture
-def dict_subclass():
+def dict_subclass() -> type[dict]:
     """
     Fixture for a dictionary subclass.
     """
@@ -505,7 +506,7 @@ def __init__(self, *args, **kwargs) -> None:


 @pytest.fixture
-def non_dict_mapping_subclass():
+def non_dict_mapping_subclass() -> type[abc.Mapping]:
     """
     Fixture for a non-mapping dictionary subclass.
     """

20 changes: 11 additions & 9 deletions pandas/core/apply.py
@@ -54,9 +54,9 @@

 if TYPE_CHECKING:
     from collections.abc import (
+        Generator,
         Hashable,
         Iterable,
-        Iterator,
         Sequence,
     )

@@ -253,7 +253,7 @@ def transform(self) -> DataFrame | Series:

         return result

-    def transform_dict_like(self, func):
+    def transform_dict_like(self, func) -> DataFrame:
         """
         Compute transform in the case of a dict-like func
         """
@@ -315,7 +315,7 @@ def compute_list_like(
         op_name: Literal["agg", "apply"],
         selected_obj: Series | DataFrame,
         kwargs: dict[str, Any],
-    ) -> tuple[list[Hashable], list[Any]]:
+    ) -> tuple[list[Hashable] | Index, list[Any]]:
         """
         Compute agg/apply results for like-like input.

@@ -330,7 +330,7 @@ def compute_list_like(

         Returns
         -------
-        keys : list[hashable]
+        keys : list[Hashable] or Index
             Index labels for result.
         results : list
             Data for result. When aggregating with a Series, this can contain any
@@ -370,12 +370,14 @@ def compute_list_like(
                 new_res = getattr(colg, op_name)(func, *args, **kwargs)
                 results.append(new_res)
                 indices.append(index)
-            keys = selected_obj.columns.take(indices)
+            # error: Incompatible types in assignment (expression has type "Any |
+            # Index", variable has type "list[Any | Callable[..., Any] | str]")
+            keys = selected_obj.columns.take(indices)  # type: ignore[assignment]

         return keys, results

     def wrap_results_list_like(
-        self, keys: list[Hashable], results: list[Series | DataFrame]
+        self, keys: Iterable[Hashable], results: list[Series | DataFrame]
     ):
         from pandas.core.reshape.concat import concat

@@ -772,7 +774,7 @@ def result_columns(self) -> Index:

     @property
     @abc.abstractmethod
-    def series_generator(self) -> Iterator[Series]:
+    def series_generator(self) -> Generator[Series, None, None]:
         pass

     @abc.abstractmethod
@@ -1014,7 +1016,7 @@ class FrameRowApply(FrameApply):
     axis: AxisInt = 0

     @property
-    def series_generator(self):
+    def series_generator(self) -> Generator[Series, None, None]:
         return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))

     @property
@@ -1075,7 +1077,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
         return result.T

     @property
-    def series_generator(self):
+    def series_generator(self) -> Generator[Series, None, None]:
         values = self.values
         values = ensure_wrapped_if_datetimelike(values)
         assert len(values) > 0

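Among the changes above, ``series_generator`` moves from ``Iterator[Series]`` to ``Generator[Series, None, None]``, which records that these properties return generator expressions; every ``Generator`` is an ``Iterator``, so callers that merely iterate are unaffected. A generic illustration, unrelated to pandas internals:

from collections.abc import Generator, Iterator

def numbers() -> Generator[int, None, None]:
    # Generator's three parameters: yield type, send type, return type.
    yield from range(3)

it: Iterator[int] = numbers()  # widening to Iterator is always valid
print(list(it))                # [0, 1, 2]
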
4 changes: 2 additions & 2 deletions pandas/core/arrays/arrow/extension_types.py
@@ -48,7 +48,7 @@ def __ne__(self, other) -> bool:
     def __hash__(self) -> int:
         return hash((str(self), self.freq))

-    def to_pandas_dtype(self):
+    def to_pandas_dtype(self) -> PeriodDtype:
         return PeriodDtype(freq=self.freq)

@@ -105,7 +105,7 @@ def __ne__(self, other) -> bool:
     def __hash__(self) -> int:
         return hash((str(self), str(self.subtype), self.closed))

-    def to_pandas_dtype(self):
+    def to_pandas_dtype(self) -> IntervalDtype:
         return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)

2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
@@ -2410,7 +2410,7 @@ def _mode(self, dropna: bool = True) -> Categorical:
     # ------------------------------------------------------------------
     # ExtensionArray Interface

-    def unique(self):
+    def unique(self) -> Self:
         """
         Return the ``Categorical`` which ``categories`` and ``codes`` are
         unique.

38 changes: 22 additions & 16 deletions pandas/core/arrays/interval.py
@@ -110,7 +110,7 @@
 )


-IntervalSideT = Union[TimeArrayLike, np.ndarray]
+IntervalSide = Union[TimeArrayLike, np.ndarray]
 IntervalOrNA = Union[Interval, float]

 _interval_shared_docs: dict[str, str] = {}
@@ -219,8 +219,8 @@ def ndim(self) -> Literal[1]:
         return 1

     # To make mypy recognize the fields
-    _left: IntervalSideT
-    _right: IntervalSideT
+    _left: IntervalSide
+    _right: IntervalSide
     _dtype: IntervalDtype

     # ---------------------------------------------------------------------
@@ -237,8 +237,8 @@ def __new__(
         data = extract_array(data, extract_numpy=True)

         if isinstance(data, cls):
-            left: IntervalSideT = data._left
-            right: IntervalSideT = data._right
+            left: IntervalSide = data._left
+            right: IntervalSide = data._right
             closed = closed or data.closed
             dtype = IntervalDtype(left.dtype, closed=closed)
         else:
@@ -280,8 +280,8 @@ def __new__(
     @classmethod
     def _simple_new(
         cls,
-        left: IntervalSideT,
-        right: IntervalSideT,
+        left: IntervalSide,
+        right: IntervalSide,
         dtype: IntervalDtype,
     ) -> Self:
         result = IntervalMixin.__new__(cls)
@@ -299,7 +299,7 @@ def _ensure_simple_new_inputs(
         closed: IntervalClosedType | None = None,
         copy: bool = False,
         dtype: Dtype | None = None,
-    ) -> tuple[IntervalSideT, IntervalSideT, IntervalDtype]:
+    ) -> tuple[IntervalSide, IntervalSide, IntervalDtype]:
         """Ensure correctness of input parameters for cls._simple_new."""
         from pandas.core.indexes.base import ensure_index

@@ -1031,8 +1031,8 @@ def _concat_same_type(cls, to_concat: Sequence[IntervalArray]) -> Self:
             raise ValueError("Intervals must all be closed on the same side.")
         closed = closed_set.pop()

-        left = np.concatenate([interval.left for interval in to_concat])
-        right = np.concatenate([interval.right for interval in to_concat])
+        left: IntervalSide = np.concatenate([interval.left for interval in to_concat])
+        right: IntervalSide = np.concatenate([interval.right for interval in to_concat])

         left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed)

@@ -1283,7 +1283,7 @@ def _format_space(self) -> str:
     # Vectorized Interval Properties/Attributes

     @property
-    def left(self):
+    def left(self) -> Index:
         """
         Return the left endpoints of each Interval in the IntervalArray as an Index.

@@ -1303,7 +1303,7 @@ def left(self):
         return Index(self._left, copy=False)

     @property
-    def right(self):
+    def right(self) -> Index:
         """
         Return the right endpoints of each Interval in the IntervalArray as an Index.

@@ -1855,11 +1855,17 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         return isin(self.astype(object), values.astype(object))

     @property
-    def _combined(self) -> IntervalSideT:
-        left = self.left._values.reshape(-1, 1)
-        right = self.right._values.reshape(-1, 1)
+    def _combined(self) -> IntervalSide:
+        # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
+        # has no attribute "reshape"  [union-attr]
+        left = self.left._values.reshape(-1, 1)  # type: ignore[union-attr]
+        right = self.right._values.reshape(-1, 1)  # type: ignore[union-attr]
         if needs_i8_conversion(left.dtype):
-            comb = left._concat_same_type([left, right], axis=1)
+            # error: Item "ndarray[Any, Any]" of "Any | ndarray[Any, Any]" has
+            # no attribute "_concat_same_type"
+            comb = left._concat_same_type(  # type: ignore[union-attr]
+                [left, right], axis=1
+            )
         else:
             comb = np.concatenate([left, right], axis=1)
         return comb

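The new ``# type: ignore[union-attr]`` comments in ``_combined`` suppress mypy where ``._values`` is declared as a union but is known at runtime to support ``reshape``. A toy reproduction of that error class and the narrowing alternative (illustrative code, not pandas internals):

import numpy as np

def as_column(values: "np.ndarray | list[float]") -> np.ndarray:
    # values.reshape(-1, 1) here would be flagged [union-attr]: only the
    # ndarray member of the union has .reshape.  Either silence it with
    # "# type: ignore[union-attr]" (as the diff does) or narrow first:
    arr = np.asarray(values)
    return arr.reshape(-1, 1)

print(as_column([1.0, 2.0]).shape)  # (2, 1)
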
9 changes: 3 additions & 6 deletions pandas/core/arrays/period.py
@@ -948,7 +948,7 @@ def _check_timedeltalike_freq_compat(self, other):
     return lib.item_from_zerodim(delta)


-def raise_on_incompatible(left, right):
+def raise_on_incompatible(left, right) -> IncompatibleFrequency:
    """
    Helper function to render a consistent error message when raising
    IncompatibleFrequency.
@@ -1089,7 +1089,7 @@ def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset:


 def validate_dtype_freq(
-    dtype, freq: BaseOffsetT | timedelta | str | None
+    dtype, freq: BaseOffsetT | BaseOffset | timedelta | str | None
 ) -> BaseOffsetT:
     """
     If both a dtype and a freq are available, ensure they match. If only
@@ -1110,10 +1110,7 @@ def validate_dtype_freq(
     IncompatibleFrequency : mismatch between dtype and freq
     """
     if freq is not None:
-        # error: Incompatible types in assignment (expression has type
-        # "BaseOffset", variable has type "Union[BaseOffsetT, timedelta,
-        # str, None]")
-        freq = to_offset(freq)  # type: ignore[assignment]
+        freq = to_offset(freq)

     if dtype is not None:
         dtype = pandas_dtype(dtype)

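Widening ``freq`` to include ``BaseOffset`` is what lets the old ``type: ignore[assignment]`` be deleted: ``to_offset`` returns a ``BaseOffset``, which now lies inside the annotated union of the assignment target. A toy model of the pattern, with made-up names:

from typing import TypeVar, Union

class Offset:
    ...

OffsetT = TypeVar("OffsetT", bound=Offset)

def to_offset(freq: Union[Offset, str]) -> Offset:
    return freq if isinstance(freq, Offset) else Offset()

def validate(dtype: object, freq: Union[OffsetT, Offset, str, None]) -> None:
    if freq is not None:
        # Offset is now a member of freq's annotated union, so this
        # assignment type-checks without an ignore.
        freq = to_offset(freq)
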
6 changes: 4 additions & 2 deletions pandas/core/arrays/sparse/array.py
@@ -702,7 +702,9 @@ def npoints(self) -> int:
         """
         return self.sp_index.npoints

-    def isna(self):
+    # error: Return type "SparseArray" of "isna" incompatible with return type
+    # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray"
+    def isna(self) -> Self:  # type: ignore[override]
         # If null fill value, we want SparseDtype[bool, true]
         # to preserve the same memory usage.
         dtype = SparseDtype(bool, self._null_fill_value)
@@ -1421,7 +1423,7 @@ def all(self, axis=None, *args, **kwargs):

         return values.all()

-    def any(self, axis: AxisInt = 0, *args, **kwargs):
+    def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool:
         """
         Tests whether at least one of elements evaluate True

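The ``isna`` change above needs ``ignore[override]`` because the override's return type falls outside what the supertype declares. A minimal reproduction of the same class of mypy error, with illustrative names:

import numpy as np

class Base:
    def isna(self) -> np.ndarray:
        return np.array([False])

class Sub(Base):
    # Returning a type not covered by the supertype's declaration is an
    # [override] error; the diff silences the analogous complaint because
    # SparseArray.isna returns Self (a SparseArray), which mypy does not
    # accept as matching the supertype's declared union.
    def isna(self) -> list:  # type: ignore[override]
        return [False]
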
3 changes: 2 additions & 1 deletion pandas/core/arrays/string_.py
@@ -59,6 +59,7 @@
         NumpySorter,
         NumpyValueArrayLike,
         Scalar,
+        Self,
         npt,
         type_t,
     )
@@ -135,7 +136,7 @@ def type(self) -> type[str]:
         return str

     @classmethod
-    def construct_from_string(cls, string):
+    def construct_from_string(cls, string) -> Self:
         """
         Construct a StringDtype from a string.

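``Self`` in a classmethod binds to whichever class the method is called on, which is what the new ``construct_from_string`` annotation expresses. A small illustration (``typing.Self`` needs Python 3.11; older interpreters can use ``typing_extensions``, and pandas pulls it from its own typing shims, as the first hunk's import block suggests):

from typing import Self  # Python 3.11+

class Dtype:
    @classmethod
    def construct_from_string(cls, string: str) -> Self:
        # cls is the class the method is invoked on, so the return type
        # follows along: SubDtype.construct_from_string(...) -> SubDtype.
        return cls()

class SubDtype(Dtype):
    pass

sub = SubDtype.construct_from_string("subdtype")  # typed as SubDtype
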
(Diff truncated: the remaining 49 of the 63 changed files are not shown.)