From 801181b0f4c6c6e92327f0c90eaf18af24eb401e Mon Sep 17 00:00:00 2001
From: Parthi
Date: Tue, 26 Jul 2022 22:08:56 +0530
Subject: [PATCH 01/23] TST: Addition test for get_indexer for interval index
 (#47816)

* TST: Addition test for get_indexer for interval index
* TST: Moved tests to interval and fixed pre-commit issue
* TST: Changed to assert numpy array equals
* TST: Moved test to test_indexing
---
 pandas/tests/indexes/interval/test_indexing.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
index e05cb73cfe446b..74d17b31aff277 100644
--- a/pandas/tests/indexes/interval/test_indexing.py
+++ b/pandas/tests/indexes/interval/test_indexing.py
@@ -15,7 +15,11 @@
     NaT,
     Series,
     Timedelta,
+    Timestamp,
+    array,
     date_range,
+    interval_range,
+    period_range,
     timedelta_range,
 )
 import pandas._testing as tm
@@ -415,6 +419,16 @@ def test_get_indexer_multiindex_with_intervals(self):
         expected = np.array([1, 4, 7], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
+    @pytest.mark.parametrize("box", [IntervalIndex, array, list])
+    def test_get_indexer_interval_index(self, box):
+        # GH#30178
+        rng = period_range("2022-07-01", freq="D", periods=3)
+        idx = box(interval_range(Timestamp("2022-07-01"), freq="3D", periods=3))
+
+        actual = rng.get_indexer(idx)
+        expected = np.array([-1, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(actual, expected)
+
 
 class TestSliceLocs:
     def test_slice_locs_with_interval(self):

From 45a38c611798f6973b6bf95de95885edc6f2751a Mon Sep 17 00:00:00 2001
From: CCXXXI
Date: Wed, 27 Jul 2022 02:06:06 +0800
Subject: [PATCH 02/23] DOC: correct links of Plotly (#47858)

https://poltly.com/ -> https://plotly.com/

fix GH47857
---
 doc/source/ecosystem.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 256c3ee36e80cd..a847624c55eba8 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -161,10 +161,10 @@ A good implementation for Python users is `has2k1/plotnine `__ leverages `Vega `__ to create plots within Jupyter Notebook.
 
-`Plotly `__
+`Plotly `__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-`Plotly's `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline `__, or `on-premise `__ accounts for private use.
`Lux `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 6b7578b86889c3c0e8f9ffe1ab56159ef0edd281 Mon Sep 17 00:00:00 2001
From: srotondo <97266896+srotondo@users.noreply.github.com>
Date: Tue, 26 Jul 2022 14:11:04 -0700
Subject: [PATCH 03/23] Add test for unique aggregation returning different
 dtypes (#47603)

* TST: Added test for consistent type with unique agg #22558
* TST: Added test for consistent type with unique agg #22558
* TST: Moved and restructured test #22558
* TST: Moved test to different file #22558
* TST: Changed scalars to 1-element lists

Co-authored-by: Steven Rotondo
---
 pandas/tests/apply/test_frame_apply.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 72a9d8723d34c0..1ef0865fff552a 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1585,3 +1585,21 @@ def test_apply_on_empty_dataframe():
     result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1)
     expected = Series([])
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "test, constant",
+    [
+        ({"a": [1, 2, 3], "b": [1, 1, 1]}, {"a": [1, 2, 3], "b": [1]}),
+        ({"a": [2, 2, 2], "b": [1, 1, 1]}, {"a": [2], "b": [1]}),
+    ],
+)
+def test_unique_agg_type_is_series(test, constant):
+    # GH#22558
+    df1 = DataFrame(test)
+    expected = Series(data=constant, index=["a", "b"], dtype="object")
+    aggregation = {"a": "unique", "b": "unique"}
+
+    result = df1.agg(aggregation)
+
+    tm.assert_series_equal(result, expected)

From e12b318194cde185c17b09553d73703f4c8110e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Tue, 26 Jul 2022 18:34:01 -0400
Subject: [PATCH 04/23] TYP/CLN: small cleanups from flake-pyi (#47850)

* TYP/CLN: small cleanups from flake-pyi
* black
* manual changes in py files
* some more cases
---
 .pre-commit-config.yaml            |  2 +-
 pandas/_libs/algos.pyi             |  2 --
 pandas/_libs/interval.pyi          | 20 +++++++++-----------
 pandas/_libs/join.pyi              |  6 +++---
 pandas/_libs/json.pyi              |  2 +-
 pandas/_libs/tslibs/offsets.pyi    |  2 --
 pandas/_libs/tslibs/timedeltas.pyi |  4 ++--
 pandas/_libs/tslibs/timestamps.pyi |  8 +-------
 pandas/_libs/writers.pyi           |  2 --
 pandas/_typing.py                  |  8 ++++----
 pandas/core/arrays/datetimelike.py |  4 ++--
 pandas/core/frame.py               |  2 +-
 pandas/core/nanops.py              |  4 ++--
 pandas/core/reshape/encoding.py    |  2 +-
 pandas/core/tools/datetimes.py     |  2 +-
 pandas/core/tools/timedeltas.py    |  2 +-
 pandas/io/formats/excel.py         |  2 +-
 pandas/io/formats/format.py        |  4 ++--
 pandas/io/formats/info.py          |  2 +-
 pandas/io/formats/style.py         |  4 ++--
 pandas/io/formats/style_render.py  |  4 ++--
 pandas/io/parsers/readers.py       |  4 ++--
 pandas/io/stata.py                 | 26 +++++++++++++-------------
 23 files changed, 52 insertions(+), 66 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f8cb869e6ed89a..f65c42dab1852d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -230,7 +230,7 @@ repos:
         language: python
         additional_dependencies:
           - flake8==4.0.1
-          - flake8-pyi==22.5.1
+          - flake8-pyi==22.7.0
   - id: future-annotations
     name: import annotations from __future__
     entry: 'from __future__ import annotations'

diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index f55ff0ae8b5743..9ffcf25f6eacd6 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from typing import Any
 
 import numpy as np

diff --git a/pandas/_libs/interval.pyi b/pandas/_libs/interval.pyi
index bad0f2bab93d82..b27d2b5f8fd4d6 100644
--- a/pandas/_libs/interval.pyi
+++ b/pandas/_libs/interval.pyi
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from typing import (
     Any,
     Generic,
@@ -84,7 +82,7 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
         self: Interval[_OrderableTimesT], key: _OrderableTimesT
     ) -> bool: ...
     @overload
-    def __contains__(self: Interval[_OrderableScalarT], key: int | float) -> bool: ...
+    def __contains__(self: Interval[_OrderableScalarT], key: float) -> bool: ...
     @overload
     def __add__(
         self: Interval[_OrderableTimesT], y: Timedelta
@@ -94,7 +92,7 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __add__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __add__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __radd__(
         self: Interval[_OrderableTimesT], y: Timedelta
@@ -104,7 +102,7 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __radd__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __radd__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __sub__(
         self: Interval[_OrderableTimesT], y: Timedelta
@@ -114,7 +112,7 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __sub__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __sub__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __rsub__(
         self: Interval[_OrderableTimesT], y: Timedelta
@@ -124,31 +122,31 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __rsub__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __rsub__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __mul__(
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __mul__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __mul__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __rmul__(
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __rmul__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __rmul__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __truediv__(
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __truediv__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __truediv__(self: Interval[float], y: float) -> Interval[float]: ...
     @overload
     def __floordiv__(
         self: Interval[int], y: _OrderableScalarT
     ) -> Interval[_OrderableScalarT]: ...
     @overload
-    def __floordiv__(self: Interval[float], y: int | float) -> Interval[float]: ...
+    def __floordiv__(self: Interval[float], y: float) -> Interval[float]: ...
     def overlaps(self: Interval[_OrderableT], other: Interval[_OrderableT]) -> bool: ...
 
 def intervals_to_interval_bounds(

diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi
index 8d02f8f57dee13..11b65b859095fc 100644
--- a/pandas/_libs/join.pyi
+++ b/pandas/_libs/join.pyi
@@ -55,7 +55,7 @@ def asof_join_backward_on_X_by_Y(
     left_by_values: np.ndarray,  # by_t[:]
     right_by_values: np.ndarray,  # by_t[:]
     allow_exact_matches: bool = ...,
-    tolerance: np.number | int | float | None = ...,
+    tolerance: np.number | float | None = ...,
     use_hashtable: bool = ...,
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
 def asof_join_forward_on_X_by_Y(
@@ -64,7 +64,7 @@ def asof_join_forward_on_X_by_Y(
     left_by_values: np.ndarray,  # by_t[:]
     right_by_values: np.ndarray,  # by_t[:]
     allow_exact_matches: bool = ...,
-    tolerance: np.number | int | float | None = ...,
+    tolerance: np.number | float | None = ...,
     use_hashtable: bool = ...,
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
 def asof_join_nearest_on_X_by_Y(
@@ -73,6 +73,6 @@ def asof_join_nearest_on_X_by_Y(
     left_by_values: np.ndarray,  # by_t[:]
     right_by_values: np.ndarray,  # by_t[:]
     allow_exact_matches: bool = ...,
-    tolerance: np.number | int | float | None = ...,
+    tolerance: np.number | float | None = ...,
     use_hashtable: bool = ...,
 ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi
index 555484a37c83f3..8e7ba60ccce24f 100644
--- a/pandas/_libs/json.pyi
+++ b/pandas/_libs/json.pyi
@@ -12,7 +12,7 @@ def dumps(
     date_unit: str = ...,
     iso_dates: bool = ...,
     default_handler: None
-    | Callable[[Any], str | int | float | bool | list | dict | None] = ...,
+    | Callable[[Any], str | float | bool | list | dict | None] = ...,
 ) -> str: ...
 def loads(
     s: str,

diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
index c885b869f983ad..c3d550c7a5ba90 100644
--- a/pandas/_libs/tslibs/offsets.pyi
+++ b/pandas/_libs/tslibs/offsets.pyi
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from datetime import (
     datetime,
     timedelta,

diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 1921329da9e241..7969c0901e08fe 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -86,7 +86,7 @@ class Timedelta(timedelta):
         cls: type[_S],
         value=...,
         unit: str = ...,
-        **kwargs: int | float | np.integer | np.floating,
+        **kwargs: float | np.integer | np.floating,
     ) -> _S: ...
     # GH 46171
     # While Timedelta can return pd.NaT, having the constructor return
@@ -123,7 +123,7 @@ class Timedelta(timedelta):
     @overload  # type: ignore[override]
     def __floordiv__(self, other: timedelta) -> int: ...
     @overload
-    def __floordiv__(self, other: int | float) -> Timedelta: ...
+    def __floordiv__(self, other: float) -> Timedelta: ...
     @overload
     def __floordiv__(
         self, other: npt.NDArray[np.timedelta64]

diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
index 082f26cf6f2139..f39d1f44d82c03 100644
--- a/pandas/_libs/tslibs/timestamps.pyi
+++ b/pandas/_libs/tslibs/timestamps.pyi
@@ -33,13 +33,7 @@ class Timestamp(datetime):
     value: int  # np.int64
     def __new__(
         cls: type[_DatetimeT],
-        ts_input: int
-        | np.integer
-        | float
-        | str
-        | _date
-        | datetime
-        | np.datetime64 = ...,
+        ts_input: np.integer | float | str | _date | datetime | np.datetime64 = ...,
         freq: int | None | str | BaseOffset = ...,
         tz: str | _tzinfo | None | int = ...,
         unit: str | int | None = ...,

diff --git a/pandas/_libs/writers.pyi b/pandas/_libs/writers.pyi
index 0d2096eee35739..611c0c7cd15126 100644
--- a/pandas/_libs/writers.pyi
+++ b/pandas/_libs/writers.pyi
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 import numpy as np
 
 from pandas._typing import ArrayLike

diff --git a/pandas/_typing.py b/pandas/_typing.py
index 4bc5f754004554..6e531d10336b5b 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -82,7 +82,7 @@
 
 # scalars
 
-PythonScalar = Union[str, int, float, bool]
+PythonScalar = Union[str, float, bool]
 DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"]
 PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
 Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime]
@@ -92,10 +92,10 @@
 
 # timestamp and timedelta convertible types
 
 TimestampConvertibleTypes = Union[
-    "Timestamp", datetime, np.datetime64, int, np.int64, float, str
+    "Timestamp", datetime, np.datetime64, np.int64, float, str
 ]
 TimedeltaConvertibleTypes = Union[
-    "Timedelta", timedelta, np.timedelta64, int, np.int64, float, str
+    "Timedelta", timedelta, np.timedelta64, np.int64, float, str
 ]
 Timezone = Union[str, tzinfo]
@@ -126,7 +126,7 @@
 ]
 
 # dtypes
-NpDtype = Union[str, np.dtype, type_t[Union[str, float, int, complex, bool, object]]]
+NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]]
 Dtype = Union["ExtensionDtype", NpDtype]
 AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"]
 # DtypeArg specifies all allowable dtypes in a functions its dtype argument

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index c3fbd716ad09df..ea9414aaaa1a8d 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -2170,11 +2170,11 @@ def validate_periods(periods: None) -> None:
 
 
 @overload
-def validate_periods(periods: int | float) -> int:
+def validate_periods(periods: float) -> int:
     ...
 
 
-def validate_periods(periods: int | float | None) -> int | None:
+def validate_periods(periods: float | None) -> int | None:
     """
     If a `periods` argument is passed to the Datetime/Timedelta Array/Index
     constructor, cast it to an integer.

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 47203fbf315e59..a541cbfe502fba 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2563,7 +2563,7 @@ def to_stata(
         compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
         *,
-        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
     ) -> None:
         """
         Export DataFrame object to Stata dta format.
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 81766dc91f271f..6658b25d09e6d9 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -827,7 +827,7 @@ def _get_counts_nanvar(
     axis: int | None,
     ddof: int,
     dtype: np.dtype = np.dtype(np.float64),
-) -> tuple[int | float | np.ndarray, int | float | np.ndarray]:
+) -> tuple[float | np.ndarray, float | np.ndarray]:
     """
     Get the count of non-null values along an axis, accounting
     for degrees of freedom.
@@ -1414,7 +1414,7 @@ def _get_counts(
     mask: npt.NDArray[np.bool_] | None,
     axis: int | None,
     dtype: np.dtype = np.dtype(np.float64),
-) -> int | float | np.ndarray:
+) -> float | np.ndarray:
     """
     Get the count of non-null values along an axis

diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
index fc908a56488856..da4de8cc57e65b 100644
--- a/pandas/core/reshape/encoding.py
+++ b/pandas/core/reshape/encoding.py
@@ -272,7 +272,7 @@ def get_empty_frame(data) -> DataFrame:
 
     if sparse:
 
-        fill_value: bool | float | int
+        fill_value: bool | float
         if is_integer_dtype(dtype):
             fill_value = 0
         elif dtype == np.dtype(bool):

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 1ec0e6ca83d8f4..7ec4bc1016a9d7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -91,7 +91,7 @@
 
 # types used in annotations
 ArrayConvertible = Union[List, Tuple, AnyArrayLike]
-Scalar = Union[int, float, str]
+Scalar = Union[float, str]
 DatetimeScalar = Union[Scalar, datetime]
 
 DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]

diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
index 720d02f0cf59e6..5026c97c0b2b01 100644
--- a/pandas/core/tools/timedeltas.py
+++ b/pandas/core/tools/timedeltas.py
@@ -45,7 +45,7 @@
 
 @overload
 def to_timedelta(
-    arg: str | int | float | timedelta,
+    arg: str | float | timedelta,
     unit: UnitChoices | None = ...,
     errors: DateTimeErrorChoices = ...,
 ) -> Timedelta:

diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 3f9f6c7a5fee7d..4279a9707e6925 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -331,7 +331,7 @@ def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]
 
     def build_font(
         self, props: Mapping[str, str]
-    ) -> dict[str, bool | int | float | str | None]:
+    ) -> dict[str, bool | float | str | None]:
         font_names = self._get_font_names(props)
         decoration = self._get_decoration(props)
         return {

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 57bc534bd67c47..58beba4ddcb441 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1447,7 +1447,7 @@ def __init__(self, *args, **kwargs) -> None:
     def _value_formatter(
         self,
         float_format: FloatFormatType | None = None,
-        threshold: float | int | None = None,
+        threshold: float | None = None,
     ) -> Callable:
         """Returns a function to be applied on each value to format it"""
         # the float_format parameter supersedes self.float_format
@@ -2047,7 +2047,7 @@ def __init__(
         self.accuracy = accuracy
         self.use_eng_prefix = use_eng_prefix
 
-    def __call__(self, num: int | float) -> str:
+    def __call__(self, num: float) -> str:
         """
         Formats a number in engineering notation, appending a letter
         representing the power of 1000 of the original number.
         Some examples:

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 07ec50a2cd6a89..543df8bec657b8 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -325,7 +325,7 @@ def _put_str(s: str | Dtype, space: int) -> str:
     return str(s)[:space].ljust(space)
 
 
-def _sizeof_fmt(num: int | float, size_qualifier: str) -> str:
+def _sizeof_fmt(num: float, size_qualifier: str) -> str:
     """
     Return size in human readable format.

diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index a557f9b5c0a0dc..01b4812d3dc2a1 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -3213,7 +3213,7 @@ def bar(
         cmap: Any | None = None,
         width: float = 100,
         height: float = 100,
-        align: str | float | int | Callable = "mid",
+        align: str | float | Callable = "mid",
         vmin: float | None = None,
         vmax: float | None = None,
         props: str = "width: 10em;",
@@ -4039,7 +4039,7 @@ def _highlight_value(data: DataFrame | Series, op: str, props: str) -> np.ndarra
 
 def _bar(
     data: NDFrame,
-    align: str | float | int | Callable,
+    align: str | float | Callable,
     colors: str | list | tuple,
     cmap: Any,
     width: float,

diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 65720e675a77af..dba161cf6d45c9 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -48,7 +48,7 @@
 
 BaseFormatter = Union[str, Callable]
 ExtFormatter = Union[BaseFormatter, Dict[Any, Optional[BaseFormatter]]]
-CSSPair = Tuple[str, Union[str, int, float]]
+CSSPair = Tuple[str, Union[str, float]]
 CSSList = List[CSSPair]
 CSSProperties = Union[str, CSSList]
@@ -2055,7 +2055,7 @@ def _parse_latex_header_span(
     return display_val
 
 
-def _parse_latex_options_strip(value: str | int | float, arg: str) -> str:
+def _parse_latex_options_strip(value: str | float, arg: str) -> str:
     """
     Strip a css_value which may have latex wrapping arguments, css comment identifiers,
     and whitespaces, to a valid string for latex options parsing.

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 4858d56d71c429..dc4556542d8e20 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -499,7 +499,7 @@ def validate_integer(name, val: None, min_val=...) -> None:
 
 
 @overload
-def validate_integer(name, val: int | float, min_val=...) -> int:
+def validate_integer(name, val: float, min_val=...) -> int:
     ...
@@ -1910,7 +1910,7 @@ def _floatify_na_values(na_values):
 
 def _stringify_na_values(na_values):
     """return a stringified and numeric for these values"""
-    result: list[int | str | float] = []
+    result: list[str | float] = []
     for x in na_values:
         result.append(str(x))
         result.append(x)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2662df0361b55b..80b6db2500d281 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -678,7 +678,7 @@ def __init__(
         self.labname = catarray.name
         self._encoding = encoding
         categories = catarray.cat.categories
-        self.value_labels: list[tuple[int | float, str]] = list(
+        self.value_labels: list[tuple[float, str]] = list(
             zip(np.arange(len(categories)), categories)
         )
         self.value_labels.sort(key=lambda x: x[0])
@@ -699,7 +699,7 @@ def _prepare_value_labels(self):
 
         # Compute lengths and setup lists of offsets and labels
         offsets: list[int] = []
-        values: list[int | float] = []
+        values: list[float] = []
         for vl in self.value_labels:
             category: str | bytes = vl[1]
             if not isinstance(category, str):
@@ -798,7 +798,7 @@ class StataNonCatValueLabel(StataValueLabel):
     def __init__(
         self,
         labname: str,
-        value_labels: dict[float | int, str],
+        value_labels: dict[float, str],
        encoding: Literal["latin-1", "utf-8"] = "latin-1",
     ) -> None:
@@ -807,7 +807,7 @@ def __init__(
 
         self.labname = labname
         self._encoding = encoding
-        self.value_labels: list[tuple[int | float, str]] = sorted(
+        self.value_labels: list[tuple[float, str]] = sorted(
             value_labels.items(), key=lambda x: x[0]
         )
         self._prepare_value_labels()
@@ -887,7 +887,7 @@ class StataMissingValue:
         "float64": struct.unpack("
-    def __init__(self, value: int | float) -> None:
+    def __init__(self, value: float) -> None:
         self._value = value
         # Conversion to int to avoid hash issues on 32 bit platforms #8968
         value = int(value) if value < 2147483648 else float(value)
@@ -906,7 +906,7 @@ def string(self) -> str:
         return self._str
 
     @property
-    def value(self) -> int | float:
+    def value(self) -> float:
         """
         The binary representation of the missing value.
@@ -931,7 +931,7 @@ def __eq__(self, other: Any) -> bool:
         )
 
     @classmethod
-    def get_base_missing_value(cls, dtype: np.dtype) -> int | float:
+    def get_base_missing_value(cls, dtype: np.dtype) -> float:
         if dtype.type is np.int8:
             value = cls.BASE_MISSING_VALUES["int8"]
         elif dtype.type is np.int16:
@@ -1526,7 +1526,7 @@ def _read_value_labels(self) -> None:
         if self.format_version <= 108:
             # Value labels are not supported in version 108 and earlier.
             self._value_labels_read = True
-            self.value_label_dict: dict[str, dict[float | int, str]] = {}
+            self.value_label_dict: dict[str, dict[float, str]] = {}
             return
 
         if self.format_version >= 117:
@@ -1886,7 +1886,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
     def _do_convert_categoricals(
         self,
         data: DataFrame,
-        value_label_dict: dict[str, dict[float | int, str]],
+        value_label_dict: dict[str, dict[float, str]],
         lbllist: Sequence[str],
         order_categoricals: bool,
     ) -> DataFrame:
@@ -1977,7 +1977,7 @@ def variable_labels(self) -> dict[str, str]:
         """
         return dict(zip(self.varlist, self._variable_labels))
 
-    def value_labels(self) -> dict[str, dict[float | int, str]]:
+    def value_labels(self) -> dict[str, dict[float, str]]:
         """
         Return a dict, associating each variable name a dict, associating
         each value its corresponding label.
@@ -2270,7 +2270,7 @@ def __init__(
         compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
         *,
-        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
     ) -> None:
         super().__init__()
         self.data = data
@@ -3200,7 +3200,7 @@ def __init__(
         compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
         *,
-        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
     ) -> None:
         # Copy to new list since convert_strl might be modified later
         self._convert_strl: list[Hashable] = []
@@ -3597,7 +3597,7 @@ def __init__(
         compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
         *,
-        value_labels: dict[Hashable, dict[float | int, str]] | None = None,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
     ) -> None:
         if version is None:
             version = 118 if data.shape[1] <= 32767 else 119

From 8d7a379dc04fce73ac0190b601c4776ac1bdf05b Mon Sep 17 00:00:00 2001
From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
Date: Wed, 27 Jul 2022 11:40:19 -0500
Subject: [PATCH 05/23] PERF: efficient argmax/argmin for SparseArray (#47779)

* Update test_reductions.py
* Update v1.5.0.rst
* Update array.py
* Update base.py
* Update sorting.py
* fix format
* Update array.py
* Update base.py
* Update sorting.py
* Update base.py
* Update array.py
* Update test_reductions.py
* fix format
* fix import
* Update test_reductions.py
* Update array.py
* move to perf
* Update doc/source/whatsnew/v1.5.0.rst

Co-authored-by: Matthew Roeschke
---
 doc/source/whatsnew/v1.5.0.rst                |  1 +
 pandas/core/arrays/sparse/array.py            | 44 ++++++++++++++++++-
 pandas/tests/arrays/sparse/test_reductions.py | 38 ++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 7f07187e34c780..e57166f7a4861b 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -802,6 +802,7 @@ Performance improvements
 - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
 - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)
+- Performance improvement in ``argmax`` and ``argmin`` for :class:`arrays.SparseArray` (:issue:`34197`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index b547446603853e..f946f881311c1a 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -42,7 +42,10 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import PerformanceWarning
 from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import validate_insert_loc
+from pandas.util._validators import (
+    validate_bool_kwarg,
+    validate_insert_loc,
+)
 
 from pandas.core.dtypes.astype import astype_nansafe
 from pandas.core.dtypes.cast import (
@@ -1646,6 +1649,45 @@ def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
         else:
             return na_value_for_dtype(self.dtype.subtype, compat=False)
 
+    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
+
+        values = self._sparse_values
+        index = self._sparse_index.indices
+        mask = np.asarray(isna(values))
+        func = np.argmax if kind == "argmax" else np.argmin
+
+        idx = np.arange(values.shape[0])
+        non_nans = values[~mask]
+        non_nan_idx = idx[~mask]
+
+        _candidate = non_nan_idx[func(non_nans)]
+        candidate = index[_candidate]
+
+        if isna(self.fill_value):
+            return candidate
+        if kind == "argmin" and self[candidate] < self.fill_value:
+            return candidate
+        if kind == "argmax" and self[candidate] > self.fill_value:
+            return candidate
+        _loc = self._first_fill_value_loc()
+        if _loc == -1:
+            # fill_value doesn't exist
+            return candidate
+        else:
+            return _loc
+
+    def argmax(self, skipna: bool = True) -> int:
+        validate_bool_kwarg(skipna, "skipna")
+        if not skipna and self._hasna:
+            raise NotImplementedError
+        return self._argmin_argmax("argmax")
+
+    def argmin(self, skipna: bool = True) -> int:
+        validate_bool_kwarg(skipna, "skipna")
+        if not skipna and self._hasna:
+            raise NotImplementedError
+        return self._argmin_argmax("argmin")
+
     # ------------------------------------------------------------------------
     # Ufuncs
     # ------------------------------------------------------------------------

diff --git a/pandas/tests/arrays/sparse/test_reductions.py b/pandas/tests/arrays/sparse/test_reductions.py
index a33a282bb48695..2dd80c52f1419f 100644
--- a/pandas/tests/arrays/sparse/test_reductions.py
+++ b/pandas/tests/arrays/sparse/test_reductions.py
@@ -268,3 +268,41 @@ def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
             assert result is NaT or np.isnat(result)
         else:
             assert np.isnan(result)
+
+
+class TestArgmaxArgmin:
+    @pytest.mark.parametrize(
+        "arr,argmax_expected,argmin_expected",
+        [
+            (SparseArray([1, 2, 0, 1, 2]), 1, 2),
+            (SparseArray([-1, -2, 0, -1, -2]), 2, 1),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, -1]), 1, 5),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2]), 5, 2),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=-1), 5, 2),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=0), 5, 2),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=1), 5, 2),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=2), 5, 2),
+            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=3), 5, 2),
+            (SparseArray([0] * 10 + [-1], fill_value=0), 0, 10),
+            (SparseArray([0] * 10 + [-1], fill_value=-1), 0, 10),
+            (SparseArray([0] * 10 + [-1], fill_value=1), 0, 10),
+            (SparseArray([-1] + [0] * 10, fill_value=0), 1, 0),
+            (SparseArray([1] + [0] * 10, fill_value=0), 0, 1),
+            (SparseArray([-1] + [0] * 10, fill_value=-1), 1, 0),
+            (SparseArray([1] + [0] * 10, fill_value=1), 0, 1),
+        ],
+    )
+    def test_argmax_argmin(self, arr, argmax_expected, argmin_expected):
+        argmax_result = arr.argmax()
+        argmin_result = arr.argmin()
+        assert argmax_result == argmax_expected
+        assert argmin_result == argmin_expected
+
+    @pytest.mark.parametrize(
+        "arr,method",
+        [(SparseArray([]), "argmax"), (SparseArray([]), "argmin")],
+    )
+    def test_empty_array(self, arr, method):
+        msg = f"attempt to get {method} of an empty sequence"
+        with pytest.raises(ValueError, match=msg):
+            arr.argmax() if method == "argmax" else arr.argmin()

From 71591f199615e7adad932de9f17b121b384f88e1 Mon Sep 17 00:00:00 2001
From: Sandro Casagrande
Date: Wed, 27 Jul 2022 18:50:50 +0200
Subject: [PATCH 06/23] DOC: Minor fixes in the IO user guide (#47875)

* DOC: updated dtype doc in io user guide with content from docstring and fixed formatting
* DOC: introduced sphinx note directive
* DOC: fixed formatting
---
 doc/source/user_guide/io.rst | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 7d1aa76613d336..25625dba1080f5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -107,9 +107,10 @@ index_col : int, str, sequence of int / str, or False, optional, default ``None`
   string name or column index. If a sequence of int / str is given, a
   MultiIndex is used.
 
-  Note: ``index_col=False`` can be used to force pandas to *not* use the first
-  column as the index, e.g. when you have a malformed file with delimiters at
-  the end of each line.
+  .. note::
+     ``index_col=False`` can be used to force pandas to *not* use the first
+     column as the index, e.g. when you have a malformed file with delimiters at
+     the end of each line.
 
   The default value of ``None`` instructs pandas to guess. If the number of
   fields in the column header row is equal to the number of fields in the body
@@ -182,15 +183,16 @@ General parsing configuration
 +++++++++++++++++++++++++++++
 
 dtype : Type name or dict of column -> type, default ``None``
-  Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
-  (unsupported with ``engine='python'``). Use ``str`` or ``object`` together
-  with suitable ``na_values`` settings to preserve and
-  not interpret dtype.
+  Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
+  Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve
+  and not interpret dtype. If converters are specified, they will be applied INSTEAD
+  of dtype conversion.
 
+  .. versionadded:: 1.5.0
 
-  Support for defaultdict was added. Specify a defaultdict as input where
-  the default determines the dtype of the columns which are not explicitly
-  listed.
+     Support for defaultdict was added. Specify a defaultdict as input where
+     the default determines the dtype of the columns which are not explicitly
+     listed.
 engine : {``'c'``, ``'python'``, ``'pyarrow'``}
   Parser engine to use. The C and pyarrow engines are faster, while the python engine
   is currently more feature-complete. Multithreading is currently only supported by
@@ -283,7 +285,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
   * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
     column.
   * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.
-  A fast-path exists for iso8601-formatted dates.
+
+  .. note::
+     A fast-path exists for iso8601-formatted dates.
 infer_datetime_format : boolean, default ``False``
   If ``True`` and parse_dates is enabled for a column, attempt to infer the
   datetime format to speed up the processing.
@@ -1593,8 +1597,10 @@ of multi-columns indices.
 
    pd.read_csv("mi2.csv", header=[0, 1], index_col=0)
 
-Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
-with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*.
+.. note::
+   If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
+   with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will
+   be *lost*.
 
 .. ipython:: python
    :suppress:

From 2552a78f73e73dfe7707ddcbefe357ba27aad50d Mon Sep 17 00:00:00 2001
From: Reinert Huseby Karlsen
Date: Wed, 27 Jul 2022 18:52:06 +0200
Subject: [PATCH 07/23] DOC: fix broken link in getting started tutorials
 (#47876)

DOC: fix broken link to subsetting tutorial
---
 doc/source/getting_started/intro_tutorials/10_text_data.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst
index 63db920164ac3f..148ac246d7bf80 100644
--- a/doc/source/getting_started/intro_tutorials/10_text_data.rst
+++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst
@@ -179,7 +179,7 @@ applied to integers, so no ``str`` is used.
 
 Based on the index name of the row (``307``) and the column (``Name``),
 we can do a selection using the ``loc`` operator, introduced in the
-`tutorial on subsetting <3_subset_data.ipynb>`__.
+:ref:`tutorial on subsetting <10min_tut_03_subset>`.
 
 .. raw:: html

From 3242672a92049cef4b8fa9f1049d275c1f11949d Mon Sep 17 00:00:00 2001
From: James Freeman
Date: Wed, 27 Jul 2022 17:53:22 +0100
Subject: [PATCH 08/23] DOC: Update link from VirtusLab to pandas-dev repo
 (#47869)

* DOC: Update link from VirtusLab to pandas-dev repo
* DOC: Extend underline
---
 doc/source/ecosystem.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index a847624c55eba8..166162a4763bf4 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -591,12 +591,12 @@ Library           Accessor   Classes               Description
 Development tools
 -----------------
 
-`pandas-stubs `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`pandas-stubs `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 While pandas repository is partially typed, the package itself doesn't expose this information for external use.
 Install pandas-stubs to enable basic type coverage of pandas API.
 
 Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`.
 
-See installation and usage instructions on the `github page `__.
+See installation and usage instructions on the `github page `__.

From fbf1681b9ae6c845da1b8d3b16f9145c7d14d3fc Mon Sep 17 00:00:00 2001
From: Stefan Krawczyk
Date: Wed, 27 Jul 2022 09:55:38 -0700
Subject: [PATCH 09/23] DOC: Adds Hamilton as a development tool (#47865)

Adds Hamilton as a development tool

Hamilton is a paradigm that helps one manage a pandas code base in an
opinionated manner. We have had great success with it at Stitch Fix, and
it's got a growing open source following.

I think it helps one to write pandas in a way that gives confidence if
you want to run Pandas in production jobs.
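For illustration, a minimal sketch of the paradigm (the function and driver
names follow Hamilton's documented hello-world example and are assumptions
of this note, not part of the patch below): each function computes one named
output, and its parameter names declare the inputs it depends on, so the
dataflow graph is inferred from the code itself.

```python
# my_functions.py -- every function is a node; parameter names wire the DAG.
import pandas as pd


def avg_3wk_spend(spend: pd.Series) -> pd.Series:
    """Rolling three-week average of marketing spend."""
    return spend.rolling(3).mean()


def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:
    """Marketing spend per signup."""
    return spend / signups


# run.py -- a driver materializes the requested outputs as a DataFrame.
# import my_functions
# from hamilton import driver
#
# inputs = {
#     "spend": pd.Series([10, 10, 20, 40, 40, 50]),
#     "signups": pd.Series([1, 10, 50, 100, 200, 400]),
# }
# dr = driver.Driver(inputs, my_functions)
# df = dr.execute(["spend", "avg_3wk_spend", "spend_per_signup"])
```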
---
 web/pandas/community/ecosystem.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 1d77c596c1eb0f..89d243555a3580 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -400,3 +400,20 @@ Learn more by reading through these issues [14468](https://github.com/pandas-dev
 [26766](https://github.com/pandas-dev/pandas/issues/26766),
 [28142](https://github.com/pandas-dev/pandas/issues/28142).
 See installation and usage instructions on the [github page](https://github.com/VirtusLab/pandas-stubs).
+
+### [Hamilton](https://github.com/stitchfix/hamilton)
+
+Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was designed to help one manage a
+Pandas code base, specifically with respect to feature engineering for machine learning models.
+
+It prescribes an opinionated paradigm that ensures all code is:
+
+* unit testable
+* integration testing friendly
+* documentation friendly
+* transformation logic is reusable, as it is decoupled from the context of where it is used.
+* integratable with runtime data quality checks.
+
+This helps one scale a pandas code base while keeping maintenance costs low.
+
+For more information, see [documentation](https://hamilton-docs.gitbook.io/).

From 9a2276fddc2e72b74a8bba1d50c862b3fb5a176c Mon Sep 17 00:00:00 2001
From: Dennis Chukwunta
Date: Wed, 27 Jul 2022 17:58:17 +0100
Subject: [PATCH 10/23] TST: Test series construct of dtype timedelta64 #35465
 (#47801)

* TST GH35465 Tests for repr of Series with dtype timedelta64
* pre commit test file
* TST GH35465 Tests for Series constructs with dtype timedelta64
* restore pandas/tests/series/test_repr.py
* perform pre-commit
* use int64 for tests
---
 pandas/tests/series/test_constructors.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 4e4ee4fd12d5f3..de9a682acdfd61 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -1879,6 +1879,28 @@ def test_constructor_bool_dtype_missing_values(self):
         expected = Series(True, index=[0], dtype="bool")
         tm.assert_series_equal(result, expected)
 
+    def test_constructor_dtype_timedelta_alternative_construct(self):
+        # GH#35465
+        result = Series([1000000, 200000, 3000000], dtype="timedelta64[ns]")
+        expected = Series(pd.to_timedelta([1000000, 200000, 3000000], unit="ns"))
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dtype_timedelta_ns_s(self):
+        # GH#35465
+        result = Series([1000000, 200000, 3000000], dtype="timedelta64[ns]")
+        expected = Series([1000000, 200000, 3000000], dtype="timedelta64[s]")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dtype_timedelta_ns_s_astype_int64(self):
+        # GH#35465
+        result = Series([1000000, 200000, 3000000], dtype="timedelta64[ns]").astype(
+            "int64"
+        )
+        expected = Series([1000000, 200000, 3000000], dtype="timedelta64[s]").astype(
+            "int64"
+        )
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.filterwarnings(
         "ignore:elementwise comparison failed:DeprecationWarning"
     )

From 8a71e839d6a9aed26cb9b98fad76f25464859203 Mon Sep 17 00:00:00 2001
From: BarkotBeyene <38720673+BarkotBeyene@users.noreply.github.com>
Date: Thu, 28 Jul 2022 12:12:13 -0500
Subject: [PATCH 11/23] DOC: Additions/updates to documentation-GH46359 (#47877)

---
 pandas/core/ops/docstrings.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py
index aa469c2e077260..9c3158b3465b76 100644
--- a/pandas/core/ops/docstrings.py
+++ b/pandas/core/ops/docstrings.py
@@ -461,7 +461,7 @@ def make_flex_doc(op_name: str, typ: str) -> str:
 
 Parameters
 ----------
-other : scalar, sequence, Series, or DataFrame
+other : scalar, sequence, Series, dict or DataFrame
     Any single or multiple element data structure, or list-like object.
 axis : {{0 or 'index', 1 or 'columns'}}
     Whether to compare by the index (0 or 'index') or columns.
@@ -556,6 +556,20 @@ def make_flex_doc(op_name: str, typ: str) -> str:
 triangle       2      179
 rectangle      3      359
 
+Multiply a dictionary by axis.
+
+>>> df.mul({{'angles': 0, 'degrees': 2}})
+            angles  degrees
+circle           0      720
+triangle         0      360
+rectangle        0      720
+
+>>> df.mul({{'circle': 0, 'triangle': 2, 'rectangle': 3}}, axis='index')
+           angles  degrees
+circle          0        0
+triangle        6      360
+rectangle      12     1080
+
 Multiply a DataFrame of different shape with operator version.
 
 >>> other = pd.DataFrame({{'angles': [0, 3, 4]}},

From f171c966f0ba132dd24fc635041da680d2985b07 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 28 Jul 2022 10:33:53 -0700
Subject: [PATCH 12/23] CLN/DOC: Remove sphinx referencing the wiki (#47853)

---
 doc/source/conf.py          |  1 -
 web/pandas/about/roadmap.md | 49 ++++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 2a6ec8947c8d79..81ff14d33758ab 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -453,7 +453,6 @@
 # extlinks alias
 extlinks = {
     "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"),
-    "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "),
 }

diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md
index 35a6b3361f32ee..3c6c4d4fdf9a22 100644
--- a/web/pandas/about/roadmap.md
+++ b/web/pandas/about/roadmap.md
@@ -103,9 +103,52 @@ uses label-based, rather than position-based, indexing.
 We propose that it should only work with positional indexing, and
 the translation of keys to positions should be entirely done at a
 higher level.
 
-Indexing is a complicated API with many subtleties. This refactor will
-require care and attention. More details are discussed at
-
+Indexing is a complicated API with many subtleties. This refactor will require care
+and attention. The following principles should inspire refactoring of indexing code and
+should result in cleaner, simpler, and more performant code.
+
+1. Label indexing must never involve looking in an axis twice for the same label(s).
+This implies that any validation step must either:
+
+    * limit validation to general features (e.g. dtype/structure of the key/index), or
+    * reuse the result for the actual indexing.
+
+2. Indexers must never rely on an explicit call to other indexers.
+For instance, it is OK to have some internal method of `.loc` call some
+internal method of `__getitem__` (or of their common base class),
+but never in the code flow of `.loc` should `the_obj[something]` appear.
+
+3. Execution of positional indexing must never involve labels (as currently, sadly, happens).
+That is, the code flow of a getter call (or a setter call in which the right hand side is non-indexed)
+to `.iloc` should never involve the axes of the object in any way.
+
+4. Indexing must never involve accessing/modifying values (i.e., act on `._data` or `.values`) more than once.
+The following steps must hence be clearly decoupled:
+
+    * find positions we need to access/modify on each axis
+    * (if we are accessing) derive the type of object we need to return (dimensionality)
+    * actually access/modify the values
+    * (if we are accessing) construct the return object
+
+5. As a corollary to the decoupling between 4.i and 4.iii, any code which deals with how data is stored
+(including any combination of handling multiple dtypes, and sparse storage, categoricals, third-party types)
+must be independent from code that deals with identifying affected rows/columns,
+and take place only once step 4.i is completed.
+
+    * In particular, such code should most probably not live in `pandas/core/indexing.py`
+    * ... and must not depend in any way on the type(s) of axes (e.g. no `MultiIndex` special cases)
+
+6. As a corollary to point 1.i, `Index` (sub)classes must provide separate methods for any desired validity check of label(s) which does not involve actual lookup,
+on the one side, and for any required conversion/adaptation/lookup of label(s), on the other.
+
+7. Use of trial and error should be limited, and anyway restricted to catch only exceptions
+which are actually expected (typically `KeyError`).
+
+    * In particular, code should never (intentionally) raise new exceptions in the `except` portion of a `try... except`
+
+8. Any code portion which is not specific to setters and getters must be shared,
+and when small differences in behavior are expected (e.g. getting with `.loc` raises for
+missing labels, setting still doesn't), they can be managed with a specific parameter.
 
 ## Numba-accelerated operations

From 1497bf24cd222dab6388f6ce555bd02cb9a45104 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 28 Jul 2022 10:37:37 -0700
Subject: [PATCH 13/23] ENH/TST: Add argsort/min/max for ArrowExtensionArray
 (#47811)

---
 pandas/core/arrays/arrow/array.py     | 58 ++++++++++++++++++++++++++-
 pandas/tests/extension/test_arrow.py  | 47 ++++++++++++++++++++++
 pandas/tests/extension/test_string.py | 43 +++++++++++++++++++-
 pandas/tests/indexes/test_common.py   |  9 ++++-
 pandas/tests/indexes/test_setops.py   |  5 ++-
 5 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index a882d3a955469a..841275e54e3d6b 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -22,8 +22,12 @@
     pa_version_under4p0,
     pa_version_under5p0,
     pa_version_under6p0,
+    pa_version_under7p0,
+)
+from pandas.util._decorators import (
+    deprecate_nonkeyword_arguments,
+    doc,
 )
-from pandas.util._decorators import doc
 
 from pandas.core.dtypes.common import (
     is_array_like,
@@ -418,6 +422,58 @@ def isna(self) -> npt.NDArray[np.bool_]:
         else:
             return self._data.is_null().to_numpy()
 
+    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
+    def argsort(
+        self,
+        ascending: bool = True,
+        kind: str = "quicksort",
+        na_position: str = "last",
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        order = "ascending" if ascending else "descending"
+        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
+        if null_placement is None or pa_version_under7p0:
+            # Although pc.array_sort_indices exists in version 6
+            # there's a bug that affects the pa.ChunkedArray backing
+            # https://issues.apache.org/jira/browse/ARROW-12042
+            fallback_performancewarning("7")
+            return super().argsort(
+                ascending=ascending, kind=kind, na_position=na_position
+            )
+
+        result = pc.array_sort_indices(
+            self._data, order=order, null_placement=null_placement
+        )
+        if pa_version_under2p0:
+            np_result = result.to_pandas().values
+        else:
+            np_result = result.to_numpy()
+        return np_result.astype(np.intp, copy=False)
+
+    def _argmin_max(self, skipna: bool, method: str) -> int:
+        if self._data.length() in (0, self._data.null_count) or (
+            self._hasna and not skipna
+        ):
+            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
+            # For skipna=False and data w/ null, pandas expects NotImplementedError
+            # let ExtensionArray.arg{max|min} raise
+            return getattr(super(), f"arg{method}")(skipna=skipna)
+
+        if pa_version_under6p0:
+            raise NotImplementedError(
+                f"arg{method} only implemented for pyarrow version >= 6.0"
+            )
+
+        value = getattr(pc, method)(self._data, skip_nulls=skipna)
+        return pc.index(self._data, value).as_py()
+
+    def argmin(self, skipna: bool = True) -> int:
+        return self._argmin_max(skipna, "min")
+
+    def argmax(self, skipna: bool = True) -> int:
+        return self._argmin_max(skipna, "max")
+
     def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         """
         Return a shallow copy of the array.

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index a2a96da02b2a6d..2f3482ddc48115 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1385,6 +1385,11 @@ def test_value_counts_with_normalize(self, data, request):
         )
         super().test_value_counts_with_normalize(data)
 
+    @pytest.mark.xfail(
+        pa_version_under6p0,
+        raises=NotImplementedError,
+        reason="argmin/max only implemented for pyarrow version >= 6.0",
+    )
     def test_argmin_argmax(
         self, data_for_sorting, data_missing_for_sorting, na_value, request
     ):
@@ -1395,8 +1400,50 @@ def test_argmin_argmax(
                 reason=f"{pa_dtype} only has 2 unique possible values",
             )
         )
+        elif pa.types.is_duration(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"min_max not supported in pyarrow for {pa_dtype}",
+                )
+            )
         super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)
 
+    @pytest.mark.parametrize(
+        "op_name, skipna, expected",
+        [
+            ("idxmax", True, 0),
+            ("idxmin", True, 2),
+            ("argmax", True, 0),
+            ("argmin", True, 2),
+            ("idxmax", False, np.nan),
+            ("idxmin", False, np.nan),
+            ("argmax", False, -1),
+            ("argmin", False, -1),
+        ],
+    )
+    def test_argreduce_series(
+        self, data_missing_for_sorting, op_name, skipna, expected, request
+    ):
+        pa_dtype = data_missing_for_sorting.dtype.pyarrow_dtype
+        if pa_version_under6p0 and skipna:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason="min_max not supported in pyarrow",
+                )
+            )
+        elif not pa_version_under6p0 and pa.types.is_duration(pa_dtype) and skipna:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"min_max not supported in pyarrow for {pa_dtype}",
+                )
+            )
+        super().test_argreduce_series(
+            data_missing_for_sorting, op_name, skipna, expected
+        )
+
     @pytest.mark.parametrize("ascending", [True, False])
     def test_sort_values(self, data_for_sorting, ascending, sort_by_key, request):
         pa_dtype = data_for_sorting.dtype.pyarrow_dtype

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 6cea21b6672d87..e4293d6d70e387 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -167,7 +167,48 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 class TestMethods(base.BaseMethodsTests):
-    pass
+    def test_argmin_argmax(
+        self, data_for_sorting, data_missing_for_sorting, na_value, request
+    ):
+        if pa_version_under6p0 and data_missing_for_sorting.dtype.storage == "pyarrow":
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason="min_max not supported in pyarrow",
+                )
+            )
+        super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)
+
+    @pytest.mark.parametrize(
+        "op_name, skipna, expected",
+        [
+            ("idxmax", True, 0),
+            ("idxmin", True, 2),
+            ("argmax", True, 0),
+            ("argmin", True, 2),
+            ("idxmax", False, np.nan),
+            ("idxmin", False, np.nan),
+            ("argmax", False, -1),
+            ("argmin", False, -1),
+        ],
+    )
+    def test_argreduce_series(
+        self, data_missing_for_sorting, op_name, skipna, expected, request
+    ):
+        if (
+            pa_version_under6p0
+            and data_missing_for_sorting.dtype.storage == "pyarrow"
+            and skipna
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason="min_max not supported in pyarrow",
+                )
+            )
+        super().test_argreduce_series(
+            data_missing_for_sorting, op_name, skipna, expected
+        )
 
 
 class TestCasting(base.BaseCastingTests):

diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index d582a469eaf0e6..e7e971f957e480 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -10,7 +10,7 @@
 
 from pandas.compat import (
     IS64,
-    pa_version_under2p0,
+    pa_version_under7p0,
 )
 
 from pandas.core.dtypes.common import is_integer_dtype
@@ -396,11 +396,16 @@ def test_astype_preserves_name(self, index, dtype):
             # imaginary components discarded
             warn = np.ComplexWarning
 
+        is_pyarrow_str = (
+            str(index.dtype) == "string[pyarrow]"
+            and pa_version_under7p0
+            and dtype == "category"
+        )
         try:
             # Some of these conversions cannot succeed so we use a try / except
             with tm.assert_produces_warning(
                 warn,
-                raise_on_extra_warnings=not pa_version_under2p0,
+                raise_on_extra_warnings=is_pyarrow_str,
             ):
                 result = index.astype(dtype)
         except (ValueError, TypeError, NotImplementedError, SystemError):

diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index f38a6c89e1bcb2..45ecd09e550d00 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
+
 from pandas.core.dtypes.cast import find_common_type
 
 from pandas import (
@@ -177,7 +179,8 @@ def test_dunder_inplace_setops_deprecated(index):
     with tm.assert_produces_warning(FutureWarning):
         index &= index
 
-    with tm.assert_produces_warning(FutureWarning):
+    is_pyarrow = str(index.dtype) == "string[pyarrow]" and pa_version_under7p0
+    with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=is_pyarrow):
         index ^= index

From ffc111c2a93a7491247f21044022f92032c2c32c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 28 Jul 2022 11:12:36 -0700
Subject: [PATCH 14/23] ENH/TST: Add quantile & mode tests for
 ArrowExtensionArray (#47744)

---
 pandas/compat/__init__.py            |  2 +
 pandas/compat/pyarrow.py             |  2 +
 pandas/core/arrays/arrow/array.py    | 51 ++++++++++++++++++++
 pandas/tests/extension/test_arrow.py | 72 +++++++++++++++++++++++++++-
 4 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 5db859897b663b..147134afd70c3f 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -28,6 +28,7 @@
     pa_version_under6p0,
     pa_version_under7p0,
     pa_version_under8p0,
+    pa_version_under9p0,
 )
 
 if TYPE_CHECKING:
@@ -160,4 +161,5 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
     "pa_version_under6p0",
     "pa_version_under7p0",
     "pa_version_under8p0",
+    "pa_version_under9p0",
 ]

diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
index 833cda20368a26..6965865acb5da7 100644
--- a/pandas/compat/pyarrow.py
+++ b/pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
     pa_version_under6p0 = _palv < Version("6.0.0")
     pa_version_under7p0 = _palv < Version("7.0.0")
     pa_version_under8p0 = _palv < Version("8.0.0")
+    pa_version_under9p0 = _palv < Version("9.0.0")
 except ImportError:
     pa_version_under1p01 = True
     pa_version_under2p0 = True
@@ -26,3 +27,4 @@
     pa_version_under6p0 = True
     pa_version_under7p0 = True
     pa_version_under8p0 = True
+    pa_version_under9p0 = True

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 841275e54e3d6b..f35d744763478e 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -881,6 +881,57 @@ def _indexing_key_to_indices(
             indices = np.arange(n)[key]
         return indices
 
+    # TODO: redefine _rank using pc.rank with pyarrow 9.0
+
+    def _quantile(
+        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
+    ) -> ArrowExtensionArrayT:
+        """
+        Compute the quantiles of self for each quantile in `qs`.
+
+        Parameters
+        ----------
+        qs : np.ndarray[float64]
+        interpolation: str
+
+        Returns
+        -------
+        same type as self
+        """
+        if pa_version_under4p0:
+            raise NotImplementedError(
+                "quantile only supported for pyarrow version >= 4.0"
+            )
+        result = pc.quantile(self._data, q=qs, interpolation=interpolation)
+        return type(self)(result)
+
+    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
+        """
+        Returns the mode(s) of the ExtensionArray.
+
+        Always returns `ExtensionArray` even if only one value.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't consider counts of NA values.
+            Not implemented by pyarrow.
+
+        Returns
+        -------
+        same type as self
+            Sorted, if possible.
+        """
+        if pa_version_under6p0:
+            raise NotImplementedError("mode only supported for pyarrow version >= 6.0")
+        modes = pc.mode(self._data, pc.count_distinct(self._data).as_py())
+        values = modes.field(0)
+        counts = modes.field(1)
+        # counts sorted descending i.e counts[0] = max
+        mask = pc.equal(counts, counts[0])
+        most_common = values.filter(mask)
+        return type(self)(most_common)
+
     def _maybe_convert_setitem_value(self, value):
         """Maybe convert value to be pyarrow compatible."""
         # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 2f3482ddc48115..43c52ef8848e2b 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -10,7 +10,6 @@
 classes (if they are relevant for the extension
 interface for all dtypes), or be added to the array-specific tests in
 `pandas/tests/arrays/`.
 """
-
 from datetime import (
     date,
     datetime,
@@ -24,8 +23,10 @@
 from pandas.compat import (
     pa_version_under2p0,
     pa_version_under3p0,
+    pa_version_under4p0,
     pa_version_under6p0,
     pa_version_under8p0,
+    pa_version_under9p0,
 )
 
 import pandas as pd
@@ -1993,3 +1994,72 @@ def test_compare_array(self, data, comparison_op, na_value, request):
 
 def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
     with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
         ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
+
+
+@pytest.mark.xfail(
+    pa_version_under4p0,
+    raises=NotImplementedError,
+    reason="quantile only supported for pyarrow version >= 4.0",
+)
+@pytest.mark.parametrize(
+    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
+)
+@pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]])
+def test_quantile(data, interpolation, quantile, request):
+    pa_dtype = data.dtype.pyarrow_dtype
+    if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)):
+        request.node.add_marker(
+            pytest.mark.xfail(
+                raises=pa.ArrowNotImplementedError,
+                reason=f"quantile not supported by pyarrow for {pa_dtype}",
+            )
+        )
+    data = data.take([0, 0, 0])
+    ser = pd.Series(data)
+    result = ser.quantile(q=quantile, interpolation=interpolation)
+    if quantile == 0.5:
+        assert result == data[0]
+    else:
+        # Just check the values
+        result = result.astype("float64[pyarrow]")
+        expected = pd.Series(
+            data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5]
+        )
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.xfail(
+    pa_version_under6p0,
+    raises=NotImplementedError,
+    reason="mode only supported for pyarrow version >= 6.0",
+)
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize(
+    "take_idx, exp_idx",
+    [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
+    ids=["multi_mode", "single_mode"],
+)
+def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request):
+    pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+    if pa.types.is_temporal(pa_dtype):
+        request.node.add_marker(
+            pytest.mark.xfail(
+                raises=pa.ArrowNotImplementedError,
+                reason=f"mode not supported by pyarrow for {pa_dtype}",
+            )
+        )
+    elif (
+        pa.types.is_boolean(pa_dtype)
+        and "multi_mode" in request.node.nodeid
+        and pa_version_under9p0
+    ):
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="https://issues.apache.org/jira/browse/ARROW-17096",
+            )
+        )
+    data = data_for_grouping.take(take_idx)
+    ser = pd.Series(data)
+    result = ser.mode(dropna=dropna)
+    expected = pd.Series(data_for_grouping.take(exp_idx))
+    tm.assert_series_equal(result, expected)

From 6ed93fc47b77bd73580fb05a5c67191c916ae8ff Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 28 Jul 2022 11:12:56 -0700
Subject: [PATCH 15/23] DOC: Add sphinx-toggleprompt and sphinx-copybutton
 (#47870)

---
 doc/source/conf.py   | 25 +++++++++++++++----------
 environment.yml      |  2 ++
 requirements-dev.txt |  2 ++
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 2a6ec8947c8d79..33c916f532e90e 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -50,23 +50,25 @@
 # sphinxext.
 
 extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
-    "sphinx.ext.doctest",
-    "sphinx.ext.extlinks",
-    "sphinx.ext.todo",
-    "numpydoc",  # handle NumPy documentation formatted docstrings
+    "contributors",  # custom pandas extension
     "IPython.sphinxext.ipython_directive",
     "IPython.sphinxext.ipython_console_highlighting",
     "matplotlib.sphinxext.plot_directive",
-    "sphinx.ext.intersphinx",
+    "numpydoc",
+    "sphinx_copybutton",
+    "sphinx_panels",
+    "sphinx_toggleprompt",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
     "sphinx.ext.coverage",
-    "sphinx.ext.mathjax",
+    "sphinx.ext.doctest",
+    "sphinx.ext.extlinks",
     "sphinx.ext.ifconfig",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.linkcode",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.todo",
     "nbsphinx",
-    "sphinx_panels",
-    "contributors",  # custom pandas extension
 ]
 
 exclude_patterns = [
@@ -144,6 +146,9 @@
 # already loads it
 panels_add_bootstrap_css = False
 
+# https://sphinx-toggleprompt.readthedocs.io/en/stable/#offset
+toggleprompt_offset_right = 35
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["../_templates"]

diff --git a/environment.yml b/environment.yml
index eb4d53e1169274..fd0822b13984e9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -103,6 +103,7 @@ dependencies:
   - pytest-cython  # doctest
   - sphinx
   - sphinx-panels
+  - sphinx-copybutton
   - types-python-dateutil
   - types-PyMySQL
   - types-pytz
@@ -128,3 +129,4 @@ dependencies:
   - jupyterlab >=3.4,<4
   - pip:
     - jupyterlite==0.1.0b10
+    - sphinx-toggleprompt

diff --git a/requirements-dev.txt b/requirements-dev.txt
index ff410c59b43dde..e1e98ef8ffe23f 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -80,6 +80,7 @@
 pydata-sphinx-theme==0.8.0
 pytest-cython
 sphinx
 sphinx-panels
+sphinx-copybutton
 types-python-dateutil
 types-PyMySQL
 types-pytz
@@ -98,4 +99,5 @@
 pyyaml
 requests
 jupyterlab >=3.4,<4
 jupyterlite==0.1.0b10
+sphinx-toggleprompt
 setuptools>=51.0.0

From d343e59c14677fdf3f7e8065747fb8cc1a4cb19d Mon Sep 17 00:00:00 2001
From: Xingrong Chen <56777910+xr-chen@users.noreply.github.com>
Date: Thu, 28 Jul 2022 18:05:57 -0500
Subject: [PATCH 16/23] BUG: fix Dataframe.join with categorical index leads
 to unexpected reordering (#47881)

* BUG: fix Dataframe.join with categorical index leads to unexpected reordering
* pre-commit issue
---
 doc/source/whatsnew/v1.5.0.rst          |  1 +
 pandas/core/indexes/base.py             |  1 +
 pandas/tests/reshape/merge/test_join.py | 15 +++++++++++++++
 3 files changed, 17 insertions(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index e57166f7a4861b..786bdd502fb1b1 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -1024,6 +1024,7 @@ Reshaping
 - Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`)
 - Bug in :meth:`concat` when ``axis=1`` and ``sort=False`` where the resulting Index was a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`46675`)
 - Bug in :meth:`wide_to_long` raises when ``stubnames`` is missing in columns and ``i`` contains string dtype column (:issue:`46044`)
+- Bug in :meth:`DataFrame.join` with categorical index results in unexpected reordering (:issue:`47812`)
 
 Sparse
 ^^^^^^

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a212da050e1f11..239e6656ea1514 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4682,6 +4682,7 @@ def join(
             not isinstance(self, ABCMultiIndex)
             or not any(is_categorical_dtype(dtype) for dtype in
self.dtypes) ) + and not is_categorical_dtype(self.dtype) ): # Categorical is monotonic if data are ordered as categories, but join can # not handle this in case of not lexicographically monotonic GH#38502 diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 905c2af2d22a5b..d97c6a3dacdc36 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -712,6 +712,21 @@ def test_join_datetime_string(self): ) tm.assert_frame_equal(result, expected) + def test_join_with_categorical_index(self): + # GH47812 + ix = ["a", "b"] + id1 = pd.CategoricalIndex(ix, categories=ix) + id2 = pd.CategoricalIndex(reversed(ix), categories=reversed(ix)) + + df1 = DataFrame({"c1": ix}, index=id1) + df2 = DataFrame({"c2": reversed(ix)}, index=id2) + result = df1.join(df2) + expected = DataFrame( + {"c1": ["a", "b"], "c2": ["a", "b"]}, + index=pd.CategoricalIndex(["a", "b"], categories=["a", "b"]), + ) + tm.assert_frame_equal(result, expected) + def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): From 1ff651fbe993255a0bf820d604db9c1b7206ffca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 28 Jul 2022 16:37:27 -0700 Subject: [PATCH 17/23] CI: Pin cython on 32bit build (#47889) * CI: Pin cython on 32bit build * == --- .github/workflows/32-bit-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml index e091160c952f8b..8c74a53feed9b2 100644 --- a/.github/workflows/32-bit-linux.yml +++ b/.github/workflows/32-bit-linux.yml @@ -38,7 +38,7 @@ jobs: /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ . ~/virtualenvs/pandas-dev/bin/activate && \ python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ + pip install cython==0.29.30 numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ python setup.py build_ext -q -j2 && \ python -m pip install --no-build-isolation --no-use-pep517 -e . && \ export PANDAS_CI=1 && \ From 23c53bb6db707104cf2d805a8670d94c2e4b4ebf Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 28 Jul 2022 21:33:19 -0400 Subject: [PATCH 18/23] DEPR: args and kwargs in rolling, expanding, and ewm ops (#47851) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/window/common.py | 38 +++++++++++++ pandas/core/window/doc.py | 8 ++- pandas/core/window/ewm.py | 11 +++- pandas/core/window/expanding.py | 19 ++++++- pandas/core/window/rolling.py | 15 +++++ pandas/tests/window/test_api.py | 80 ++++++++++++++++++++++++++- pandas/tests/window/test_ewm.py | 16 ++++-- pandas/tests/window/test_expanding.py | 16 ++++-- pandas/tests/window/test_rolling.py | 16 ++++-- pandas/tests/window/test_win_type.py | 16 ++++-- 11 files changed, 206 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 786bdd502fb1b1..d138ebb9c02a3f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -773,6 +773,7 @@ Other Deprecations - Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) - Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) - Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on this methods (:issue:`47728`) +- Deprecated arguments ``*args`` and ``**kwargs`` in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. (:issue:`47836`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index ed2a4002f5ce7a..e31b5c60a37eec 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -3,9 +3,12 @@ from collections import defaultdict from typing import cast +import warnings import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -167,3 +170,38 @@ def prep_binary(arg1, arg2): X = arg1 + 0 * arg2 Y = arg2 + 0 * arg1 return X, Y + + +def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: + """ + Warn for deprecation of args and kwargs in rolling/expanding functions. + + Parameters + ---------- + cls : type + Class to warn about. + kernel : str + Operation name. + args : tuple or None + args passed by user. Will be None if and only if kernel does not have args. + kwargs : dict or None + kwargs passed by user. Will be None if and only if kernel does not have kwargs. + """ + warn_args = args is not None and len(args) > 0 + warn_kwargs = kwargs is not None and len(kwargs) > 0 + if warn_args and warn_kwargs: + msg = "args and kwargs" + elif warn_args: + msg = "args" + elif warn_kwargs: + msg = "kwargs" + else: + msg = "" + if msg != "": + warnings.warn( + f"Passing additional {msg} to {cls.__name__}.{kernel} has " + "no impact on the result and is deprecated. This will " + "raise a TypeError in a future version of pandas.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index 4fe08e2fa20b3a..835085d41cffa5 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -43,14 +43,18 @@ def create_section_header(header: str) -> str: args_compat = dedent( """ *args - For NumPy compatibility and will not have an effect on the result.\n + For NumPy compatibility and will not have an effect on the result. + + .. deprecated:: 1.5.0\n """ ).replace("\n", "", 1) kwargs_compat = dedent( """ **kwargs - For NumPy compatibility and will not have an effect on the result.\n + For NumPy compatibility and will not have an effect on the result. + + .. 
deprecated:: 1.5.0\n """ ).replace("\n", "", 1) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 3a42a4b1a16636..020ca710500158 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -42,7 +42,10 @@ get_jit_arguments, maybe_use_numba, ) -from pandas.core.window.common import zsqrt +from pandas.core.window.common import ( + maybe_warn_args_and_kwargs, + zsqrt, +) from pandas.core.window.doc import ( _shared_docs, args_compat, @@ -546,6 +549,7 @@ def mean( engine_kwargs=None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs) if maybe_use_numba(engine): if self.method == "single": func = generate_numba_ewm_func @@ -603,6 +607,7 @@ def sum( engine_kwargs=None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs) if not self.adjust: raise NotImplementedError("sum is not implemented with adjust=False") if maybe_use_numba(engine): @@ -658,6 +663,7 @@ def sum( agg_method="std", ) def std(self, bias: bool = False, numeric_only: bool = False, *args, **kwargs): + maybe_warn_args_and_kwargs(type(self), "std", args, kwargs) nv.validate_window_func("std", args, kwargs) if ( numeric_only @@ -702,6 +708,7 @@ def vol(self, bias: bool = False, *args, **kwargs): agg_method="var", ) def var(self, bias: bool = False, numeric_only: bool = False, *args, **kwargs): + maybe_warn_args_and_kwargs(type(self), "var", args, kwargs) nv.validate_window_func("var", args, kwargs) window_func = window_aggregations.ewmcov wfunc = partial( @@ -756,6 +763,7 @@ def cov( ): from pandas import Series + maybe_warn_args_and_kwargs(type(self), "cov", None, kwargs) self._validate_numeric_only("cov", numeric_only) def cov_func(x, y): @@ -829,6 +837,7 @@ def corr( ): from pandas import Series + maybe_warn_args_and_kwargs(type(self), "corr", None, kwargs) self._validate_numeric_only("corr", numeric_only) def cov_func(x, y): diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index dcdcbc0483d596..e997ffe1ec1323 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -25,6 +25,7 @@ ExpandingIndexer, GroupbyIndexer, ) +from pandas.core.window.common import maybe_warn_args_and_kwargs from pandas.core.window.doc import ( _shared_docs, args_compat, @@ -252,6 +253,7 @@ def sum( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs) nv.validate_expanding_func("sum", args, kwargs) return super().sum( numeric_only=numeric_only, @@ -285,6 +287,7 @@ def max( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "max", args, kwargs) nv.validate_expanding_func("max", args, kwargs) return super().max( numeric_only=numeric_only, @@ -318,6 +321,7 @@ def min( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "min", args, kwargs) nv.validate_expanding_func("min", args, kwargs) return super().min( numeric_only=numeric_only, @@ -351,6 +355,7 @@ def mean( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs) nv.validate_expanding_func("mean", args, kwargs) return super().mean( numeric_only=numeric_only, @@ -382,6 +387,7 @@ def median( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "median", None, kwargs) return super().median( numeric_only=numeric_only, engine=engine, @@ -446,6 +452,7 @@ def std( engine_kwargs: dict[str, bool] | None 
= None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "std", args, kwargs) nv.validate_expanding_func("std", args, kwargs) return super().std( ddof=ddof, @@ -512,6 +519,7 @@ def var( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "var", args, kwargs) nv.validate_expanding_func("var", args, kwargs) return super().var( ddof=ddof, @@ -557,8 +565,9 @@ def var( aggregation_description="standard error of mean", agg_method="sem", ) - def sem(self, ddof: int = 1, *args, **kwargs): - return super().sem(ddof=ddof, **kwargs) + def sem(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): + maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs) + return super().sem(ddof=ddof, numeric_only=numeric_only, **kwargs) @doc( template_header, @@ -577,6 +586,7 @@ def sem(self, ddof: int = 1, *args, **kwargs): agg_method="skew", ) def skew(self, numeric_only: bool = False, **kwargs): + maybe_warn_args_and_kwargs(type(self), "skew", None, kwargs) return super().skew(numeric_only=numeric_only, **kwargs) @doc( @@ -618,6 +628,7 @@ def skew(self, numeric_only: bool = False, **kwargs): agg_method="kurt", ) def kurt(self, numeric_only: bool = False, **kwargs): + maybe_warn_args_and_kwargs(type(self), "kurt", None, kwargs) return super().kurt(numeric_only=numeric_only, **kwargs) @doc( @@ -656,6 +667,7 @@ def quantile( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "quantile", None, kwargs) return super().quantile( quantile=quantile, interpolation=interpolation, @@ -733,6 +745,7 @@ def rank( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "rank", None, kwargs) return super().rank( method=method, ascending=ascending, @@ -779,6 +792,7 @@ def cov( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "cov", None, kwargs) return super().cov( other=other, pairwise=pairwise, @@ -852,6 +866,7 @@ def corr( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "corr", None, kwargs) return super().corr( other=other, pairwise=pairwise, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 93f07c5d756255..84915e2f52f17d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -79,6 +79,7 @@ ) from pandas.core.window.common import ( flex_binary_moment, + maybe_warn_args_and_kwargs, zsqrt, ) from pandas.core.window.doc import ( @@ -2080,6 +2081,7 @@ def sum( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs) nv.validate_rolling_func("sum", args, kwargs) return super().sum( numeric_only=numeric_only, @@ -2113,6 +2115,7 @@ def max( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "max", args, kwargs) nv.validate_rolling_func("max", args, kwargs) return super().max( numeric_only=numeric_only, @@ -2161,6 +2164,7 @@ def min( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "min", args, kwargs) nv.validate_rolling_func("min", args, kwargs) return super().min( numeric_only=numeric_only, @@ -2216,6 +2220,7 @@ def mean( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs) nv.validate_rolling_func("mean", args, kwargs) return super().mean( numeric_only=numeric_only, @@ -2262,6 +2267,7 @@ def median( engine_kwargs: dict[str, bool] | None = None, **kwargs, 
): + maybe_warn_args_and_kwargs(type(self), "median", None, kwargs) return super().median( numeric_only=numeric_only, engine=engine, @@ -2325,6 +2331,7 @@ def std( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "std", args, kwargs) nv.validate_rolling_func("std", args, kwargs) return super().std( ddof=ddof, @@ -2390,6 +2397,7 @@ def var( engine_kwargs: dict[str, bool] | None = None, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "var", args, kwargs) nv.validate_rolling_func("var", args, kwargs) return super().var( ddof=ddof, @@ -2416,6 +2424,7 @@ def var( agg_method="skew", ) def skew(self, numeric_only: bool = False, **kwargs): + maybe_warn_args_and_kwargs(type(self), "skew", None, kwargs) return super().skew(numeric_only=numeric_only, **kwargs) @doc( @@ -2454,6 +2463,7 @@ def skew(self, numeric_only: bool = False, **kwargs): agg_method="sem", ) def sem(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): + maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs) nv.validate_rolling_func("sem", args, kwargs) # Raise here so error message says sem instead of std self._validate_numeric_only("sem", numeric_only) @@ -2500,6 +2510,7 @@ def sem(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): agg_method="kurt", ) def kurt(self, numeric_only: bool = False, **kwargs): + maybe_warn_args_and_kwargs(type(self), "kurt", None, kwargs) return super().kurt(numeric_only=numeric_only, **kwargs) @doc( @@ -2557,6 +2568,7 @@ def quantile( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "quantile", None, kwargs) return super().quantile( quantile=quantile, interpolation=interpolation, @@ -2634,6 +2646,7 @@ def rank( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "rank", None, kwargs) return super().rank( method=method, ascending=ascending, @@ -2680,6 +2693,7 @@ def cov( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "cov", None, kwargs) return super().cov( other=other, pairwise=pairwise, @@ -2813,6 +2827,7 @@ def corr( numeric_only: bool = False, **kwargs, ): + maybe_warn_args_and_kwargs(type(self), "corr", None, kwargs) return super().corr( other=other, pairwise=pairwise, diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 2a8caa1d42d4d8..6495f7411938cc 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas.errors import SpecificationError +from pandas.errors import ( + SpecificationError, + UnsupportedFunctionCall, +) from pandas import ( DataFrame, @@ -409,3 +412,78 @@ def test_rolling_max_min_periods(step): msg = "min_periods 5 must be <= window 3" with pytest.raises(ValueError, match=msg): Series([1, 2, 3]).rolling(window=3, min_periods=5, step=step).max() + + +@pytest.mark.parametrize( + "roll_type, class_name", + [ + ("rolling", "Rolling"), + ("expanding", "Expanding"), + ("ewm", "ExponentialMovingWindow"), + ], +) +@pytest.mark.parametrize( + "kernel, has_args, raises", + [ + ("sum", True, True), + ("max", True, True), + ("min", True, True), + ("mean", True, True), + ("median", False, False), + ("std", True, True), + ("var", True, True), + ("skew", False, False), + ("sem", True, True), + ("kurt", False, False), + ("quantile", False, False), + ("rank", False, False), + ("cov", False, False), + ("corr", False, False), + ], +) +def test_args_kwargs_depr(roll_type, class_name, kernel, 
has_args, raises): + # GH#47836 + r = getattr(Series([2, 4, 6]), roll_type)(2) + error_msg = "numpy operations are not valid with window objects" + if kernel == "quantile": + required_args = (0.5,) + else: + required_args = () + + if roll_type == "ewm" and kernel not in ( + "sum", + "mean", + "std", + "var", + "cov", + "corr", + ): + # kernels not implemented for ewm + with pytest.raises(AttributeError, match=f"has no attribute '{kernel}'"): + getattr(r, kernel) + else: + warn_msg = f"Passing additional kwargs to {class_name}.{kernel}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + if raises: + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(r, kernel)(*required_args, dtype=np.float64) + else: + getattr(r, kernel)(*required_args, dtype=np.float64) + + if has_args: + warn_msg = f"Passing additional args to {class_name}.{kernel}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + # sem raises for rolling but not expanding + if raises and (roll_type != "expanding" or kernel != "sem"): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(r, kernel)(*required_args, 1, 2, 3, 4) + else: + getattr(r, kernel)(*required_args, 1, 2, 3, 4) + + warn_msg = f"Passing additional args and kwargs to {class_name}.{kernel}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + if raises: + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(r, kernel)(*required_args, 1, 2, 3, 4, dtype=np.float64) + else: + getattr(r, kernel)(*required_args, 1, 2, 3, 4, dtype=np.float64) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index e0051ee6d51c68..b524eb5978fa08 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -69,12 +69,16 @@ def test_numpy_compat(method): # see gh-12811 e = ExponentialMovingWindow(Series([2, 4, 6]), alpha=0.5) - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(dtype=np.float64) + error_msg = "numpy operations are not valid with window objects" + + warn_msg = f"Passing additional args to ExponentialMovingWindow.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(e, method)(1, 2, 3) + warn_msg = f"Passing additional kwargs to ExponentialMovingWindow.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(e, method)(dtype=np.float64) def test_ewma_times_not_datetime_type(): diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index e0c9294c445f2a..a12997018052d8 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -59,12 +59,16 @@ def test_numpy_compat(method): # see gh-12811 e = Expanding(Series([2, 4, 6])) - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(e, method)(dtype=np.float64) + error_msg = "numpy operations are not valid with window objects" + + warn_msg = f"Passing additional args to Expanding.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with 
pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(e, method)(1, 2, 3) + warn_msg = f"Passing additional kwargs to Expanding.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(e, method)(dtype=np.float64) @pytest.mark.parametrize( diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 785603f6e05f03..c9ec2985488be1 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -158,12 +158,16 @@ def test_numpy_compat(method): # see gh-12811 r = Rolling(Series([2, 4, 6]), window=2) - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, method)(dtype=np.float64) + error_msg = "numpy operations are not valid with window objects" + + warn_msg = f"Passing additional args to Rolling.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(r, method)(1, 2, 3) + warn_msg = f"Passing additional kwargs to Rolling.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(r, method)(dtype=np.float64) @pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 8c8e9cadbfdc13..ba80ac19a6b6ae 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -79,12 +79,16 @@ def test_numpy_compat(method): # see gh-12811 w = Series([2, 4, 6]).rolling(window=2) - msg = "numpy operations are not valid with window objects" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(w, method)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(w, method)(dtype=np.float64) + error_msg = "numpy operations are not valid with window objects" + + warn_msg = f"Passing additional args to Rolling.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(w, method)(1, 2, 3) + warn_msg = f"Passing additional kwargs to Rolling.{method}" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(UnsupportedFunctionCall, match=error_msg): + getattr(w, method)(dtype=np.float64) @td.skip_if_no_scipy From 0b6d1207d431961eed2a19f5a05073f6ee43610b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 29 Jul 2022 16:56:42 -0400 Subject: [PATCH 19/23] CLN: Rename "add" to "sum" in groupby (#47892) * CLN: Rename "add" to "sum" * revert --- pandas/_libs/groupby.pyi | 2 +- pandas/_libs/groupby.pyx | 30 +++++++++---------- pandas/core/groupby/groupby.py | 8 ++--- pandas/core/groupby/ops.py | 14 ++++----- pandas/tests/groupby/aggregate/test_cython.py | 4 +-- pandas/tests/resample/test_datetime_index.py | 6 ++-- 6 files changed, 30 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index c7cb9705d7cb96..dfae1bff91ac8a 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -50,7 +50,7 @@ def group_any_all( val_test: Literal["any", "all"], skipna: bool, ) -> None: ... 
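Before the rename below, note the user-facing effect of the deprecation the tests above exercise: the numpy-compat extra arguments now warn before they fail. A small sketch, assuming a pandas build from this branch:

```python
import numpy as np
import pandas as pd
from pandas.errors import UnsupportedFunctionCall

r = pd.Series([2, 4, 6]).rolling(2)
print(r.sum())  # unchanged: no warning for a plain call

try:
    r.sum(dtype=np.float64)  # numpy-compat kwarg
except UnsupportedFunctionCall:
    # Still raises as before, but maybe_warn_args_and_kwargs now emits a
    # FutureWarning about the extra kwargs first.
    pass
```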
-def group_add( +def group_sum( out: np.ndarray, # complexfloating_t[:, ::1] counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[complexfloating_t, ndim=2] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index db785bd962f96a..06830a1d84c6eb 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -124,7 +124,7 @@ def group_median_float64( ndarray[intp_t] indexer float64_t* ptr - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" ngroups = len(counts) N, K = (values).shape @@ -502,7 +502,7 @@ def group_any_all( # ---------------------------------------------------------------------- -# group_add, group_prod, group_var, group_mean, group_ohlc +# group_sum, group_prod, group_var, group_mean, group_ohlc # ---------------------------------------------------------------------- ctypedef fused mean_t: @@ -511,17 +511,17 @@ ctypedef fused mean_t: complex64_t complex128_t -ctypedef fused add_t: +ctypedef fused sum_t: mean_t object @cython.wraparound(False) @cython.boundscheck(False) -def group_add( - add_t[:, ::1] out, +def group_sum( + sum_t[:, ::1] out, int64_t[::1] counts, - ndarray[add_t, ndim=2] values, + ndarray[sum_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=0, bint is_datetimelike=False, @@ -531,8 +531,8 @@ def group_add( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - add_t val, t, y - add_t[:, ::1] sumx, compensation + sum_t val, t, y + sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -546,7 +546,7 @@ def group_add( N, K = (values).shape - if add_t is object: + if sum_t is object: # NB: this does not use 'compensation' like the non-object track does. for i in range(N): lab = labels[i] @@ -588,10 +588,10 @@ def group_add( # not nan # With dt64/td64 values, values have been cast to float64 - # instead if int64 for group_add, but the logic + # instead if int64 for group_sum, but the logic # is otherwise the same as in _treat_as_na if val == val and not ( - add_t is float64_t + sum_t is float64_t and is_datetimelike and val == NPY_NAT ): @@ -677,7 +677,7 @@ def group_var( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -745,7 +745,7 @@ def group_mean( Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. min_count : Py_ssize_t - Only used in add and prod. Always -1. + Only used in sum and prod. Always -1. is_datetimelike : bool True if `values` contains datetime-like entries. 
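The `compensation` buffer threaded through `group_sum` above implements Kahan (compensated) summation, which is what keeps grouped float sums accurate; a pure-Python sketch of the technique, separate from the Cython code:

```python
def kahan_sum(values):
    total = compensation = 0.0
    for val in values:
        y = val - compensation          # re-apply low-order bits lost earlier
        t = total + y                   # big + small: low bits of y may drop
        compensation = (t - total) - y  # recover exactly what was dropped
        total = t
    return total

# Naive summation silently loses both 1.0 terms; Kahan does not.
print(sum([1e16, 1.0, 1.0, -1e16]))        # 0.0
print(kahan_sum([1e16, 1.0, 1.0, -1e16]))  # 2.0
```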
mask : ndarray[bool, ndim=2], optional @@ -766,7 +766,7 @@ def group_mean( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -821,7 +821,7 @@ def group_ohlc( Py_ssize_t i, j, N, K, lab floating val - assert min_count == -1, "'min_count' only used in add and prod" + assert min_count == -1, "'min_count' only used in sum and prod" if len(labels) == 0: return diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9b4991d32692b0..06422f8cc5cb0d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1338,7 +1338,6 @@ def _resolve_numeric_only( if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): # GH#47500 - how = "sum" if how == "add" else how warnings.warn( f"{type(self).__name__}.{how} called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will " @@ -1738,9 +1737,8 @@ def _cython_agg_general( kwd_name = "numeric_only" if how in ["any", "all"]: kwd_name = "bool_only" - kernel = "sum" if how == "add" else how raise NotImplementedError( - f"{type(self).__name__}.{kernel} does not implement {kwd_name}." + f"{type(self).__name__}.{how} does not implement {kwd_name}." ) elif not is_ser: data = data.get_numeric_data(copy=False) @@ -2417,7 +2415,7 @@ def sum( result = self._agg_general( numeric_only=numeric_only, min_count=min_count, - alias="add", + alias="sum", npfunc=np.sum, ) @@ -4341,8 +4339,6 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: - if how == "add": - how = "sum" if numeric_only is not lib.no_default and not numeric_only: # numeric_only was specified and falsey but still dropped nuisance columns warnings.warn( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6dc4ccfa8e1eed..283e4a48657c58 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -121,7 +121,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: _CYTHON_FUNCTIONS = { "aggregate": { - "add": "group_add", + "sum": "group_sum", "prod": "group_prod", "min": "group_min", "max": "group_max", @@ -213,7 +213,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["add", "var", "prod", "mean", "ohlc"] or ( + if how in ["sum", "var", "prod", "mean", "ohlc"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast @@ -241,7 +241,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): if isinstance(dtype, CategoricalDtype): # NotImplementedError for methods that can fall back to a # non-cython implementation. - if how in ["add", "prod", "cumsum", "cumprod"]: + if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") elif how not in ["rank"]: # only "rank" is implemented in cython @@ -258,7 +258,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): # TODO: same for period_dtype? no for these methods with Period # we raise NotImplemented if this is an invalid operation # entirely, e.g. 
adding datetimes - if how in ["add", "prod", "cumsum", "cumprod"]: + if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(dtype): if how in ["prod", "cumprod"]: @@ -311,7 +311,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: """ how = self.how - if how in ["add", "cumsum", "sum", "prod"]: + if how in ["sum", "cumsum", "sum", "prod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) elif how in ["mean", "median", "var"]: @@ -567,7 +567,7 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["add"]: + elif self.how in ["sum"]: # We support datetimelike func( out=result, @@ -625,7 +625,7 @@ def _call_cython_op( # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here # Casting only needed for float16, bool, datetimelike, - # and self.how in ["add", "prod", "ohlc", "cumprod"] + # and self.how in ["sum", "prod", "ohlc", "cumprod"] res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 869ed31b6a2d93..6c5a3ae67c78ae 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,7 +166,7 @@ def test_cython_fail_agg(): ("mean", np.mean), ("median", np.median), ("var", np.var), - ("add", np.sum), + ("sum", np.sum), ("prod", np.prod), ("min", np.min), ("max", np.max), @@ -214,7 +214,7 @@ def test_cython_agg_empty_buckets_nanops(observed): grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( - "add", alt=None, numeric_only=True + "sum", alt=None, numeric_only=True ) intervals = pd.interval_range(0, 20, freq=5, inclusive="right") expected = DataFrame( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index fbc3b385e5098f..970d4f155ecfc2 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -61,7 +61,7 @@ def test_custom_grouper(index): # check all cython functions work g.ohlc() # doesn't use _cython_agg_general - funcs = ["add", "mean", "prod", "min", "max", "var"] + funcs = ["sum", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f, alt=None, numeric_only=True) @@ -69,7 +69,7 @@ def test_custom_grouper(index): g = s.groupby(b) # check all cython functions work g.ohlc() # doesn't use _cython_agg_general - funcs = ["add", "mean", "prod", "min", "max", "var"] + funcs = ["sum", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f, alt=None, numeric_only=True) @@ -414,7 +414,7 @@ def test_resample_upsampling_picked_but_not_correct(): tm.assert_series_equal(result2, expected) -@pytest.mark.parametrize("f", ["add", "mean", "prod", "min", "max", "var"]) +@pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"]) def test_resample_frame_basic_cy_funcs(f): df = tm.makeTimeDataFrame() From 2a082fd062bb459da89e9ebbf229af168ad9d330 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 29 Jul 2022 16:42:27 -0700 Subject: [PATCH 20/23] BUG: preserve _id in MultiIndex.copy(deep=False) (#47900) --- pandas/core/indexes/multi.py | 5 +++++ pandas/tests/indexes/multi/test_copy.py | 12 ++++++++++++ 2 files changed, 17 
insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd6b6ba63d7e00..b4b576df9918ea 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1190,6 +1190,7 @@ def copy( This could be potentially expensive on large MultiIndex objects. """ names = self._validate_names(name=name, names=names, deep=deep) + keep_id = not deep if levels is not None: warnings.warn( "parameter levels is deprecated and will be removed in a future " @@ -1197,6 +1198,7 @@ def copy( FutureWarning, stacklevel=find_stack_level(), ) + keep_id = False if codes is not None: warnings.warn( "parameter codes is deprecated and will be removed in a future " @@ -1204,6 +1206,7 @@ def copy( FutureWarning, stacklevel=find_stack_level(), ) + keep_id = False if deep: from copy import deepcopy @@ -1225,6 +1228,8 @@ def copy( ) new_index._cache = self._cache.copy() new_index._cache.pop("levels", None) # GH32669 + if keep_id: + new_index._id = self._id if dtype: warnings.warn( diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 9a0e4bc0996beb..2b64845c919cf7 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -104,3 +104,15 @@ def test_copy_deprecated_parameters(deep, param_name, param_value): idx_copy = idx.copy(deep=deep, **{param_name: param_value}) assert [list(i) for i in getattr(idx_copy, param_name)] == param_value + + +def test_copy_deep_false_retains_id(): + # GH#47878 + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + + res = idx.copy(deep=False) + assert res._id is idx._id From 62a69beddbedde349891378992c902c0b9341a9f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jul 2022 17:44:45 -0700 Subject: [PATCH 21/23] DOC: Add numpydoc SS06 validation (#47885) --- ci/code_checks.sh | 4 +- pandas/_config/config.py | 5 +- pandas/_libs/interval.pyx | 10 +-- pandas/_libs/lib.pyx | 3 +- pandas/_libs/tslibs/nattype.pyx | 30 ++++---- pandas/_libs/tslibs/np_datetime.pyx | 8 +-- pandas/_libs/tslibs/offsets.pyx | 23 +++--- pandas/_libs/tslibs/period.pyx | 12 ++-- pandas/_libs/tslibs/timedeltas.pyx | 3 +- pandas/_libs/tslibs/timestamps.pyx | 31 ++++----- pandas/core/arrays/datetimelike.py | 6 +- pandas/core/arrays/datetimes.py | 21 +++--- pandas/core/arrays/interval.py | 32 ++++----- pandas/core/arrays/timedeltas.py | 9 +-- pandas/core/base.py | 9 +-- pandas/core/frame.py | 5 +- pandas/core/generic.py | 5 +- pandas/core/groupby/groupby.py | 8 +-- pandas/core/indexers/objects.py | 3 +- pandas/core/indexes/base.py | 24 +++---- pandas/core/indexes/datetimes.py | 14 ++-- pandas/core/indexes/multi.py | 6 +- pandas/core/indexes/numeric.py | 14 ++-- pandas/core/indexes/timedeltas.py | 8 +-- pandas/core/resample.py | 5 +- pandas/core/reshape/concat.py | 5 +- pandas/core/reshape/pivot.py | 7 +- pandas/core/series.py | 2 + pandas/core/strings/accessor.py | 5 +- pandas/errors/__init__.py | 104 +++++++++++++++------------- pandas/io/formats/style.py | 6 +- pandas/io/formats/style_render.py | 20 ++---- pandas/io/pytables.py | 5 +- pandas/io/stata.py | 6 +- pandas/plotting/_misc.py | 3 +- pandas/tests/scalar/test_nat.py | 1 + scripts/validate_docstrings.py | 16 ++++- 37 files changed, 242 insertions(+), 236 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b42ffc66f77140..113186c7461572 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -78,8 +78,8 @@ fi ### DOCSTRINGS 
### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05 + MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/pandas/_config/config.py b/pandas/_config/config.py index eacbf1b016432a..d5e77d824340d9 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -102,8 +102,9 @@ class RegisteredOption(NamedTuple): class OptionError(AttributeError, KeyError): """ - Exception for pandas.options, backwards compatible with KeyError - checks. + Exception raised for pandas.options. + + Backwards compatible with KeyError checks. """ diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index bc0a63c5c5a33c..ec1dbff6903e77 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -352,8 +352,9 @@ cdef class Interval(IntervalMixin): cdef readonly str inclusive """ - Whether the interval is inclusive on the left-side, right-side, both or - neither. + String describing the inclusive side the intervals. + + Either ``left``, ``right``, ``both`` or ``neither``. """ def __init__(self, left, right, inclusive: str | None = None, closed: None | lib.NoDefault = lib.no_default): @@ -384,10 +385,11 @@ cdef class Interval(IntervalMixin): @property def closed(self): """ - Whether the interval is closed on the left-side, right-side, both or - neither. + String describing the inclusive side the intervals. .. deprecated:: 1.5.0 + + Either ``left``, ``right``, ``both`` or ``neither``. """ warnings.warn( "Attribute `closed` is deprecated in favor of `inclusive`.", diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e353d224708b7b..c90c9003c8d604 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1347,8 +1347,7 @@ cdef object _try_infer_map(object dtype): def infer_dtype(value: object, skipna: bool = True) -> str: """ - Efficiently infer the type of a passed val, or list-like - array of values. Return a string describing the type. + Return a string label of the type of a scalar or list-like of values. Parameters ---------- diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 93687abdf91530..b05b0ba636251b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -446,6 +446,7 @@ class NaTType(_NaT): "weekday", """ Return the day of the week represented by the date. + Monday == 0 ... Sunday == 6. """, ) @@ -453,6 +454,7 @@ class NaTType(_NaT): "isoweekday", """ Return the day of the week represented by the date. + Monday == 1 ... Sunday == 7. """, ) @@ -533,10 +535,7 @@ class NaTType(_NaT): strftime = _make_error_func( "strftime", """ - Timestamp.strftime(format) - - Return a string representing the given POSIX timestamp - controlled by an explicit format string. 
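The newly enforced SS06 check requires a docstring's short summary to fit on one line; every docstring edit in the rest of this patch follows the same before/after shape, illustrated here on a made-up function rather than code from the diff:

```python
def is_monotonic_bad():
    """
    Return boolean if values in the object are
    monotonic_increasing.
    """


def is_monotonic_good():
    """
    Return boolean if values in the object are monotonically increasing.

    Anything beyond the first line belongs in the extended summary,
    which may wrap freely.
    """
```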
+ Return a formatted string of the Timestamp. Parameters ---------- @@ -680,10 +679,7 @@ class NaTType(_NaT): fromordinal = _make_error_func( "fromordinal", """ - Timestamp.fromordinal(ordinal, freq=None, tz=None) - - Passed an ordinal, translate and convert to a ts. - Note: by definition there cannot be any tz info on the ordinal itself. + Construct a timestamp from a a proleptic Gregorian ordinal. Parameters ---------- @@ -694,6 +690,10 @@ class NaTType(_NaT): tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Notes + ----- + By definition there cannot be any tz info on the ordinal itself. + Examples -------- >>> pd.Timestamp.fromordinal(737425) @@ -725,10 +725,7 @@ class NaTType(_NaT): now = _make_nat_func( "now", """ - Timestamp.now(tz=None) - - Return new Timestamp object representing current time local to - tz. + Return new Timestamp object representing current time local to tz. Parameters ---------- @@ -749,10 +746,9 @@ class NaTType(_NaT): today = _make_nat_func( "today", """ - Timestamp.today(cls, tz=None) + Return the current time in the local timezone. - Return the current time in the local timezone. This differs - from datetime.today() in that it can be localized to a + This differs from datetime.today() in that it can be localized to a passed timezone. Parameters @@ -1090,7 +1086,9 @@ timedelta}, default 'raise' tz_localize = _make_nat_func( "tz_localize", """ - Convert naive Timestamp to local time zone, or remove + Localize the Timestamp to a timezone. + + Convert naive Timestamp to local time zone or remove timezone from timezone-aware Timestamp. Parameters diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 8ab0ba24f91515..c58a8d4dc4ba67 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -156,16 +156,16 @@ cdef inline bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: class OutOfBoundsDatetime(ValueError): """ - Raised when the datetime is outside the range that - can be represented. + Raised when the datetime is outside the range that can be represented. """ pass class OutOfBoundsTimedelta(ValueError): """ - Raised when encountering a timedelta value that cannot be represented - as a timedelta64[ns]. + Raised when encountering a timedelta value that cannot be represented. + + Representation should be within a timedelta64[ns]. """ # Timedelta analogue to OutOfBoundsDatetime pass diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 5f4f6b998a60a1..48104965ec42b9 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -585,9 +585,7 @@ cdef class BaseOffset: def apply_index(self, dtindex): """ - Vectorized apply of DateOffset to DatetimeIndex, - raises NotImplementedError for offsets without a - vectorized implementation. + Vectorized apply of DateOffset to DatetimeIndex. .. deprecated:: 1.1.0 @@ -2448,8 +2446,7 @@ cdef class SemiMonthOffset(SingleConstructorOffset): cdef class SemiMonthEnd(SemiMonthOffset): """ - Two DateOffset's per month repeating on the last - day of the month and day_of_month. + Two DateOffset's per month repeating on the last day of the month & day_of_month. Parameters ---------- @@ -2470,8 +2467,7 @@ cdef class SemiMonthEnd(SemiMonthOffset): cdef class SemiMonthBegin(SemiMonthOffset): """ - Two DateOffset's per month repeating on the first - day of the month and day_of_month. + Two DateOffset's per month repeating on the first day of the month & day_of_month. 
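For readers skimming the reworded offset docstrings, the two semi-month offsets roll dates forward like this (a usage sketch with arbitrarily chosen dates and the default day_of_month=15):

```python
import pandas as pd
from pandas.tseries.offsets import SemiMonthBegin, SemiMonthEnd

ts = pd.Timestamp("2022-07-20")
print(ts + SemiMonthEnd())    # 2022-07-31: next of {15th, month end}
print(ts + SemiMonthBegin())  # 2022-08-01: next of {1st, 15th}
```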
Parameters ---------- @@ -2704,8 +2700,9 @@ cdef class WeekOfMonth(WeekOfMonthMixin): cdef class LastWeekOfMonth(WeekOfMonthMixin): """ - Describes monthly dates in last week of month like "the last Tuesday of - each month". + Describes monthly dates in last week of month. + + For example "the last Tuesday of each month". Parameters ---------- @@ -2991,8 +2988,9 @@ cdef class FY5253(FY5253Mixin): cdef class FY5253Quarter(FY5253Mixin): """ - DateOffset increments between business quarter dates - for 52-53 week fiscal year (also known as a 4-4-5 calendar). + DateOffset increments between business quarter dates for 52-53 week fiscal year. + + Also known as a 4-4-5 calendar. It is used by companies that desire that their fiscal year always end on the same day of the week. @@ -3602,8 +3600,7 @@ def _get_offset(name: str) -> BaseOffset: cpdef to_offset(freq): """ - Return DateOffset object from string or tuple representation - or datetime.timedelta object. + Return DateOffset object from string or datetime.timedelta object. Parameters ---------- diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 33326286277394..3acaa024f52c27 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2322,12 +2322,12 @@ cdef class _Period(PeriodMixin): def strftime(self, fmt: str) -> str: r""" - Returns the string representation of the :class:`Period`, depending - on the selected ``fmt``. ``fmt`` must be a string - containing one or several directives. The method recognizes the same - directives as the :func:`time.strftime` function of the standard Python - distribution, as well as the specific additional directives ``%f``, - ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. + Returns a formatted string representation of the :class:`Period`. + + ``fmt`` must be a string containing one or several directives. + The method recognizes the same directives as the :func:`time.strftime` + function of the standard Python distribution, as well as the specific + additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. (formatting & docs originally from scikits.timeries). +-----------+--------------------------------+-------+ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 39458c10ad35b6..215d1c9d6c7223 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1464,7 +1464,8 @@ cdef class _Timedelta(timedelta): def isoformat(self) -> str: """ - Format Timedelta as ISO 8601 Duration like + Format the Timedelta as ISO 8601 Duration. + ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the values. See https://en.wikipedia.org/wiki/ISO_8601#Durations. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index dc4da6c9bf4d23..66d848ba43da9d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1345,10 +1345,7 @@ class Timestamp(_Timestamp): @classmethod def fromordinal(cls, ordinal, freq=None, tz=None): """ - Timestamp.fromordinal(ordinal, freq=None, tz=None) - - Passed an ordinal, translate and convert to a ts. - Note: by definition there cannot be any tz info on the ordinal itself. + Construct a timestamp from a a proleptic Gregorian ordinal. Parameters ---------- @@ -1359,6 +1356,10 @@ class Timestamp(_Timestamp): tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Notes + ----- + By definition there cannot be any tz info on the ordinal itself. 
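Likewise, the tightened `Timedelta.isoformat` summary above is easy to sanity-check in a REPL; the values below are lifted from that method's own examples, minus the nanoseconds:

```python
import pandas as pd

td = pd.Timedelta(
    days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10
)
print(td.isoformat())  # P6DT0H50M3.010010S
```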
+ Examples -------- >>> pd.Timestamp.fromordinal(737425) @@ -1370,10 +1371,7 @@ class Timestamp(_Timestamp): @classmethod def now(cls, tz=None): """ - Timestamp.now(tz=None) - - Return new Timestamp object representing current time local to - tz. + Return new Timestamp object representing current time local to tz. Parameters ---------- @@ -1397,10 +1395,9 @@ class Timestamp(_Timestamp): @classmethod def today(cls, tz=None): """ - Timestamp.today(cls, tz=None) + Return the current time in the local timezone. - Return the current time in the local timezone. This differs - from datetime.today() in that it can be localized to a + This differs from datetime.today() in that it can be localized to a passed timezone. Parameters @@ -1477,10 +1474,7 @@ class Timestamp(_Timestamp): def strftime(self, format): """ - Timestamp.strftime(format) - - Return a string representing the given POSIX timestamp - controlled by an explicit format string. + Return a formatted string of the Timestamp. Parameters ---------- @@ -2052,7 +2046,9 @@ timedelta}, default 'raise' def tz_localize(self, tz, ambiguous='raise', nonexistent='raise'): """ - Convert naive Timestamp to local time zone, or remove + Localize the Timestamp to a timezone. + + Convert naive Timestamp to local time zone or remove timezone from timezone-aware Timestamp. Parameters @@ -2343,6 +2339,7 @@ default 'raise' def to_julian_date(self) -> np.float64: """ Convert TimeStamp to a Julian Date. + 0 Julian date is noon January 1, 4713 BC. Examples @@ -2374,6 +2371,7 @@ default 'raise' def isoweekday(self): """ Return the day of the week represented by the date. + Monday == 1 ... Sunday == 7. """ # same as super().isoweekday(), but that breaks because of how @@ -2383,6 +2381,7 @@ default 'raise' def weekday(self): """ Return the day of the week represented by the date. + Monday == 0 ... Sunday == 6. """ # same as super().weekday(), but that breaks because of how diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ea9414aaaa1a8d..11c236836e791d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -953,9 +953,9 @@ def freqstr(self) -> str | None: @property # NB: override with cache_readonly in immutable subclasses def inferred_freq(self) -> str | None: """ - Tries to return a string representing a frequency guess, - generated by infer_freq. Returns None if it can't autodetect the - frequency. + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. """ if self.ndim != 1: return None diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7a56bba0e58b39..ffd093b86582c9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -810,8 +810,7 @@ def tz_convert(self, tz) -> DatetimeArray: @dtl.ravel_compat def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArray: """ - Localize tz-naive Datetime Array/Index to tz-aware - Datetime Array/Index. + Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. This method takes a time zone (tz) naive Datetime Array/Index object and makes this time zone aware. It does not move the time to another @@ -993,8 +992,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr def to_pydatetime(self) -> npt.NDArray[np.object_]: """ - Return Datetime Array/Index as object ndarray of datetime.datetime - objects. 
+ Return an ndarray of datetime.datetime objects. Returns ------- @@ -1122,9 +1120,9 @@ def to_period(self, freq=None) -> PeriodArray: def to_perioddelta(self, freq) -> TimedeltaArray: """ - Calculate TimedeltaArray of difference between index - values and index converted to PeriodArray at specified - freq. Used for vectorized offsets. + Calculate deltas between self values and self converted to Periods at a freq. + + Used for vectorized offsets. Parameters ---------- @@ -1157,8 +1155,7 @@ def to_perioddelta(self, freq) -> TimedeltaArray: def month_name(self, locale=None) -> npt.NDArray[np.object_]: """ - Return the month names of the :class:`~pandas.Series` or - :class:`~pandas.DatetimeIndex` with specified locale. + Return the month names with specified locale. Parameters ---------- @@ -1202,8 +1199,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: def day_name(self, locale=None) -> npt.NDArray[np.object_]: """ - Return the day names of the :class:`~pandas.Series` or - :class:`~pandas.DatetimeIndex` with specified locale. + Return the day names with specified locale. Parameters ---------- @@ -1262,8 +1258,7 @@ def time(self) -> npt.NDArray[np.object_]: @property def timetz(self) -> npt.NDArray[np.object_]: """ - Returns numpy array of :class:`datetime.time` objects with timezone - information. + Returns numpy array of :class:`datetime.time` objects with timezones. The time part of the Timestamps. """ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 6469dccf6e2d5c..e7198a95c07f1f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1254,8 +1254,7 @@ def _format_space(self) -> str: @property def left(self): """ - Return the left endpoints of each Interval in the IntervalArray as - an Index. + Return the left endpoints of each Interval in the IntervalArray as an Index. """ from pandas import Index @@ -1264,8 +1263,7 @@ def left(self): @property def right(self): """ - Return the right endpoints of each Interval in the IntervalArray as - an Index. + Return the right endpoints of each Interval in the IntervalArray as an Index. """ from pandas import Index @@ -1274,8 +1272,7 @@ def right(self): @property def length(self) -> Index: """ - Return an Index with entries denoting the length of each Interval in - the IntervalArray. + Return an Index with entries denoting the length of each Interval. """ return self.right - self.left @@ -1367,16 +1364,18 @@ def overlaps(self, other): @property def inclusive(self) -> IntervalInclusiveType: """ - Whether the intervals are inclusive on the left-side, right-side, both or - neither. + String describing the inclusive side of the intervals. + + Either ``left``, ``right``, ``both`` or ``neither``. """ return self.dtype.inclusive @property def closed(self) -> IntervalInclusiveType: """ - Whether the intervals are closed on the left-side, right-side, both or - neither. + String describing the inclusive side of the intervals. + + Either ``left``, ``right``, ``both`` or ``neither``. """ warnings.warn( "Attribute `closed` is deprecated in favor of `inclusive`.", @@ -1387,8 +1386,7 @@ def closed(self) -> IntervalInclusiveType: _interval_shared_docs["set_closed"] = textwrap.dedent( """ - Return an %(klass)s identical to the current one, but closed on the - specified side. + Return an identical %(klass)s closed on the specified side. ..
deprecated:: 1.5.0 @@ -1440,8 +1438,7 @@ def set_closed( _interval_shared_docs["set_inclusive"] = textwrap.dedent( """ - Return an %(klass)s identical to the current one, but closed on the - specified side. + Return an identical %(klass)s but closed on the specified side. .. versionadded:: 1.5 @@ -1497,9 +1494,10 @@ def set_inclusive( _interval_shared_docs[ "is_non_overlapping_monotonic" ] = """ - Return True if the %(klass)s is non-overlapping (no Intervals share - points) and is either monotonic increasing or monotonic decreasing, - else False. + Return a boolean indicating whether the %(klass)s is non-overlapping and monotonic. + + Non-overlapping means that no Intervals share points, and monotonic means + either monotonic increasing or monotonic decreasing. """ # https://github.com/python/mypy/issues/1362 diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 5f227cb45a65ba..4011f298559496 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -771,8 +771,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: def to_pytimedelta(self) -> npt.NDArray[np.object_]: """ - Return Timedelta Array/Index as object ndarray of datetime.timedelta - objects. + Return an ndarray of datetime.timedelta objects. Returns ------- @@ -800,8 +799,10 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: @property def components(self) -> DataFrame: """ - Return a dataframe of the components (days, hours, minutes, - seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. + Return a DataFrame of the individual resolution components of the Timedeltas. + + The components (days, hours, minutes, seconds, milliseconds, microseconds, + nanoseconds) are returned as columns in a DataFrame. Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index 2fa3f57f950b5f..f7e6c4434da32e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1054,8 +1054,7 @@ def is_unique(self) -> bool: @property def is_monotonic(self) -> bool: """ - Return boolean if values in the object are - monotonic_increasing. + Return boolean if values in the object are monotonically increasing. Returns ------- @@ -1072,8 +1071,7 @@ def is_monotonic(self) -> bool: @property def is_monotonic_increasing(self) -> bool: """ - Return boolean if values in the object are - monotonic_increasing. + Return boolean if values in the object are monotonically increasing. Returns ------- @@ -1086,8 +1084,7 @@ def is_monotonic_increasing(self) -> bool: @property def is_monotonic_decreasing(self) -> bool: """ - Return boolean if values in the object are - monotonic_decreasing. + Return boolean if values in the object are monotonically decreasing. Returns ------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a541cbfe502fba..1c281f571d422b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4837,8 +4837,6 @@ def lookup( ) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. - Given equal-length arrays of row and column labels, return an - array of the values corresponding to each (row, col) pair. .. deprecated:: 1.2.0 DataFrame.lookup is deprecated, @@ -4846,6 +4844,9 @@ def lookup( For further details see :ref:`Looking up values by index/column labels <indexing.lookup>`. + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair.
+ Parameters ---------- row_labels : sequence diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a439faed08961..a033b7a3f83d73 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9970,13 +9970,14 @@ def shift( def slice_shift(self: NDFrameT, periods: int = 1, axis=0) -> NDFrameT: """ Equivalent to `shift` without copying data. - The shifted data will not include the dropped periods and the - shifted axis will be smaller than the original. .. deprecated:: 1.2.0 slice_shift is deprecated, use DataFrame/Series.shift instead. + The shifted data will not include the dropped periods and the + shifted axis will be smaller than the original. + Parameters ---------- periods : int diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 06422f8cc5cb0d..28e1b2b3880359 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -329,8 +329,7 @@ class providing the base-class of operations. """ _pipe_template = """ -Apply a function `func` with arguments to this %(klass)s object and return -the function's result. +Apply a ``func`` with arguments to this %(klass)s object and return its result. Use `.pipe` when you want to improve readability by chaining together functions that expect Series, DataFrames, GroupBy or Resampler objects. @@ -381,8 +380,9 @@ class providing the base-class of operations. """ _transform_template = """ -Call function producing a same-indexed %(klass)s on each group and -return a %(klass)s having the same indexes as the original object +Call function producing a same-indexed %(klass)s on each group. + +Returns a %(klass)s having the same indexes as the original object filled with the transformed values. Parameters diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index beb4db644dd791..c15cbf368c1598 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -238,8 +238,7 @@ def get_window_bounds( class FixedForwardWindowIndexer(BaseIndexer): """ - Creates window boundaries for fixed-length windows that include the - current row. + Creates window boundaries for fixed-length windows that include the current row. Examples -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 239e6656ea1514..26b833f78bec65 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -271,8 +271,9 @@ def _new_Index(cls, d): class Index(IndexOpsMixin, PandasObject): """ - Immutable sequence used for indexing and alignment. The basic object - storing axis labels for all pandas objects. + Immutable sequence used for indexing and alignment. + + The basic object storing axis labels for all pandas objects. Parameters ---------- @@ -2290,8 +2291,7 @@ def is_monotonic(self) -> bool: @property def is_monotonic_increasing(self) -> bool: """ - Return if the index is monotonic increasing (only equal or - increasing) values. + Return a boolean if the values are equal or increasing. Examples -------- @@ -2307,8 +2307,7 @@ def is_monotonic_increasing(self) -> bool: @property def is_monotonic_decreasing(self) -> bool: """ - Return if the index is monotonic decreasing (only equal or - decreasing) values. + Return a boolean if the values are equal or decreasing. Examples -------- @@ -3810,8 +3809,9 @@ def get_loc(self, key, method=None, tolerance=None): _index_shared_docs[ "get_indexer" ] = """ - Compute indexer and mask for new index given the current index. 
The - indexer should be then used as an input to ndarray.take to align the + Compute indexer and mask for new index given the current index. + + The indexer should then be used as an input to ndarray.take to align the current data to the new index. Parameters ---------- @@ -4580,8 +4580,7 @@ def join( sort: bool = False, ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: """ - Compute join_index and indexers to conform data - structures to the new index. + Compute join_index and indexers to conform data structures to the new index. Parameters ---------- @@ -5978,8 +5977,9 @@ def set_value(self, arr, key, value) -> None: _index_shared_docs[ "get_indexer_non_unique" ] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the + Compute indexer and mask for new index given the current index. + + The indexer should then be used as an input to ndarray.take to align the current data to the new index. Parameters diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3a7adb19f1c01f..30c770f32c2dcc 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -496,8 +496,9 @@ def _get_time_micros(self) -> npt.NDArray[np.int64]: def to_series(self, keep_tz=lib.no_default, index=None, name=None): """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index. + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. Parameters ---------- @@ -826,8 +827,7 @@ def inferred_type(self) -> str: def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: """ - Return index locations of values at particular time of day - (e.g. 9:30AM). + Return index locations of values at particular time of day. Parameters ---------- @@ -867,8 +867,7 @@ def indexer_between_time( self, start_time, end_time, include_start: bool = True, include_end: bool = True ) -> npt.NDArray[np.intp]: """ - Return index locations of values between particular times of day - (e.g., 9:00-9:30AM). + Return index locations of values between particular times of day. Parameters ---------- @@ -1134,8 +1133,7 @@ def bdate_range( **kwargs, ) -> DatetimeIndex: """ - Return a fixed frequency DatetimeIndex, with business day as the default - frequency. + Return a fixed frequency DatetimeIndex with business day as the default. Parameters ---------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b4b576df9918ea..60f727f54b6215 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1572,8 +1572,7 @@ def _get_level_number(self, level) -> int: @cache_readonly def is_monotonic_increasing(self) -> bool: """ - return if the index is monotonic increasing (only equal or - increasing) values. + Return a boolean if the values are equal or increasing. """ if any(-1 in code for code in self.codes): return False @@ -1605,8 +1604,7 @@ def is_monotonic_increasing(self) -> bool: @cache_readonly def is_monotonic_decreasing(self) -> bool: """ - return if the index is monotonic decreasing (only equal or - decreasing) values. + Return a boolean if the values are equal or decreasing.
""" # monotonic decreasing if and only if reverse is monotonic increasing return self[::-1].is_monotonic_increasing diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5731d476cef109..a597bea0eb7241 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -42,9 +42,10 @@ class NumericIndex(Index): """ - Immutable sequence used for indexing and alignment. The basic object - storing axis labels for all pandas objects. NumericIndex is a special case - of `Index` with purely numpy int/uint/float labels. + Immutable numeric sequence used for indexing and alignment. + + The basic object storing axis labels for all pandas objects. + NumericIndex is a special case of `Index` with purely numpy int/uint/float labels. .. versionadded:: 1.4.0 @@ -309,14 +310,15 @@ def _format_native_types( _num_index_shared_docs[ "class_descr" ] = """ - Immutable sequence used for indexing and alignment. The basic object - storing axis labels for all pandas objects. %(klass)s is a special case - of `Index` with purely %(ltype)s labels. %(extra)s. + Immutable sequence used for indexing and alignment. .. deprecated:: 1.4.0 In pandas v2.0 %(klass)s will be removed and :class:`NumericIndex` used instead. %(klass)s will remain fully functional for the duration of pandas 1.x. + The basic object storing axis labels for all pandas objects. + %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s. + Parameters ---------- data : array-like (1-dimensional) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 095c5d1b1ba03a..12a8f2c0d5a9d6 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -47,8 +47,9 @@ ) class TimedeltaIndex(DatetimeTimedeltaMixin): """ - Immutable ndarray of timedelta64 data, represented internally as int64, and - which can be boxed to timedelta objects. + Immutable Index of timedelta64 data. + + Represented internally as int64, and scalars returned Timedelta objects. Parameters ---------- @@ -209,8 +210,7 @@ def timedelta_range( closed=None, ) -> TimedeltaIndex: """ - Return a fixed frequency TimedeltaIndex, with day as the default - frequency. + Return a fixed frequency TimedeltaIndex with day as the default. Parameters ---------- diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 917382544199a5..87973241667455 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -362,8 +362,9 @@ def aggregate(self, func=None, *args, **kwargs): def transform(self, arg, *args, **kwargs): """ - Call function producing a like-indexed Series on each group and return - a Series with the transformed values. + Call function producing a like-indexed Series on each group. + + Return a Series with the transformed values. Parameters ---------- diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 523cd56db3e0a7..5328c7995ea6f5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -157,8 +157,9 @@ def concat( copy: bool = True, ) -> DataFrame | Series: """ - Concatenate pandas objects along a particular axis with optional set logic - along the other axes. + Concatenate pandas objects along a particular axis. + + Allows optional set logic along the other axes. 
Can also add a layer of hierarchical indexing on the concatenation axis, which may be useful if the labels are the same (or overlapping) on diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 03aad0ef64dec0..5226c928c6f73b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -537,9 +537,10 @@ def crosstab( normalize=False, ) -> DataFrame: """ - Compute a simple cross tabulation of two (or more) factors. By default - computes a frequency table of the factors unless an array of values and an - aggregation function are passed. + Compute a simple cross tabulation of two (or more) factors. + + By default, computes a frequency table of the factors unless an + array of values and an aggregation function are passed. Parameters ---------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 67cdb5d8d72ab3..765bf9f7e04f16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2642,6 +2642,7 @@ def quantile( def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute correlation with `other` Series, excluding missing values. + The two `Series` objects are not required to be the same length and will be aligned internally before the correlation function is applied. @@ -2716,6 +2717,7 @@ def cov( ) -> float: """ Compute covariance with Series, excluding missing values. + The two `Series` objects are not required to be the same length and will be aligned internally before the covariance is calculated. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 73d5c04ecd6520..d50daad9a22b15 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2015,8 +2015,9 @@ def rstrip(self, to_strip=None): _shared_docs[ "str_removefix" ] = r""" - Remove a %(side)s from an object series. If the %(side)s is not present, - the original string will be returned. + Remove a %(side)s from an object series. + + If the %(side)s is not present, the original string will be returned. Parameters ---------- diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index e3f7e9d454383a..d7f5e7aab58ab9 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -15,21 +15,17 @@ class IntCastingNaNError(ValueError): """ - Raised when attempting an astype operation on an array with NaN to an integer - dtype. + Exception raised when converting (``astype``) an array with NaN to an integer type. """ - pass - class NullFrequencyError(ValueError): """ - Error raised when a null `freq` attribute is used in an operation - that needs a non-null frequency, particularly `DatetimeIndex.shift`, - `TimedeltaIndex.shift`, `PeriodIndex.shift`. - """ + Exception raised when a ``freq`` cannot be null. - pass + Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, + ``PeriodIndex.shift``. + """ class PerformanceWarning(Warning): @@ -40,16 +36,17 @@ class UnsupportedFunctionCall(ValueError): """ - Exception raised when attempting to call a numpy function - on a pandas object, but that function is not supported by - the object e.g. ``np.cumsum(groupby_object)``. + Exception raised when attempting to call an unsupported numpy function. + + For example, ``np.cumsum(groupby_object)``. """ class UnsortedIndexError(KeyError): """ - Error raised when attempting to get a slice of a MultiIndex, - and the index has not been lexsorted. Subclass of `KeyError`. + Error raised when slicing a MultiIndex which has not been lexsorted.
+ + Subclass of `KeyError`. """ @@ -124,8 +121,7 @@ class DtypeWarning(Warning): class EmptyDataError(ValueError): """ - Exception that is thrown in `pd.read_csv` (by both the C and - Python engines) when empty data or header is encountered. + Exception raised in ``pd.read_csv`` when empty data or header is encountered. """ @@ -172,8 +168,9 @@ class ParserWarning(Warning): class MergeError(ValueError): """ - Error raised when problems arise during merging due to problems - with input data. Subclass of `ValueError`. + Exception raised when merging data. + + Subclass of ``ValueError``. """ @@ -185,8 +182,7 @@ class AccessorRegistrationWarning(Warning): class AbstractMethodError(NotImplementedError): """ - Raise this error instead of NotImplementedError for abstract methods - while keeping compatibility with Python 2 and Python 3. + Raise this error instead of NotImplementedError for abstract methods. """ def __init__(self, class_instance, methodtype: str = "method") -> None: @@ -243,17 +239,23 @@ class InvalidIndexError(Exception): class DataError(Exception): """ - Exception raised when trying to perform a ohlc on a non-numnerical column. - Or, it can be raised when trying to apply a function to a non-numerical - column on a rolling window. + Exception raised when performing an operation on non-numerical data. + + For example, calling ``ohlc`` on a non-numerical column or a function + on a rolling window. """ class SpecificationError(Exception): """ - Exception raised in two scenarios. The first way is calling agg on a + Exception raised by ``agg`` when the functions are ill-specified. + + The exception is raised in two scenarios. + + The first way is calling ``agg`` on a Dataframe or Series using a nested renamer (dict-of-dict). - The second way is calling agg on a Dataframe with duplicated functions + + The second way is calling ``agg`` on a Dataframe with duplicated functions names without assigning column name. Examples -------- @@ -274,9 +276,10 @@ class SpecificationError(Exception): class SettingWithCopyError(ValueError): """ - Exception is raised when trying to set on a copied slice from a dataframe and - the mode.chained_assignment is set to 'raise.' This can happen unintentionally - when chained indexing. + Exception raised when trying to set on a copied slice from a ``DataFrame``. + + The ``mode.chained_assignment`` needs to be set to 'raise'. This can + happen unintentionally when using chained indexing. For more information on evaluation order, see :ref:`the user guide`. @@ -295,9 +298,11 @@ class SettingWithCopyError(ValueError): class SettingWithCopyWarning(Warning): """ - Warning is raised when trying to set on a copied slice from a dataframe and - the mode.chained_assignment is set to 'warn.' 'Warn' is the default option. - This can happen unintentionally when chained indexing. + Warning raised when trying to set on a copied slice from a ``DataFrame``. + + The ``mode.chained_assignment`` needs to be set to 'warn'. + 'Warn' is the default option. This can happen unintentionally when + using chained indexing. For more information on evaluation order, see :ref:`the user guide`. @@ -315,10 +320,11 @@ class SettingWithCopyWarning(Warning): class NumExprClobberingError(NameError): """ - Exception is raised when trying to use a built-in numexpr name as a variable name - in a method like query or eval. Eval will throw the error if the engine is set - to 'numexpr'. 'numexpr' is the default engine value for eval if the numexpr package - is installed.
+ Exception raised when trying to use a built-in numexpr name as a variable name. + + ``eval`` or ``query`` will throw the error if the engine is set + to 'numexpr'. 'numexpr' is the default engine value for these methods if the + numexpr package is installed. Examples -------- @@ -333,9 +339,9 @@ class NumExprClobberingError(NameError): class UndefinedVariableError(NameError): """ - Exception is raised when trying to use an undefined variable name in a method - like query or eval. It will also specific whether the undefined variable is - local or not. + Exception raised by ``query`` or ``eval`` when using an undefined variable name. + + It will also specify whether the undefined variable is local or not. Examples -------- @@ -380,15 +386,18 @@ class IndexingError(Exception): class PyperclipException(RuntimeError): """ - Exception is raised when trying to use methods like to_clipboard() and - read_clipboard() on an unsupported OS/platform. + Exception raised when clipboard functionality is unsupported. + + Raised by ``to_clipboard()`` and ``read_clipboard()``. """ class PyperclipWindowsException(PyperclipException): """ - Exception is raised when pandas is unable to get access to the clipboard handle - due to some other window process is accessing it. + Exception raised when clipboard functionality is unsupported by Windows. + + Access to the clipboard handle is denied because some other + window process is accessing it. """ def __init__(self, message: str) -> None: @@ -400,6 +409,7 @@ def __init__(self, message: str) -> None: class CSSWarning(UserWarning): """ Warning is raised when converting css styling fails. + This can be due to the styling not having an equivalent value or because the styling isn't properly formatted. @@ -417,8 +427,7 @@ class CSSWarning(UserWarning): class PossibleDataLossError(Exception): """ - Exception is raised when trying to open a HDFStore file when the file is already - opened. + Exception raised when trying to open an HDFStore file that is already open. Examples -------- @@ -443,14 +452,15 @@ class ClosedFileError(Exception): class IncompatibilityWarning(Warning): """ - Warning is raised when trying to use where criteria on an incompatible - HDF5 file. + Warning raised when trying to use where criteria on an incompatible HDF5 file. """ class AttributeConflictWarning(Warning): """ - Warning is raised when attempting to append an index with a different + Warning raised when index attributes conflict when using HDFStore. + + Occurs when attempting to append an index with a different name than the existing index on an HDFStore or attempting to append an index with a different frequency than the existing index on an HDFStore. """ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 01b4812d3dc2a1..9e3f54169d1785 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1479,8 +1479,7 @@ def to_string( def set_td_classes(self, classes: DataFrame) -> Styler: """ - Set the DataFrame of strings added to the ``class`` attribute of ``<td>`` - HTML elements. + Set the ``class`` attribute of ``<td>`` HTML elements. Parameters ---------- @@ -3173,8 +3172,7 @@ def text_gradient( @Substitution(subset=subset) def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: """ - Set defined CSS-properties to each ``<td>`` HTML element within the given - subset. + Set defined CSS-properties to each ``<td>`` HTML element for the given subset.
Parameters ---------- diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index dba161cf6d45c9..414bd3b76bd0d7 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -62,24 +62,18 @@ class CSSDict(TypedDict): Subset = Union[slice, Sequence, Index] -def _gl01_adjust(obj: Any) -> Any: - """Adjust docstrings for Numpydoc GLO1.""" - obj.__doc__ = "\n" + obj.__doc__ - return obj - - class StylerRenderer: """ Base class to process rendering a Styler with a specified jinja2 template. """ - loader = _gl01_adjust(jinja2.PackageLoader("pandas", "io/formats/templates")) - env = _gl01_adjust(jinja2.Environment(loader=loader, trim_blocks=True)) - template_html = _gl01_adjust(env.get_template("html.tpl")) - template_html_table = _gl01_adjust(env.get_template("html_table.tpl")) - template_html_style = _gl01_adjust(env.get_template("html_style.tpl")) - template_latex = _gl01_adjust(env.get_template("latex.tpl")) - template_string = _gl01_adjust(env.get_template("string.tpl")) + loader = jinja2.PackageLoader("pandas", "io/formats/templates") + env = jinja2.Environment(loader=loader, trim_blocks=True) + template_html = env.get_template("html.tpl") + template_html_table = env.get_template("html_table.tpl") + template_html_style = env.get_template("html_style.tpl") + template_latex = env.get_template("latex.tpl") + template_string = env.get_template("string.tpl") def __init__( self, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5c773a424a1c9a..a4049eff8ae71d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1214,8 +1214,9 @@ def append( errors: str = "strict", ) -> None: """ - Append to Table in file. Node must already exist and be Table - format. + Append to Table in file. + + Node must already exist and be Table format. Parameters ---------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 80b6db2500d281..3daa6d837349e5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1968,8 +1968,7 @@ def data_label(self) -> str: def variable_labels(self) -> dict[str, str]: """ - Return variable labels as a dict, associating each variable name - with corresponding label. + Return a dict associating each variable name with its corresponding label. Returns ------- @@ -1979,8 +1978,7 @@ def variable_labels(self) -> dict[str, str]: def value_labels(self) -> dict[str, dict[float, str]]: """ - Return a dict, associating each variable name a dict, associating - each value its corresponding label. + Return a nested dict associating each variable name with its value labels. Returns ------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 17763b25329ab8..5bd2e8a53e8e8d 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -270,8 +270,7 @@ def andrews_curves( **kwargs, ) -> Axes: """ - Generate a matplotlib plot of Andrews curves, for visualising clusters of - multivariate data. + Generate a matplotlib plot for visualising clusters of multivariate data.
Andrews curves have the functional form: diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 873103b01f64d8..1a07c02f4024a9 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -320,6 +320,7 @@ def test_overlap_public_nat_methods(klass, expected): _get_overlap_public_nat_methods(Timestamp, True) + _get_overlap_public_nat_methods(Timedelta, True) ), + ids=lambda x: f"{x[0].__name__}.{x[1]}", ) def test_nat_doc_strings(compare): # see gh-17327 diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 3a0c437c918fb8..cbf02bc0a01561 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -40,7 +40,16 @@ # With template backend, matplotlib plots nothing matplotlib.use("template") - +# Styler methods are Jinja2 objects whose docstrings we don't own. +IGNORE_VALIDATION = { + "Styler.env", + "Styler.template_html", + "Styler.template_html_style", + "Styler.template_html_table", + "Styler.template_latex", + "Styler.template_string", + "Styler.loader", +} PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] ERROR_MSGS = { "GL04": "Private classes ({mentioned_private_classes}) should not be " @@ -121,6 +130,8 @@ def get_api_items(api_doc_fd): position = None continue item = line.strip() + if item in IGNORE_VALIDATION: + continue func = importlib.import_module(current_module) for part in item.split("."): func = getattr(func, part) @@ -230,7 +241,8 @@ def pandas_validate(func_name: str): Information about the docstring and the errors found. """ func_obj = Validator._load_obj(func_name) - doc_obj = get_doc_object(func_obj) + # Some objects are instances, e.g. IndexSlice, which numpydoc can't validate + doc_obj = get_doc_object(func_obj, doc=func_obj.__doc__) doc = PandasDocstring(func_name, doc_obj) result = validate(doc_obj) From 76de473c2523d5cb26449fbc2eb6c56ae49237c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <emailformattr@gmail.com> Date: Fri, 29 Jul 2022 17:45:45 -0700 Subject: [PATCH 22/23] REF: Rename exchange -> interchange (#47888) --- doc/source/reference/general_functions.rst | 2 +- pandas/api/__init__.py | 4 ++-- pandas/api/exchange/__init__.py | 8 -------- pandas/api/interchange/__init__.py | 8 ++++++++ pandas/core/frame.py | 10 +++++----- pandas/core/{exchange => interchange}/__init__.py | 0 pandas/core/{exchange => interchange}/buffer.py | 2 +- pandas/core/{exchange => interchange}/column.py | 8 ++++---- pandas/core/{exchange => interchange}/dataframe.py | 4 ++-- .../{exchange => interchange}/dataframe_protocol.py | 2 +- .../core/{exchange => interchange}/from_dataframe.py | 12 ++++++------ pandas/core/{exchange => interchange}/utils.py | 2 +- pandas/tests/api/test_api.py | 2 +- pandas/tests/{exchange => interchange}/__init__.py | 0 pandas/tests/{exchange => interchange}/conftest.py | 0 pandas/tests/{exchange => interchange}/test_impl.py | 4 ++-- .../test_spec_conformance.py | 0 pandas/tests/{exchange => interchange}/test_utils.py | 2 +- 18 files changed, 35 insertions(+), 35 deletions(-) delete mode 100644 pandas/api/exchange/__init__.py create mode 100644 pandas/api/interchange/__init__.py rename pandas/core/{exchange => interchange}/__init__.py (100%) rename pandas/core/{exchange => interchange}/buffer.py (97%) rename pandas/core/{exchange => interchange}/column.py (98%) rename pandas/core/{exchange => interchange}/dataframe.py (96%) rename pandas/core/{exchange => interchange}/dataframe_protocol.py (99%) rename pandas/core/{exchange => interchange}/from_dataframe.py (97%) rename
pandas/core/{exchange => interchange}/utils.py (96%) rename pandas/tests/{exchange => interchange}/__init__.py (100%) rename pandas/tests/{exchange => interchange}/conftest.py (100%) rename pandas/tests/{exchange => interchange}/test_impl.py (97%) rename pandas/tests/{exchange => interchange}/test_spec_conformance.py (100%) rename pandas/tests/{exchange => interchange}/test_utils.py (95%) diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst index f82d9c9a6482c0..474e37a85d8575 100644 --- a/doc/source/reference/general_functions.rst +++ b/doc/source/reference/general_functions.rst @@ -85,4 +85,4 @@ Importing from other DataFrame libraries .. autosummary:: :toctree: api/ - api.exchange.from_dataframe + api.interchange.from_dataframe diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 22a09ed61d6941..9d4f721225d93a 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,13 +1,13 @@ """ public toolkit API """ from pandas.api import ( - exchange, extensions, indexers, + interchange, types, ) __all__ = [ - "exchange", + "interchange", "extensions", "indexers", "types", diff --git a/pandas/api/exchange/__init__.py b/pandas/api/exchange/__init__.py deleted file mode 100644 index 6760d81f60ac76..00000000000000 --- a/pandas/api/exchange/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Public API for DataFrame exchange protocol. -""" - -from pandas.core.exchange.dataframe_protocol import DataFrame -from pandas.core.exchange.from_dataframe import from_dataframe - -__all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py new file mode 100644 index 00000000000000..2f3a73bc46b310 --- /dev/null +++ b/pandas/api/interchange/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for DataFrame interchange protocol. +""" + +from pandas.core.interchange.dataframe_protocol import DataFrame +from pandas.core.interchange.from_dataframe import from_dataframe + +__all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1c281f571d422b..338a8ff16e54e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -220,8 +220,8 @@ if TYPE_CHECKING: - from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg from pandas.core.internals import SingleDataManager from pandas.core.resample import Resampler @@ -819,7 +819,7 @@ def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ) -> DataFrameXchg: """ - Return the dataframe exchange object implementing the exchange protocol. + Return the dataframe interchange object implementing the interchange protocol. Parameters ---------- @@ -832,19 +832,19 @@ def __dataframe__( Returns ------- - DataFrame exchange object + DataFrame interchange object The object which consuming library can use to ingress the dataframe. Notes ----- - Details on the exchange protocol: + Details on the interchange protocol: https://data-apis.org/dataframe-protocol/latest/index.html `nan_as_null` currently has no effect; once support for nullable extension dtypes is added, this value should be propagated to columns. 
""" - from pandas.core.exchange.dataframe import PandasDataFrameXchg + from pandas.core.interchange.dataframe import PandasDataFrameXchg return PandasDataFrameXchg(self, nan_as_null, allow_copy) diff --git a/pandas/core/exchange/__init__.py b/pandas/core/interchange/__init__.py similarity index 100% rename from pandas/core/exchange/__init__.py rename to pandas/core/interchange/__init__.py diff --git a/pandas/core/exchange/buffer.py b/pandas/core/interchange/buffer.py similarity index 97% rename from pandas/core/exchange/buffer.py rename to pandas/core/interchange/buffer.py index a3b05a0c5d24aa..1d24efc263ca0c 100644 --- a/pandas/core/exchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -3,7 +3,7 @@ import numpy as np from packaging import version -from pandas.core.exchange.dataframe_protocol import ( +from pandas.core.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) diff --git a/pandas/core/exchange/column.py b/pandas/core/interchange/column.py similarity index 98% rename from pandas/core/exchange/column.py rename to pandas/core/interchange/column.py index c2a1cfe766b22b..9ef73aa1f40e04 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/interchange/column.py @@ -12,14 +12,14 @@ is_categorical_dtype, is_string_dtype, ) -from pandas.core.exchange.buffer import PandasBuffer -from pandas.core.exchange.dataframe_protocol import ( +from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, ColumnNullType, DtypeKind, ) -from pandas.core.exchange.utils import ( +from pandas.core.interchange.utils import ( ArrowCTypes, Endianness, NoBufferPresent, @@ -136,7 +136,7 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: kind = _NP_KINDS.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. 
Check if it's a categorical maybe - raise ValueError(f"Data type {dtype} not supported by exchange protocol") + raise ValueError(f"Data type {dtype} not supported by interchange protocol") return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder diff --git a/pandas/core/exchange/dataframe.py b/pandas/core/interchange/dataframe.py similarity index 96% rename from pandas/core/exchange/dataframe.py rename to pandas/core/interchange/dataframe.py index e5bb3811afed0c..ddcffbff646700 100644 --- a/pandas/core/exchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -4,8 +4,8 @@ from typing import TYPE_CHECKING import pandas as pd -from pandas.core.exchange.column import PandasColumn -from pandas.core.exchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.column import PandasColumn +from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg if TYPE_CHECKING: from pandas import Index diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/interchange/dataframe_protocol.py similarity index 99% rename from pandas/core/exchange/dataframe_protocol.py rename to pandas/core/interchange/dataframe_protocol.py index 367b9063327412..036f84a393903e 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/interchange/dataframe_protocol.py @@ -389,7 +389,7 @@ class DataFrame(ABC): @abstractmethod def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): - """Construct a new exchange object, potentially changing the parameters.""" + """Construct a new interchange object, potentially changing the parameters.""" pass @property diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py similarity index 97% rename from pandas/core/exchange/from_dataframe.py rename to pandas/core/interchange/from_dataframe.py index a33e47ba3b68ec..ae9b39de54d416 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -7,14 +7,14 @@ import numpy as np import pandas as pd -from pandas.core.exchange.dataframe_protocol import ( +from pandas.core.interchange.dataframe_protocol import ( Buffer, Column, ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, ) -from pandas.core.exchange.utils import ( +from pandas.core.interchange.utils import ( ArrowCTypes, Endianness, ) @@ -34,7 +34,7 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: Parameters ---------- df : DataFrameXchg - Object supporting the exchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). @@ -54,12 +54,12 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: def _from_dataframe(df: DataFrameXchg, allow_copy=True): """ - Build a ``pd.DataFrame`` from the DataFrame exchange object. + Build a ``pd.DataFrame`` from the DataFrame interchange object. Parameters ---------- df : DataFrameXchg - Object supporting the exchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). 
@@ -91,7 +91,7 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: """ - Convert exchange protocol chunk to ``pd.DataFrame``. + Convert interchange protocol chunk to ``pd.DataFrame``. Parameters ---------- diff --git a/pandas/core/exchange/utils.py b/pandas/core/interchange/utils.py similarity index 96% rename from pandas/core/exchange/utils.py rename to pandas/core/interchange/utils.py index 2cc5126591718d..1d56af94b2629a 100644 --- a/pandas/core/exchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -1,5 +1,5 @@ """ -Utility functions and objects for implementing the exchange API. +Utility functions and objects for implementing the interchange API. """ from __future__ import annotations diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 6350f402ac0e5f..c62a86e1983f5c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -277,7 +277,7 @@ def test_np(): class TestApi(Base): - allowed = ["types", "extensions", "indexers", "exchange"] + allowed = ["types", "extensions", "indexers", "interchange"] def test_api(self): self.check(api, self.allowed) diff --git a/pandas/tests/exchange/__init__.py b/pandas/tests/interchange/__init__.py similarity index 100% rename from pandas/tests/exchange/__init__.py rename to pandas/tests/interchange/__init__.py diff --git a/pandas/tests/exchange/conftest.py b/pandas/tests/interchange/conftest.py similarity index 100% rename from pandas/tests/exchange/conftest.py rename to pandas/tests/interchange/conftest.py diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/interchange/test_impl.py similarity index 97% rename from pandas/tests/exchange/test_impl.py rename to pandas/tests/interchange/test_impl.py index e0e9fdce645d0e..5168e1acc8e7e2 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,11 +6,11 @@ import pandas as pd import pandas._testing as tm -from pandas.core.exchange.dataframe_protocol import ( +from pandas.core.interchange.dataframe_protocol import ( ColumnNullType, DtypeKind, ) -from pandas.core.exchange.from_dataframe import from_dataframe +from pandas.core.interchange.from_dataframe import from_dataframe test_data_categorical = { "ordered": pd.Categorical(list("testdata") * 30, ordered=True), diff --git a/pandas/tests/exchange/test_spec_conformance.py b/pandas/tests/interchange/test_spec_conformance.py similarity index 100% rename from pandas/tests/exchange/test_spec_conformance.py rename to pandas/tests/interchange/test_spec_conformance.py diff --git a/pandas/tests/exchange/test_utils.py b/pandas/tests/interchange/test_utils.py similarity index 95% rename from pandas/tests/exchange/test_utils.py rename to pandas/tests/interchange/test_utils.py index 4c80ecf0d23a0a..4fd42abb7f3f1b 100644 --- a/pandas/tests/exchange/test_utils.py +++ b/pandas/tests/interchange/test_utils.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas.core.exchange.utils import dtype_to_arrow_c_fmt +from pandas.core.interchange.utils import dtype_to_arrow_c_fmt # TODO: use ArrowSchema to get reference C-string. 
# At the time, there is no way to access ArrowSchema holding a type format string From 6e1a0401420bffd282f3cf8b57e653a39e373d3b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jul 2022 17:46:00 -0700 Subject: [PATCH 23/23] CI: Add CodeQL Github Action (#47890) --- .github/workflows/codeql.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000000..457aa69fb924f6 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,31 @@ +name: CodeQL +on: + schedule: + # every day at midnight + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + analyze: + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: + - python + + steps: + - uses: actions/checkout@v3 + - uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + - uses: github/codeql-action/autobuild@v2 + - uses: github/codeql-action/analyze@v2
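Since PATCH 22 is a pure rename (every module is moved with 96-100% similarity and only import paths change), the public entry points behave exactly as before. A minimal usage sketch of the renamed interchange API (the frame contents here are illustrative, and it assumes a pandas build that contains this patch series, where ``pandas.api.interchange`` replaces ``pandas.api.exchange``):

import pandas as pd
from pandas.api.interchange import from_dataframe

# Every pandas DataFrame exposes the interchange protocol entry point.
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Produce the interchange object (a PandasDataFrameXchg after this patch).
xchg = df.__dataframe__()

# A consuming library hands any object implementing __dataframe__ to
# from_dataframe() to build a pandas DataFrame out of it.
roundtripped = from_dataframe(xchg)
print(roundtripped)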