
Merge branch 'master' into deprecate_Series.apply_using_func_returning_Series
topper-123 committed Mar 29, 2023
2 parents 8c70ad9 + 8c7b8a4 commit 53d4a16
Showing 136 changed files with 17,478 additions and 946 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/dependabot.yml
@@ -0,0 +1,9 @@
version: 2
updates:
  - package-ecosystem: github-actions
    directory: /
    schedule:
      interval: weekly
    labels:
      - "CI"
      - "Dependencies"
14 changes: 3 additions & 11 deletions .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
types_or: [python, pyi]
additional_dependencies: [black==23.1.0]
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.255
rev: v0.0.259
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -392,14 +392,6 @@ repos:
files: ^pandas/
exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py)
types: [python]
- id: flake8-pyi
name: flake8-pyi
entry: flake8 --extend-ignore=E301,E302,E305,E701,E704
types: [pyi]
language: python
additional_dependencies:
- flake8==5.0.4
- flake8-pyi==22.8.1
- id: future-annotations
name: import annotations from __future__
entry: 'from __future__ import annotations'
@@ -421,8 +413,8 @@ repos:
language: python
stages: [manual]
additional_dependencies:
- autotyping==22.9.0
- libcst==0.4.7
- autotyping==23.3.0
- libcst==0.4.9
- id: check-test-naming
name: check that test names start with 'test'
entry: python -m scripts.check_test_naming
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/arithmetic.py
@@ -266,10 +266,14 @@ def setup(self, tz):
        self.ts = self.s[halfway]

        self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
        self.ts_different_reso = Timestamp("2001-01-02", tz=tz)

    def time_series_timestamp_compare(self, tz):
        self.s <= self.ts

    def time_series_timestamp_different_reso_compare(self, tz):
        self.s <= self.ts_different_reso

    def time_timestamp_series_compare(self, tz):
        self.ts >= self.s

7 changes: 0 additions & 7 deletions asv_bench/benchmarks/strings.py
@@ -34,7 +34,6 @@ def setup(self, dtype):

        # GH37371. Testing construction of string series/frames from ExtensionArrays
        self.series_cat_arr = Categorical(self.series_arr)
        self.frame_cat_arr = Categorical(self.frame_arr)

    def time_series_construction(self, dtype):
        Series(self.series_arr, dtype=dtype)
@@ -54,12 +53,6 @@ def time_cat_series_construction(self, dtype):
    def peakmem_cat_series_construction(self, dtype):
        Series(self.series_cat_arr, dtype=dtype)

    def time_cat_frame_construction(self, dtype):
        DataFrame(self.frame_cat_arr, dtype=dtype)

    def peakmem_cat_frame_construction(self, dtype):
        DataFrame(self.frame_cat_arr, dtype=dtype)


class Methods(Dtypes):
    def time_center(self, dtype):
3 changes: 0 additions & 3 deletions ci/code_checks.sh
@@ -86,8 +86,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
MSG='Partially validate docstrings (EX01)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
pandas.Series.index \
pandas.Series.hasnans \
pandas.Series.to_list \
pandas.Series.__iter__ \
pandas.Series.keys \
pandas.Series.item \
@@ -309,7 +307,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas_object \
pandas.api.interchange.from_dataframe \
pandas.Index.values \
pandas.Index.hasnans \
pandas.Index.dtype \
pandas.Index.inferred_type \
pandas.Index.shape \
Binary file modified doc/source/_static/reshaping_pivot.png
2 changes: 1 addition & 1 deletion doc/source/getting_started/index.rst
@@ -533,7 +533,7 @@ Data sets do not only contain numerical data. pandas provides a wide range of fu
Coming from...
--------------

Are you familiar with other software for manipulating tablular data? Learn
Are you familiar with other software for manipulating tabular data? Learn
the pandas-equivalent operations compared to software you already know:

.. panels::
2 changes: 1 addition & 1 deletion doc/source/getting_started/tutorials.rst
@@ -113,7 +113,7 @@ Various tutorials
* `Wes McKinney's (pandas BDFL) blog <https://wesmckinney.com/archives.html>`_
* `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson <http://www.randalolson.com/2012/08/06/statistical-analysis-made-easy-in-python/>`_
* `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 <https://conference.scipy.org/scipy2013/tutorial_detail.php?id=109>`_
* `Financial analysis in Python, by Thomas Wiecki <https://nbviewer.ipython.org/github/twiecki/financial-analysis-python-tutorial/blob/master/1.%20Pandas%20Basics.ipynb>`_
* `Financial analysis in Python, by Thomas Wiecki <https://nbviewer.org/github/twiecki/financial-analysis-python-tutorial/blob/master/1.%20Pandas%20Basics.ipynb>`_
* `Intro to pandas data structures, by Greg Reda <http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/>`_
* `Pandas and Python: Top 10, by Manish Amde <https://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/>`_
* `Pandas DataFrames Tutorial, by Karlijn Willems <https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python>`_
7 changes: 4 additions & 3 deletions doc/source/reference/arrays.rst
@@ -93,9 +93,10 @@ PyArrow type pandas extension type NumPy

.. note::

For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated
by :class:`arrays.ArrowStringArray` and ``StringDtype("pyarrow")``. See the :ref:`string section <api.arrays.string>`
below.
Pyarrow-backed string support is provided by both ``pd.StringDtype("pyarrow")`` and ``pd.ArrowDtype(pa.string())``.
``pd.StringDtype("pyarrow")`` is described below in the :ref:`string section <api.arrays.string>`
and will be returned if the string alias ``"string[pyarrow]"`` is specified. ``pd.ArrowDtype(pa.string())``
generally has better interoperability with :class:`ArrowDtype` of different types.

While individual values in an :class:`arrays.ArrowExtensionArray` are stored as a PyArrow objects, scalars are **returned**
as Python scalars corresponding to the data type, e.g. a PyArrow int64 will be returned as Python int, or :class:`NA` for missing
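The distinction drawn in the note above can be sketched quickly (illustrative only, not part of the diff; assumes pandas 2.0+ with pyarrow installed):

    import pandas as pd
    import pyarrow as pa

    # The "string[pyarrow]" alias resolves to pd.StringDtype("pyarrow") ...
    s1 = pd.Series(["a", "b"], dtype="string[pyarrow]")
    isinstance(s1.dtype, pd.StringDtype)   # True

    # ... which is a distinct dtype from pd.ArrowDtype(pa.string())
    s2 = pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
    s1.dtype == s2.dtype                   # False
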
2 changes: 1 addition & 1 deletion doc/source/user_guide/advanced.rst
@@ -322,7 +322,7 @@ As usual, **both sides** of the slicers are included as this is label indexing.
.. warning::

You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and
for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted
for the **columns**. There are some ambiguous cases where the passed indexer could be misinterpreted
  as indexing *both* axes, rather than into say the ``MultiIndex`` for the rows.

You should do this:
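The hunk breaks off at "You should do this:"; the following is a separate, hedged sketch of the pattern the warning describes (the frame and labels are invented for illustration):

    import pandas as pd

    midx = pd.MultiIndex.from_product([["A", "B"], [1, 2]], names=["key", "num"])
    df = pd.DataFrame({"x": range(4), "y": range(4)}, index=midx)

    # Give an indexer for both axes: a slicer tuple for the rows, a label for the columns.
    df.loc[(slice("A", "B"), 1), "x"]
    # The bare form below is the ambiguous case the warning is about:
    # df.loc[(slice("A", "B"), 1)]
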
2 changes: 1 addition & 1 deletion doc/source/user_guide/groupby.rst
@@ -149,7 +149,7 @@ the columns except the one we specify:
grouped.sum()
The above GroupBy will split the DataFrame on its index (rows). To split by columns, first do
a tranpose:
a transpose:

.. ipython::

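The transpose-based approach the corrected sentence refers to looks roughly like this (illustrative sketch; the frame and grouping map are invented):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["r1", "r2"])
    col_groups = {"a": "g1", "b": "g1", "c": "g2"}

    # Split by columns: transpose, group the former columns, then transpose back.
    df.T.groupby(col_groups).sum().T
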
20 changes: 19 additions & 1 deletion doc/source/user_guide/pyarrow.rst
@@ -35,6 +35,23 @@ which is similar to a NumPy array. To construct these from the main pandas data
df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
df
.. note::

The string alias ``"string[pyarrow]"`` maps to ``pd.StringDtype("pyarrow")`` which is not equivalent to
specifying ``dtype=pd.ArrowDtype(pa.string())``. Generally, operations on the data will behave similarly
except ``pd.StringDtype("pyarrow")`` can return NumPy-backed nullable types while ``pd.ArrowDtype(pa.string())``
will return :class:`ArrowDtype`.

.. ipython:: python
import pyarrow as pa
data = list("abc")
ser_sd = pd.Series(data, dtype="string[pyarrow]")
ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
ser_ad.dtype == ser_sd.dtype
ser_sd.str.contains("a")
ser_ad.str.contains("a")
For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters
into :class:`ArrowDtype` to use in the ``dtype`` parameter.

@@ -106,6 +123,7 @@ The following are just some examples of operations that are accelerated by nativ

.. ipython:: python
import pyarrow as pa
ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
ser.mean()
ser + ser
@@ -115,7 +133,7 @@ The following are just some examples of operations that are accelerated by nativ
ser.isna()
ser.fillna(0)
ser_str = pd.Series(["a", "b", None], dtype="string[pyarrow]")
ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
ser_str.str.startswith("a")
from datetime import datetime
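The note in the first hunk above about passing parametrized PyArrow types into :class:`ArrowDtype` can be sketched as follows (not part of the diff; assumes pyarrow is installed):

    import pandas as pd
    import pyarrow as pa

    # A PyArrow type that takes parameters, wrapped in ArrowDtype for use as a dtype
    list_str = pd.ArrowDtype(pa.list_(pa.string()))
    ser = pd.Series([["h", "i"], ["j"]], dtype=list_str)
    ser.dtype   # roughly: list<item: string>[pyarrow]
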
2 changes: 1 addition & 1 deletion doc/source/user_guide/reshaping.rst
@@ -13,7 +13,7 @@ Reshaping by pivoting DataFrame objects

.. image:: ../_static/reshaping_pivot.png

Data is often stored in so-called "stacked" or "record" format:
Data is often stored in so-called "stacked" or "record" format. In a "record" or "wide" format typically there is one row for each subject. In the "stacked" or "long" format there are multiple rows for each subject where applicable.

.. ipython:: python
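The wide/long distinction added to that sentence, sketched with a toy frame (illustrative only):

    import pandas as pd

    # "record"/"wide" format: one row per subject
    wide = pd.DataFrame({"subject": ["s1", "s2"], "height": [170, 180], "weight": [60, 80]})

    # "stacked"/"long" format: several rows per subject, one per measured variable
    long = wide.melt(id_vars="subject", var_name="variable", value_name="value")

    # pivot back from long to wide
    long.pivot(index="subject", columns="variable", values="value")
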
