Skip to content

Commit

Permalink
Merge branch 'main' into today_now_error
Browse files Browse the repository at this point in the history
  • Loading branch information
dannyi96 authored Jul 31, 2022
2 parents 5582fd8 + 6e1a040 commit 485615c
Show file tree
Hide file tree
Showing 113 changed files with 1,172 additions and 443 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/32-bit-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
/opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \
. ~/virtualenvs/pandas-dev/bin/activate && \
python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \
pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \
pip install cython==0.29.30 numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \
python setup.py build_ext -q -j2 && \
python -m pip install --no-build-isolation --no-use-pep517 -e . && \
export PANDAS_CI=1 && \
Expand Down
31 changes: 31 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: CodeQL
on:
schedule:
# every day at midnight
- cron: "0 0 * * *"

concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true

jobs:
analyze:
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write

strategy:
fail-fast: false
matrix:
language:
- python

steps:
- uses: actions/checkout@v3
- uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
- uses: github/codeql-action/autobuild@v2
- uses: github/codeql-action/analyze@v2
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ repos:
language: python
additional_dependencies:
- flake8==4.0.1
- flake8-pyi==22.5.1
- flake8-pyi==22.7.0
- id: future-annotations
name: import annotations from __future__
entry: 'from __future__ import annotations'
Expand Down
4 changes: 2 additions & 2 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ fi
### DOCSTRINGS ###
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05
MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi
Expand Down
26 changes: 15 additions & 11 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,23 +50,25 @@
# sphinxext.

extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.doctest",
"sphinx.ext.extlinks",
"sphinx.ext.todo",
"numpydoc", # handle NumPy documentation formatted docstrings
"contributors", # custom pandas extension
"IPython.sphinxext.ipython_directive",
"IPython.sphinxext.ipython_console_highlighting",
"matplotlib.sphinxext.plot_directive",
"sphinx.ext.intersphinx",
"numpydoc",
"sphinx_copybutton",
"sphinx_panels",
"sphinx_toggleprompt",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.doctest",
"sphinx.ext.extlinks",
"sphinx.ext.ifconfig",
"sphinx.ext.intersphinx",
"sphinx.ext.linkcode",
"sphinx.ext.mathjax",
"sphinx.ext.todo",
"nbsphinx",
"sphinx_panels",
"contributors", # custom pandas extension
]

exclude_patterns = [
Expand Down Expand Up @@ -144,6 +146,9 @@
# already loads it
panels_add_bootstrap_css = False

# https://sphinx-toggleprompt.readthedocs.io/en/stable/#offset
toggleprompt_offset_right = 35

# Add any paths that contain templates here, relative to this directory.
templates_path = ["../_templates"]

Expand Down Expand Up @@ -453,7 +458,6 @@
# extlinks alias
extlinks = {
"issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"),
"wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "),
}


Expand Down
10 changes: 5 additions & 5 deletions doc/source/ecosystem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ A good implementation for Python users is `has2k1/plotnine <https://github.com/h
`IPython Vega <https://github.com/vega/ipyvega>`__ leverages `Vega
<https://github.com/vega/vega>`__ to create plots within Jupyter Notebook.

`Plotly <https://poltly.com/python>`__
`Plotly <https://plotly.com/python>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

`Plotly’s <https://poltly.com/>`__ `Python API <https://poltly.com/python/>`__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js <https://d3js.org/>`__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn <https://poltly.com/python/matplotlib-to-plotly-tutorial/>`__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks <https://plotly.com/ipython-notebooks/>`__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline <https://poltly.com/python/offline/>`__, or `on-premise <https://poltly.com/product/enterprise/>`__ accounts for private use.
`Plotly’s <https://plotly.com/>`__ `Python API <https://plotly.com/python/>`__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js <https://d3js.org/>`__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn <https://plotly.com/python/matplotlib-to-plotly-tutorial/>`__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks <https://plotly.com/ipython-notebooks/>`__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline <https://plotly.com/python/offline/>`__, or `on-premise <https://plotly.com/product/enterprise/>`__ accounts for private use.

`Lux <https://github.com/lux-org/lux>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -591,12 +591,12 @@ Library Accessor Classes Description
Development tools
-----------------

`pandas-stubs <https://github.com/VirtusLab/pandas-stubs>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`pandas-stubs <https://github.com/pandas-dev/pandas-stubs>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

While the pandas repository is partially typed, the package itself doesn't expose this information for external use.
Install pandas-stubs to enable basic type coverage of pandas API.

Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`.

See installation and usage instructions on the `github page <https://github.com/VirtusLab/pandas-stubs>`__.
See installation and usage instructions on the `github page <https://github.com/pandas-dev/pandas-stubs>`__.
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ applied to integers, so no ``str`` is used.
Based on the index name of the row (``307``) and the column (``Name``),
we can do a selection using the ``loc`` operator, introduced in the
`tutorial on subsetting <3_subset_data.ipynb>`__.
:ref:`tutorial on subsetting <10min_tut_03_subset>`.

.. raw:: html

Expand Down
2 changes: 1 addition & 1 deletion doc/source/reference/general_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ Importing from other DataFrame libraries
.. autosummary::
:toctree: api/

api.exchange.from_dataframe
api.interchange.from_dataframe
32 changes: 19 additions & 13 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,10 @@ index_col : int, str, sequence of int / str, or False, optional, default ``None`
string name or column index. If a sequence of int / str is given, a
MultiIndex is used.

Note: ``index_col=False`` can be used to force pandas to *not* use the first
column as the index, e.g. when you have a malformed file with delimiters at
the end of each line.
.. note::
``index_col=False`` can be used to force pandas to *not* use the first
column as the index, e.g. when you have a malformed file with delimiters at
the end of each line.

The default value of ``None`` instructs pandas to guess. If the number of
fields in the column header row is equal to the number of fields in the body
Expand Down Expand Up @@ -182,15 +183,16 @@ General parsing configuration
+++++++++++++++++++++++++++++

dtype : Type name or dict of column -> type, default ``None``
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
with suitable ``na_values`` settings to preserve and
not interpret dtype.
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32, 'c': 'Int64'}``
Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve
and not interpret dtype. If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 1.5.0

Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.
Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
Parser engine to use. The C and pyarrow engines are faster, while the python engine
is currently more feature-complete. Multithreading is currently only supported by
Expand Down Expand Up @@ -283,7 +285,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
* If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
column.
* If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.
A fast-path exists for iso8601-formatted dates.

.. note::
A fast-path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default ``False``
If ``True`` and parse_dates is enabled for a column, attempt to infer the
datetime format to speed up the processing.
Expand Down Expand Up @@ -1593,8 +1597,10 @@ of multi-columns indices.
pd.read_csv("mi2.csv", header=[0, 1], index_col=0)
Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*.
.. note::
If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will
be *lost*.

.. ipython:: python
:suppress:
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,7 @@ Other Deprecations
- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
- Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on these methods (:issue:`47728`)
- Deprecated arguments ``*args`` and ``**kwargs`` in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. (:issue:`47836`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.performance:
Expand Down Expand Up @@ -802,6 +803,7 @@ Performance improvements
- Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)
- Performance improvement in ``argmax`` and ``argmin`` for :class:`arrays.SparseArray` (:issue:`34197`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -1023,6 +1025,7 @@ Reshaping
- Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` results in sorted index (:issue:`17041`)
- Bug in :meth:`concat` when ``axis=1`` and ``sort=False`` where the resulting Index was a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`46675`)
- Bug in :meth:`wide_to_long` raises when ``stubnames`` is missing in columns and ``i`` contains string dtype column (:issue:`46044`)
- Bug in :meth:`DataFrame.join` with categorical index results in unexpected reordering (:issue:`47812`)

Sparse
^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ dependencies:
- pytest-cython # doctest
- sphinx
- sphinx-panels
- sphinx-copybutton
- types-python-dateutil
- types-PyMySQL
- types-pytz
Expand All @@ -128,3 +129,4 @@ dependencies:
- jupyterlab >=3.4,<4
- pip:
- jupyterlite==0.1.0b10
- sphinx-toggleprompt
5 changes: 3 additions & 2 deletions pandas/_config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ class RegisteredOption(NamedTuple):

class OptionError(AttributeError, KeyError):
"""
Exception for pandas.options, backwards compatible with KeyError
checks.
Exception raised for pandas.options.
Backwards compatible with KeyError checks.
"""


Expand Down
2 changes: 0 additions & 2 deletions pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from __future__ import annotations

from typing import Any

import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def group_any_all(
val_test: Literal["any", "all"],
skipna: bool,
) -> None: ...
def group_add(
def group_sum(
out: np.ndarray, # complexfloating_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
Expand Down
30 changes: 15 additions & 15 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def group_median_float64(
ndarray[intp_t] indexer
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

ngroups = len(counts)
N, K = (<object>values).shape
Expand Down Expand Up @@ -502,7 +502,7 @@ def group_any_all(


# ----------------------------------------------------------------------
# group_add, group_prod, group_var, group_mean, group_ohlc
# group_sum, group_prod, group_var, group_mean, group_ohlc
# ----------------------------------------------------------------------

ctypedef fused mean_t:
Expand All @@ -511,17 +511,17 @@ ctypedef fused mean_t:
complex64_t
complex128_t

ctypedef fused add_t:
ctypedef fused sum_t:
mean_t
object


@cython.wraparound(False)
@cython.boundscheck(False)
def group_add(
add_t[:, ::1] out,
def group_sum(
sum_t[:, ::1] out,
int64_t[::1] counts,
ndarray[add_t, ndim=2] values,
ndarray[sum_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
Expand All @@ -531,8 +531,8 @@ def group_add(
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
add_t val, t, y
add_t[:, ::1] sumx, compensation
sum_t val, t, y
sum_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

Expand All @@ -546,7 +546,7 @@ def group_add(

N, K = (<object>values).shape

if add_t is object:
if sum_t is object:
# NB: this does not use 'compensation' like the non-object track does.
for i in range(N):
lab = labels[i]
Expand Down Expand Up @@ -588,10 +588,10 @@ def group_add(

# not nan
# With dt64/td64 values, values have been cast to float64
# instead if int64 for group_add, but the logic
# instead if int64 for group_sum, but the logic
# is otherwise the same as in _treat_as_na
if val == val and not (
add_t is float64_t
sum_t is float64_t
and is_datetimelike
and val == <float64_t>NPY_NAT
):
Expand Down Expand Up @@ -677,7 +677,7 @@ def group_var(
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
Expand Down Expand Up @@ -745,7 +745,7 @@ def group_mean(
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`.
min_count : Py_ssize_t
Only used in add and prod. Always -1.
Only used in sum and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
mask : ndarray[bool, ndim=2], optional
Expand All @@ -766,7 +766,7 @@ def group_mean(
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
Expand Down Expand Up @@ -821,7 +821,7 @@ def group_ohlc(
Py_ssize_t i, j, N, K, lab
floating val

assert min_count == -1, "'min_count' only used in add and prod"
assert min_count == -1, "'min_count' only used in sum and prod"

if len(labels) == 0:
return
Expand Down
Loading

0 comments on commit 485615c

Please sign in to comment.