Skip to content

Commit

Permalink
Merge branch 'main' into 54984
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Sep 6, 2023
2 parents b3b3fdf + 88683e9 commit a5b0716
Show file tree
Hide file tree
Showing 60 changed files with 678 additions and 257 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.285
rev: v0.0.287
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
Expand All @@ -34,7 +34,7 @@ repos:
alias: ruff-selected-autofixes
args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix]
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.7'
rev: 'v2.9.1'
hooks:
- id: vulture
entry: python scripts/run_vulture.py
Expand Down Expand Up @@ -84,7 +84,7 @@ repos:
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
]
- repo: https://github.com/pylint-dev/pylint
rev: v3.0.0a6
rev: v3.0.0a7
hooks:
- id: pylint
stages: [manual]
Expand Down Expand Up @@ -124,7 +124,7 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
rev: v0.6.7
rev: v0.6.8
hooks:
- id: sphinx-lint
- repo: local
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks):
self.array[i] = "foo"

def time_setitem_list(self, multiple_chunks):
indexer = list(range(0, 50)) + list(range(-1000, 0, 50))
indexer = list(range(50)) + list(range(-1000, 0, 50))
self.array[indexer] = ["foo"] * len(indexer)

def time_setitem_slice(self, multiple_chunks):
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,14 +360,14 @@ class MergeCategoricals:
def setup(self):
self.left_object = DataFrame(
{
"X": np.random.choice(range(0, 10), size=(10000,)),
"X": np.random.choice(range(10), size=(10000,)),
"Y": np.random.choice(["one", "two", "three"], size=(10000,)),
}
)

self.right_object = DataFrame(
{
"X": np.random.choice(range(0, 10), size=(10000,)),
"X": np.random.choice(range(10), size=(10000,)),
"Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
}
)
Expand Down
22 changes: 22 additions & 0 deletions doc/cheatsheet/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Pandas Cheat Sheet

The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013.
To create the PDF version, within PowerPoint, simply do a "Save As"
and pick "PDF" as the format.

This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf).

| Topic | PDF | PPT |
|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Pandas_Cheat_Sheet | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet.pptx" target="_parent"><img src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |
| Pandas_Cheat_Sheet_JA | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://github.com/pandas-dev/pandas/blob/main/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx" target="_parent"><img src="https://img.shields.io/badge/Open in PPT-B7472A?style=flat-square&logo=microsoft-powerpoint&logoColor=white"/></a> |


**Alternative**

Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets
developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats.

| Topic | PDF | Streamlit | Google Colab |
|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Pandas | <a href="https://github.com/fralfaro/DS-Cheat-Sheets/blob/main/docs/files/pandas_cs.pdf" target="_parent"><img src="https://img.shields.io/badge/Open in PDF-%23FF0000.svg?style=flat-square&logo=adobe&logoColor=white"/></a> | <a href="https://ds-cheat-sheets-pandas.streamlit.app/" target="_parent"><img src="https://static.streamlit.io/badges/streamlit_badge_black_white.svg"/></a> | <a href="https://colab.research.google.com/github/fralfaro/DS-Cheat-Sheets/blob/main/docs/examples/pandas/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> |
8 changes: 0 additions & 8 deletions doc/cheatsheet/README.txt

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ between square brackets ``[]``.
</ul>

.. note::
If you are familiar to Python
If you are familiar with Python
:ref:`dictionaries <python:tut-dictionaries>`, the selection of a
single column is very similar to selection of dictionary values based on
single column is very similar to the selection of dictionary values based on
the key.

You can create a ``Series`` from scratch as well:
Expand Down
62 changes: 49 additions & 13 deletions doc/source/whatsnew/v0.15.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,61 @@ API changes
- Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though
a lexically sorted index will have a better performance. (:issue:`2646`)

.. ipython:: python
:okexcept:
:okwarning:
.. code-block:: ipython
In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1],
...: 'joe':['x', 'x', 'z', 'y'],
...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
...:
df = pd.DataFrame({'jim':[0, 0, 1, 1],
'joe':['x', 'x', 'z', 'y'],
'jolie':np.random.rand(4)}).set_index(['jim', 'joe'])
df
df.index.lexsort_depth
In [2]: df
Out[2]:
jolie
jim joe
0 x 0.126970
x 0.966718
1 z 0.260476
y 0.897237
[4 rows x 1 columns]
In [3]: df.index.lexsort_depth
Out[3]: 1
# in prior versions this would raise a KeyError
# will now show a PerformanceWarning
df.loc[(1, 'z')]
In [4]: df.loc[(1, 'z')]
Out[4]:
jolie
jim joe
1 z 0.260476
[1 rows x 1 columns]
# lexically sorting
df2 = df.sort_index()
df2
df2.index.lexsort_depth
df2.loc[(1,'z')]
In [5]: df2 = df.sort_index()
In [6]: df2
Out[6]:
jolie
jim joe
0 x 0.126970
x 0.966718
1 y 0.897237
z 0.260476
[4 rows x 1 columns]
In [7]: df2.index.lexsort_depth
Out[7]: 2
In [8]: df2.loc[(1,'z')]
Out[8]:
jolie
jim joe
1 z 0.260476
[1 rows x 1 columns]
- Bug in unique of Series with ``category`` dtype, which returned all categories regardless
of whether they were "used" or not (see :issue:`8559` for the discussion).
Expand Down
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v2.1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :func:`concat` when :class:`DataFrame` 's have two different extension dtypes (:issue:`54848`)
- Fixed regression in :func:`merge` when merging over a PyArrow string index (:issue:`54894`)
- Fixed regression in :func:`read_csv` when ``usecols`` is given and ``dtypes`` is a dict for ``engine="python"`` (:issue:`54868`)
- Fixed regression in :func:`read_csv` when ``delim_whitespace`` is True (:issue:`54918`, :issue:`54931`)
- Fixed regression in :meth:`.GroupBy.get_group` raising for ``axis=1`` (:issue:`54858`)
- Fixed regression in :meth:`DataFrame.__setitem__` raising ``AssertionError`` when setting a :class:`Series` with a partial :class:`MultiIndex` (:issue:`54875`)
- Fixed regression in :meth:`DataFrame.filter` not respecting the order of elements for ``filter`` (:issue:`54980`)
- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite (:issue:`54877`)
- Fixed regression in :meth:`MultiIndex.append` raising when appending overlapping :class:`IntervalIndex` levels (:issue:`54934`)
- Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`)
- Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`)
- Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`)
- Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`)

Expand All @@ -28,7 +32,9 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`)
- Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`)
- Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`)

.. ---------------------------------------------------------------------------
.. _whatsnew_211.other:
Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)

Categorical
^^^^^^^^^^^
Expand Down Expand Up @@ -245,7 +246,7 @@ Groupby/resample/rolling

Reshaping
^^^^^^^^^
-
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
-

Sparse
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/window/indexers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
break
# end bound is previous end
# or current index
elif index[end[i - 1]] == end_bound and not right_closed:
end[i] = end[i - 1] + 1
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
end[i] = i + 1
else:
Expand Down
7 changes: 6 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
Index,
MultiIndex,
)
from pandas.util.version import Version

if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -191,6 +192,10 @@ def pytest_collection_modifyitems(items, config) -> None:
item.add_marker(pytest.mark.arraymanager)


hypothesis_health_checks = [hypothesis.HealthCheck.too_slow]
if Version(hypothesis.__version__) >= Version("6.83.2"):
hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors)

# Hypothesis
hypothesis.settings.register_profile(
"ci",
Expand All @@ -202,7 +207,7 @@ def pytest_collection_modifyitems(items, config) -> None:
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
deadline=None,
suppress_health_check=(hypothesis.HealthCheck.too_slow,),
suppress_health_check=tuple(hypothesis_health_checks),
)
hypothesis.settings.load_profile("ci")

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2192,11 +2192,11 @@ def _str_rstrip(self, to_strip=None):
return type(self)(result)

def _str_removeprefix(self, prefix: str):
# TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
# starts_with = pc.starts_with(self._pa_array, pattern=prefix)
# removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
# result = pc.if_else(starts_with, removed, self._pa_array)
# return type(self)(result)
if not pa_version_under13p0:
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
result = pc.if_else(starts_with, removed, self._pa_array)
return type(self)(result)
predicate = lambda val: val.removeprefix(prefix)
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
Expand Down
69 changes: 43 additions & 26 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
lib,
missing as libmissing,
)
from pandas.compat import pa_version_under7p0
from pandas.compat import (
pa_version_under7p0,
pa_version_under13p0,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -417,7 +420,7 @@ def _str_isupper(self):

def _str_len(self):
result = pc.utf8_length(self._pa_array)
return Int64Dtype().__from_arrow__(result)
return self._convert_int_dtype(result)

def _str_lower(self):
return type(self)(pc.utf8_lower(self._pa_array))
Expand Down Expand Up @@ -446,6 +449,43 @@ def _str_rstrip(self, to_strip=None):
result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
return type(self)(result)

def _str_removeprefix(self, prefix: str):
    """
    Strip ``prefix`` from the start of every string that begins with it.

    Parameters
    ----------
    prefix : str
        Prefix to remove.

    Returns
    -------
    Same type as ``self``; elements that do not start with ``prefix``
    are taken unchanged from the underlying array.
    """
    # NOTE(review): ``pa_version_under13p0`` presumably means the installed
    # pyarrow is older than 13.0 -- in that case defer to the parent class.
    if pa_version_under13p0:
        return super()._str_removeprefix(prefix)

    has_prefix = pc.starts_with(self._pa_array, pattern=prefix)
    # Drop the first ``len(prefix)`` code units of every element; the mask
    # then picks the trimmed value only where the prefix matched.
    trimmed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
    return type(self)(pc.if_else(has_prefix, trimmed, self._pa_array))

def _str_removesuffix(self, suffix: str):
    """
    Strip ``suffix`` from the end of every string that ends with it.

    Parameters
    ----------
    suffix : str
        Suffix to remove. An empty suffix leaves the data unchanged.

    Returns
    -------
    Same type as ``self``; elements that do not end with ``suffix`` are
    taken unchanged from the underlying array.
    """
    if not suffix:
        # ``stop=-len("")`` is ``stop=0``, which would slice every selected
        # element down to an empty string instead of leaving it untouched.
        return type(self)(self._pa_array)
    ends_with = pc.ends_with(self._pa_array, pattern=suffix)
    # Keep everything except the last ``len(suffix)`` code units.
    removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
    result = pc.if_else(ends_with, removed, self._pa_array)
    return type(self)(result)

def _str_count(self, pat: str, flags: int = 0):
    """
    Count the matches of the regex ``pat`` in each string.

    Parameters
    ----------
    pat : str
        Regular expression pattern.
    flags : int, default 0
        Regex flags. Any non-zero value is delegated to the parent
        implementation.

    Returns
    -------
    Per-element counts, converted by ``self._convert_int_dtype``.
    """
    if not flags:
        counts = pc.count_substring_regex(self._pa_array, pat)
        return self._convert_int_dtype(counts)
    # ``flags`` is not passed to the pyarrow kernel here, so defer upward.
    return super()._str_count(pat, flags)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
    """
    Return, for each string, the lowest index at which ``sub`` is found.

    Parameters
    ----------
    sub : str
        Substring to search for.
    start : int, default 0
        Left edge of the search window.
    end : int or None, default None
        Right edge of the search window.

    Returns
    -------
    Per-element indexes (``-1`` where ``sub`` is not found), converted by
    ``self._convert_int_dtype``.

    Notes
    -----
    Only two cases use the pyarrow kernels directly: the unbounded search
    (``start == 0`` and ``end is None``) and a window with a positive
    ``start`` and an explicit ``end``. Every other combination, including
    a negative ``start``, is delegated to the parent implementation.
    """
    if start == 0 and end is None:
        result = pc.find_substring(self._pa_array, sub)
    elif start > 0 and end is not None:
        window = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
        result = pc.find_substring(window, sub)
        not_found = pc.equal(result, -1)
        # Matches are reported relative to the sliced window, which begins
        # at ``start``; shift them back into original-string coordinates.
        # (A negative ``start`` would need a per-element offset, hence the
        # ``start > 0`` guard above.)
        offset_result = pc.add(result, start)
        result = pc.if_else(not_found, result, offset_result)
    else:
        return super()._str_find(sub, start, end)
    return self._convert_int_dtype(result)

def _convert_int_dtype(self, result):
    """
    Convert a pyarrow integer result through ``Int64Dtype.__from_arrow__``.

    Shared hook used by the string methods that produce integer results
    (length, count, find).
    """
    int64_dtype = Int64Dtype()
    return int64_dtype.__from_arrow__(result)


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
Expand Down Expand Up @@ -526,34 +566,11 @@ def _str_map(
return lib.map_infer_mask(arr, f, mask.view("uint8"))

def _convert_int_dtype(self, result):
result = result.to_numpy()
if result.dtype == np.int32:
result = result.astype(np.int64)
return result

def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat).to_numpy()
return self._convert_int_dtype(result)

def _str_len(self):
result = pc.utf8_length(self._pa_array).to_numpy()
return self._convert_int_dtype(result)

def _str_find(self, sub: str, start: int = 0, end: int | None = None):
if start != 0 and end is not None:
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
result = pc.find_substring(slices, sub)
not_found = pc.equal(result, -1)
offset_result = pc.add(result, end - start)
result = pc.if_else(not_found, result, offset_result)
elif start == 0 and end is None:
slices = self._pa_array
result = pc.find_substring(slices, sub)
else:
return super()._str_find(sub, start, end)
return self._convert_int_dtype(result.to_numpy())

def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
return result.to_numpy(np.bool_, na_value=False)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,8 @@ def array(self) -> ExtensionArray:
types, this is the actual array. For NumPy native types, this
is a thin (no copy) wrapper around :class:`numpy.ndarray`.
``.array`` differs ``.values`` which may require converting the
data to a different form.
``.array`` differs from ``.values``, which may require converting
the data to a different form.
See Also
--------
Expand Down
Loading

0 comments on commit a5b0716

Please sign in to comment.