-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
API: Return BoolArray for string ops when backed by StringArray #30239
Changes from all commits
0623200
1d9317e
efe7923
f6d9e4e
5f9dbe9
b83d677
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -2,7 +2,7 @@ | |||||
from functools import wraps | ||||||
import re | ||||||
import textwrap | ||||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List | ||||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union | ||||||
import warnings | ||||||
|
||||||
import numpy as np | ||||||
|
@@ -140,7 +140,7 @@ def _map_stringarray( | |||||
The value to use for missing values. By default, this is | ||||||
the original value (NA). | ||||||
dtype : Dtype | ||||||
The result dtype to use. Specifying this aviods an intermediate | ||||||
The result dtype to use. Specifying this avoids an intermediate | ||||||
object-dtype allocation. | ||||||
|
||||||
Returns | ||||||
|
@@ -150,14 +150,20 @@ def _map_stringarray( | |||||
an ndarray. | ||||||
|
||||||
""" | ||||||
from pandas.arrays import IntegerArray, StringArray | ||||||
from pandas.arrays import IntegerArray, StringArray, BooleanArray | ||||||
|
||||||
mask = isna(arr) | ||||||
|
||||||
assert isinstance(arr, StringArray) | ||||||
arr = np.asarray(arr) | ||||||
|
||||||
if is_integer_dtype(dtype): | ||||||
if is_integer_dtype(dtype) or is_bool_dtype(dtype): | ||||||
constructor: Union[Type[IntegerArray], Type[BooleanArray]] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Optional but less verbose if you put the Union inside of the Type |
||||||
if is_integer_dtype(dtype): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. MyPy apparently doesn't like this...
Any suggestions on how to please the type checker? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea if you have assignment in an So before the block you can just declare There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nvm, I think I got it. |
||||||
constructor = IntegerArray | ||||||
else: | ||||||
constructor = BooleanArray | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a way to combine the above if/else reading this is super confusing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see an easy way. |
||||||
|
||||||
na_value_is_na = isna(na_value) | ||||||
if na_value_is_na: | ||||||
na_value = 1 | ||||||
|
@@ -167,21 +173,20 @@ def _map_stringarray( | |||||
mask.view("uint8"), | ||||||
convert=False, | ||||||
na_value=na_value, | ||||||
dtype=np.dtype("int64"), | ||||||
dtype=np.dtype(dtype), | ||||||
) | ||||||
|
||||||
if not na_value_is_na: | ||||||
mask[:] = False | ||||||
|
||||||
return IntegerArray(result, mask) | ||||||
return constructor(result, mask) | ||||||
|
||||||
elif is_string_dtype(dtype) and not is_object_dtype(dtype): | ||||||
# i.e. StringDtype | ||||||
result = lib.map_infer_mask( | ||||||
arr, func, mask.view("uint8"), convert=False, na_value=na_value | ||||||
) | ||||||
return StringArray(result) | ||||||
# TODO: BooleanArray | ||||||
else: | ||||||
# This is when the result type is object. We reach this when | ||||||
# -> We know the result type is truly object (e.g. .encode returns bytes | ||||||
|
@@ -297,7 +302,7 @@ def str_count(arr, pat, flags=0): | |||||
""" | ||||||
regex = re.compile(pat, flags=flags) | ||||||
f = lambda x: len(regex.findall(x)) | ||||||
return _na_map(f, arr, dtype=int) | ||||||
return _na_map(f, arr, dtype="int64") | ||||||
|
||||||
|
||||||
def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): | ||||||
|
@@ -1363,7 +1368,7 @@ def str_find(arr, sub, start=0, end=None, side="left"): | |||||
else: | ||||||
f = lambda x: getattr(x, method)(sub, start, end) | ||||||
|
||||||
return _na_map(f, arr, dtype=int) | ||||||
return _na_map(f, arr, dtype="int64") | ||||||
|
||||||
|
||||||
def str_index(arr, sub, start=0, end=None, side="left"): | ||||||
|
@@ -1383,7 +1388,7 @@ def str_index(arr, sub, start=0, end=None, side="left"): | |||||
else: | ||||||
f = lambda x: getattr(x, method)(sub, start, end) | ||||||
|
||||||
return _na_map(f, arr, dtype=int) | ||||||
return _na_map(f, arr, dtype="int64") | ||||||
|
||||||
|
||||||
def str_pad(arr, width, side="left", fillchar=" "): | ||||||
|
@@ -3208,7 +3213,7 @@ def rindex(self, sub, start=0, end=None): | |||||
len, | ||||||
docstring=_shared_docs["len"], | ||||||
forbidden_types=None, | ||||||
dtype=int, | ||||||
dtype="int64", | ||||||
returns_string=False, | ||||||
) | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1825,7 +1825,7 @@ def test_extractall_same_as_extract_subject_index(self): | |
|
||
def test_empty_str_methods(self): | ||
empty_str = empty = Series(dtype=object) | ||
empty_int = Series(dtype=int) | ||
empty_int = Series(dtype="int64") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can anyone with a 32-bit platform confirm the behavior on master for We may have been inconsistent before, and returned int32 for empty, but int64 for non-empty. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is correct; not sure we return a platform int save maybe niche cases |
||
empty_bool = Series(dtype=bool) | ||
empty_bytes = Series(dtype=object) | ||
|
||
|
@@ -3524,6 +3524,12 @@ def test_string_array(any_string_method): | |
assert result.dtype == "string" | ||
result = result.astype(object) | ||
|
||
elif expected.dtype == "object" and lib.is_bool_array( | ||
expected.values, skipna=True | ||
): | ||
assert result.dtype == "boolean" | ||
result = result.astype(object) | ||
|
||
elif expected.dtype == "float" and expected.isna().any(): | ||
assert result.dtype == "Int64" | ||
result = result.astype("float") | ||
|
@@ -3549,3 +3555,19 @@ def test_string_array_numeric_integer_array(method, expected): | |
result = getattr(s.str, method)("a") | ||
expected = Series(expected, dtype="Int64") | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"method,expected", | ||
[ | ||
("isdigit", [False, None, True]), | ||
("isalpha", [True, None, False]), | ||
("isalnum", [True, None, True]), | ||
("isdigit", [False, None, True]), | ||
], | ||
) | ||
def test_string_array_boolean_array(method, expected): | ||
s = Series(["a", None, "1"], dtype="string") | ||
result = getattr(s.str, method)() | ||
expected = Series(expected, dtype="boolean") | ||
tm.assert_series_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe add a doc-link here (can be followup)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm going to hold off on this. I'm planning to restructure the docs for integer / boolean / NA once all these PRs are in.