From 0623200fe060df9ec324aa775cf9a7ffd3bdf429 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 11:02:29 -0600 Subject: [PATCH 1/5] API: Return BoolArray for string ops --- doc/source/user_guide/text.rst | 9 ++++++++- pandas/core/strings.py | 18 +++++++++++------- pandas/tests/test_strings.py | 22 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 072871f89bdae..032d51c5a388f 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -74,6 +74,7 @@ These are places where the behavior of ``StringDtype`` objects differ from l. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, rather than either int or float dtype, depending on the presence of NA values. + Methods returning **boolean** output will return a nullable boolean dtype. .. ipython:: python @@ -89,7 +90,13 @@ l. For ``StringDtype``, :ref:`string accessor methods` s.astype(object).str.count("a") s.astype(object).dropna().str.count("a") - When NA values are present, the output dtype is float64. + When NA values are present, the output dtype is float64. Similarly for + methods returning boolean values. + + .. ipython:: python + + s.str.isdigit() + s.str.match("a") 2. Some string methods, like :meth:`Series.str.decode` are not available on ``StringArray`` because ``StringArray`` only holds strings, not diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d4d8be90402b7..724a85a83ef26 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -140,7 +140,7 @@ def _map_stringarray( The value to use for missing values. By default, this is the original value (NA). dtype : Dtype - The result dtype to use. Specifying this aviods an intermediate + The result dtype to use. Specifying this avoids an intermediate object-dtype allocation. Returns @@ -150,14 +150,19 @@ def _map_stringarray( an ndarray. """ - from pandas.arrays import IntegerArray, StringArray + from pandas.arrays import IntegerArray, StringArray, BooleanArray mask = isna(arr) assert isinstance(arr, StringArray) arr = np.asarray(arr) - if is_integer_dtype(dtype): + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 @@ -167,13 +172,13 @@ def _map_stringarray( mask.view("uint8"), convert=False, na_value=na_value, - dtype=np.dtype("int64"), + dtype=np.dtype(dtype), ) if not na_value_is_na: mask[:] = False - return IntegerArray(result, mask) + return constructor(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype @@ -181,7 +186,6 @@ def _map_stringarray( arr, func, mask.view("uint8"), convert=False, na_value=na_value ) return StringArray(result) - # TODO: BooleanArray else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes @@ -3352,7 +3356,7 @@ def rindex(self, sub, start=0, end=None): Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. + Series.str.isdigit : Check whether all characters are digits.h Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 584550d562b0d..0bd316f41cbe2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3524,6 +3524,12 @@ def test_string_array(any_string_method): assert result.dtype == "string" result = result.astype(object) + elif expected.dtype == "object" and lib.is_bool_array( + expected.values, skipna=True + ): + assert result.dtype == "boolean" + result = result.astype(object) + elif expected.dtype == "float" and expected.isna().any(): assert result.dtype == "Int64" result = result.astype("float") @@ -3549,3 +3555,19 @@ def test_string_array_numeric_integer_array(method, expected): result = getattr(s.str, method)("a") expected = Series(expected, dtype="Int64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("isdigit", [False, None, True]), + ("isalpha", [True, None, False]), + ("isalnum", [True, None, True]), + ("isdigit", [False, None, True]), + ], +) +def test_string_array_boolean_array(method, expected): + s = Series(["a", None, "1"], dtype="string") + result = getattr(s.str, method)() + expected = Series(expected, dtype="boolean") + tm.assert_series_equal(result, expected) From 1d9317eb60dcae86aa86fe622638bca95eff0f86 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 11:38:34 -0600 Subject: [PATCH 2/5] int64 --- pandas/core/strings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 724a85a83ef26..295e7d6a74316 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -301,7 +301,7 @@ def str_count(arr, pat, flags=0): """ regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): @@ -1367,7 +1367,7 @@ def str_find(arr, sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_index(arr, sub, start=0, end=None, side="left"): @@ -1387,7 +1387,7 @@ def str_index(arr, sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_pad(arr, width, side="left", fillchar=" "): @@ -3212,7 +3212,7 @@ def rindex(self, sub, start=0, end=None): len, docstring=_shared_docs["len"], forbidden_types=None, - dtype=int, + dtype="int64", returns_string=False, ) From f6d9e4e1bc292037f5a53b9d73c3281ffadfcf23 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 12:10:41 -0600 Subject: [PATCH 3/5] typo --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 295e7d6a74316..acec80a028cb2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3356,7 +3356,7 @@ def rindex(self, sub, start=0, end=None): Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits.h + Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. From 5f9dbe9a66daa027e11be3c7b21a845cd91fbf9a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 12:11:49 -0600 Subject: [PATCH 4/5] fixup --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 0bd316f41cbe2..d8b9c5983618e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1825,7 +1825,7 @@ def test_extractall_same_as_extract_subject_index(self): def test_empty_str_methods(self): empty_str = empty = Series(dtype=object) - empty_int = Series(dtype=int) + empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) empty_bytes = Series(dtype=object) From b83d677048ec550dff52e61a51abf068feb226d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 12 Dec 2019 15:02:29 -0600 Subject: [PATCH 5/5] types --- pandas/core/strings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index acec80a028cb2..6ef42eb185e49 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union import warnings import numpy as np @@ -158,6 +158,7 @@ def _map_stringarray( arr = np.asarray(arr) if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] if is_integer_dtype(dtype): constructor = IntegerArray else: