From 0bf654eb3b534cf3acc5d992e1f7bcf2c8cad7ed Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Dec 2019 20:51:52 +0100 Subject: [PATCH 1/7] API: BooleanArray any/all with NA logic --- pandas/core/arrays/boolean.py | 24 ++++++++++++++++++++++++ pandas/tests/arrays/test_boolean.py | 17 +++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index aec3397bddd16..2662dbc9e8ce3 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -557,6 +557,30 @@ def _values_for_argsort(self) -> np.ndarray: data[self._mask] = -1 return data + def any(self, skipna=True): + # nv.validate_any((), dict(out=out, keepdims=keepdims)) + valid_values = self._data[~self._mask] + if skipna: + return valid_values.any().item() + else: + result = valid_values.any().item() + if result is True or len(self) == 0: + return result + else: + return self.dtype.na_value + + def all(self, skipna=True): + # nv.validate_any((), dict(out=out, keepdims=keepdims)) + valid_values = self._data[~self._mask] + if skipna: + return valid_values.all().item() + else: + result = valid_values.all().item() + if result is False or len(self) == 0: + return result + else: + return self.dtype.na_value + @classmethod def _create_logical_method(cls, op): def logical_method(self, other): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index d9cbf3f5b4172..99f9ebeef0079 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -531,6 +531,23 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): assert isinstance(getattr(s, op)(), np.float64) +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + ], +) +def 
test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + arr = pd.array(values, dtype="boolean") + assert arr.any() is exp_any + assert arr.all() is exp_all + assert arr.any(skipna=False) is exp_any_noskip + assert arr.all(skipna=False) is exp_all_noskip + + # TODO when BooleanArray coerces to object dtype numpy array, need to do conversion # manually in the indexing code # def test_indexing_boolean_mask(): From 043f257202e62e7f26a5863b36f60061f1c9d1b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Dec 2019 20:57:30 +0100 Subject: [PATCH 2/7] use in Series implementation --- pandas/core/arrays/boolean.py | 10 +++++----- pandas/tests/arrays/test_boolean.py | 11 ++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 2662dbc9e8ce3..1d704246abd3b 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -670,6 +670,10 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): + + if name in {"any", "all"}: + return getattr(self, name)(skipna=skipna, **kwargs) + data = self._data mask = self._mask @@ -681,12 +685,8 @@ def _reduce(self, name, skipna=True, **kwargs): op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) - # if we have a boolean op, don't coerce - if name in ["any", "all"]: - pass - # if we have numeric op that would result in an int, coerce to int if possible - elif name in ["sum", "prod"] and notna(result): + if name in ["sum", "prod"] and notna(result): int_result = np.int64(result) if int_result == result: result = int_result diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 99f9ebeef0079..cbab07b03339d 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -541,11 +541,12 @@ def test_reductions_return_types(dropna, 
data, all_numeric_reductions): ], ) def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): - arr = pd.array(values, dtype="boolean") - assert arr.any() is exp_any - assert arr.all() is exp_all - assert arr.any(skipna=False) is exp_any_noskip - assert arr.all(skipna=False) is exp_all_noskip + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip # TODO when BooleanArray coerces to object dtype numpy array, need to do conversion From 12d2729865ec48aa8c9b34e7f1684473e5f51089 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Dec 2019 21:04:31 +0100 Subject: [PATCH 3/7] clean-up numpy scalars --- pandas/core/arrays/boolean.py | 12 ++++++------ pandas/tests/arrays/test_boolean.py | 6 ++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 1d704246abd3b..0e0f214fcaba4 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -561,10 +561,10 @@ def any(self, skipna=True): # nv.validate_any((), dict(out=out, keepdims=keepdims)) valid_values = self._data[~self._mask] if skipna: - return valid_values.any().item() + return valid_values.any() else: - result = valid_values.any().item() - if result is True or len(self) == 0: + result = valid_values.any() + if result or len(self) == 0: return result else: return self.dtype.na_value @@ -573,10 +573,10 @@ def all(self, skipna=True): # nv.validate_any((), dict(out=out, keepdims=keepdims)) valid_values = self._data[~self._mask] if skipna: - return valid_values.all().item() + return valid_values.all() else: - result = valid_values.all().item() - if result is False or len(self) == 0: + result = valid_values.all() + if not result or len(self) == 0: return result else: return self.dtype.na_value diff --git 
a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index cbab07b03339d..9947c587571a5 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -541,6 +541,12 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): ], ) def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + for con in [pd.array, pd.Series]: a = con(values, dtype="boolean") assert a.any() is exp_any From 15471d85b42a13aebe637c4ad15f14e4664a6454 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Dec 2019 21:50:38 +0100 Subject: [PATCH 4/7] handle numpy compat --- pandas/core/arrays/boolean.py | 11 +++++++---- pandas/tests/arrays/test_boolean.py | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0e0f214fcaba4..d4dc85ef9f995 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -6,6 +6,7 @@ from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe @@ -557,8 +558,9 @@ def _values_for_argsort(self) -> np.ndarray: data[self._mask] = -1 return data - def any(self, skipna=True): - # nv.validate_any((), dict(out=out, keepdims=keepdims)) + def any(self, skipna=True, **kwargs): + kwargs.pop("axis", None) + nv.validate_any((), kwargs) valid_values = self._data[~self._mask] if skipna: return valid_values.any() @@ -569,8 +571,9 @@ def any(self, skipna=True): else: return 
self.dtype.na_value - def all(self, skipna=True): - # nv.validate_any((), dict(out=out, keepdims=keepdims)) + def all(self, skipna=True, **kwargs): + kwargs.pop("axis", None) + nv.validate_all((), kwargs) valid_values = self._data[~self._mask] if skipna: return valid_values.all() diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 9947c587571a5..0c9451c7575db 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -554,6 +554,9 @@ def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): assert a.any(skipna=False) is exp_any_noskip assert a.all(skipna=False) is exp_all_noskip + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + # TODO when BooleanArray coerces to object dtype numpy array, need to do conversion # manually in the indexing code From e59e91f68847a22a4a24ec2cffad0928964692d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Dec 2019 11:04:07 +0100 Subject: [PATCH 5/7] more efficient implementation with copy + putmask instead of filter --- pandas/core/arrays/boolean.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 6a9e4530e078a..92a1014dc048d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -564,11 +564,13 @@ def _values_for_argsort(self) -> np.ndarray: def any(self, skipna=True, **kwargs): kwargs.pop("axis", None) nv.validate_any((), kwargs) - valid_values = self._data[~self._mask] + + values = self._data.copy() + np.putmask(values, self._mask, False) + result = values.any() if skipna: - return valid_values.any() + return result else: - result = valid_values.any() if result or len(self) == 0: return result else: @@ -577,11 +579,14 @@ def any(self, skipna=True, **kwargs): def all(self, skipna=True, **kwargs): kwargs.pop("axis", None) nv.validate_all((), kwargs) - valid_values = 
self._data[~self._mask] + + values = self._data.copy() + np.putmask(values, self._mask, True) + result = values.all() + if skipna: - return valid_values.all() + return result else: - result = valid_values.all() if not result or len(self) == 0: return result else: From 24797d4f16a7bebf91532815337ac7c96baeb8a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Dec 2019 18:33:06 +0100 Subject: [PATCH 6/7] add docstrings --- ci/code_checks.sh | 4 ++ pandas/core/arrays/boolean.py | 106 ++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cfe55f1e05f71..2b9ea7dc220d7 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -281,6 +281,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/arrays/string_.py RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests arrays/boolean.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/arrays/boolean.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DOCSTRINGS ### diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 92a1014dc048d..acdb71229b173 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -562,6 +562,60 @@ def _values_for_argsort(self) -> np.ndarray: return data def any(self, skipna=True, **kwargs): + """ + Return whether any element is True. + + Returns False unless there is at least one element that is True. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic <boolean.kleene>` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is True, otherwise NA will be returned + if there are NA's present. 
+ **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.any : NumPy version of this method. + BooleanArray.all : Return whether all elements are True. + + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, False, True]).any() + True + >>> pd.array([True, False, pd.NA]).any() + True + >>> pd.array([False, False, pd.NA]).any() + False + >>> pd.array([], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="boolean").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA]).any(skipna=False) + True + >>> pd.array([False, False, pd.NA]).any(skipna=False) + NA + """ kwargs.pop("axis", None) nv.validate_any((), kwargs) @@ -577,6 +631,58 @@ def any(self, skipna=True, **kwargs): return self.dtype.na_value def all(self, skipna=True, **kwargs): + """ + Return whether all elements are True. + + Returns True unless there is at least one element that is False. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic <boolean.kleene>` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is False, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.all : NumPy version of this method. 
+ BooleanArray.any : Return whether any element is True. + + Examples + -------- + + The result indicates whether all elements are True (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA]).all() + True + >>> pd.array([True, False, pd.NA]).all() + False + >>> pd.array([], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="boolean").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA]).all(skipna=False) + NA + >>> pd.array([True, False, pd.NA]).all(skipna=False) + False + """ kwargs.pop("axis", None) nv.validate_all((), kwargs) From ec7d072112a20973fce8e1af1c8e22d526e671ff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Dec 2019 14:33:31 +0100 Subject: [PATCH 7/7] type --- pandas/core/arrays/boolean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index acdb71229b173..0cdc1bed34ecb 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -561,7 +561,7 @@ def _values_for_argsort(self) -> np.ndarray: data[self._mask] = -1 return data - def any(self, skipna=True, **kwargs): + def any(self, skipna: bool = True, **kwargs): """ Return whether any element is True. @@ -630,7 +630,7 @@ def any(self, skipna=True, **kwargs): else: return self.dtype.na_value - def all(self, skipna=True, **kwargs): + def all(self, skipna: bool = True, **kwargs): """ Return whether all elements are True.