diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cfe55f1e05f71..2b9ea7dc220d7 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -281,6 +281,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/arrays/string_.py RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests arrays/boolean.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/arrays/boolean.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DOCSTRINGS ### diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 31dc656eb4b25..0cdc1bed34ecb 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -6,6 +6,7 @@ from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe @@ -560,6 +561,143 @@ def _values_for_argsort(self) -> np.ndarray: data[self._mask] = -1 return data + def any(self, skipna: bool = True, **kwargs): + """ + Return whether any element is True. + + Returns False unless there is at least one element that is True. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is True, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.any : Numpy version of this method. + BooleanArray.all : Return whether all elements are True. + + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, False, True]).any() + True + >>> pd.array([True, False, pd.NA]).any() + True + >>> pd.array([False, False, pd.NA]).any() + False + >>> pd.array([], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="boolean").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA]).any(skipna=False) + True + >>> pd.array([False, False, pd.NA]).any(skipna=False) + NA + """ + kwargs.pop("axis", None) + nv.validate_any((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, False) + result = values.any() + if skipna: + return result + else: + if result or len(self) == 0: + return result + else: + return self.dtype.na_value + + def all(self, skipna: bool = True, **kwargs): + """ + Return whether all elements are True. + + Returns True unless there is at least one element that is False. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is False, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.all : Numpy version of this method. + BooleanArray.any : Return whether any element is True. + + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA]).all() + True + >>> pd.array([True, False, pd.NA]).all() + False + >>> pd.array([], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="boolean").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA]).all(skipna=False) + NA + >>> pd.array([True, False, pd.NA]).all(skipna=False) + False + """ + kwargs.pop("axis", None) + nv.validate_all((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, True) + result = values.all() + + if skipna: + return result + else: + if not result or len(self) == 0: + return result + else: + return self.dtype.na_value + @classmethod def _create_logical_method(cls, op): def logical_method(self, other): @@ -656,6 +794,10 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): + + if name in {"any", "all"}: + return getattr(self, name)(skipna=skipna, **kwargs) + data = self._data mask = self._mask @@ -667,12 +809,8 @@ def _reduce(self, name, skipna=True, **kwargs): op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) - # if we have a boolean op, don't coerce - if name in ["any", "all"]: - pass - # if we have numeric op that would result in an int, coerce to int if possible - elif name in ["sum", "prod"] and notna(result): + if name in ["sum", "prod"] and notna(result): int_result = np.int64(result) if int_result == result: result = int_result diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index d2f1a4a60ada8..7266e7f741a51 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -700,6 +700,33 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): assert isinstance(getattr(s, op)(), np.float64) +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + # TODO when BooleanArray coerces to object dtype numpy array, need to do conversion # manually in the indexing code # def test_indexing_boolean_mask():