Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: BooleanArray any/all with NA logic #30062

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,30 @@ def _values_for_argsort(self) -> np.ndarray:
data[self._mask] = -1
return data

def any(self, skipna=True):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens with np.any with this? Do we need any keywords for compatibility?

Is the expected behavior here different from nanops.nanany? / nanops.nanall?

# nv.validate_any((), dict(out=out, keepdims=keepdims))
valid_values = self._data[~self._mask]
if skipna:
return valid_values.any()
else:
result = valid_values.any()
if result or len(self) == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use not len(self)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In pandas/core, we actually use the len(..) == 0 pattern more than not len(..). I personally also find that easier to read.

(the typical pythonic idiom recommendation is about doing if (not) container: instead of if (not) len(container) for empty containers, but that of course doesn't hold for arrays)

return result
else:
return self.dtype.na_value

def all(self, skipna=True):
# nv.validate_any((), dict(out=out, keepdims=keepdims))
valid_values = self._data[~self._mask]
if skipna:
return valid_values.all()
else:
result = valid_values.all()
if not result or len(self) == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

return result
else:
return self.dtype.na_value

@classmethod
def _create_logical_method(cls, op):
def logical_method(self, other):
Expand Down Expand Up @@ -646,6 +670,10 @@ def cmp_method(self, other):
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):

if name in {"any", "all"}:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we usually use lists for these checks

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this file we actually use more in {} than in [] (both are used), but since Tom and I wrote this file, that's probably not an argument ;)
Happy to change it, purely performance wise the set is faster (but this is about nanoseconds of course ..)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh, I'm probably to blame for the sets :) I like them more for membership tests, though it doesn't matter for small sets.

return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
mask = self._mask

Expand All @@ -657,12 +685,8 @@ def _reduce(self, name, skipna=True, **kwargs):
op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

# if we have a boolean op, don't coerce
if name in ["any", "all"]:
pass

# if we have numeric op that would result in an int, coerce to int if possible
elif name in ["sum", "prod"] and notna(result):
if name in ["sum", "prod"] and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,30 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
assert isinstance(getattr(s, op)(), np.float64)


@pytest.mark.parametrize(
"values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
[
([True, pd.NA], True, True, True, pd.NA),
([False, pd.NA], False, False, pd.NA, False),
([pd.NA], False, True, pd.NA, pd.NA),
([], False, True, False, True),
],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
# the methods return numpy scalars
exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any)
exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all)
exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip)
exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip)

for con in [pd.array, pd.Series]:
a = con(values, dtype="boolean")
assert a.any() is exp_any
assert a.all() is exp_all
assert a.any(skipna=False) is exp_any_noskip
assert a.all(skipna=False) is exp_all_noskip


# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion
# manually in the indexing code
# def test_indexing_boolean_mask():
Expand Down