Skip to content

Commit

Permalink
ENH: add BooleanArray extension array (pandas-dev#29555)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored and TomAugspurger committed Nov 25, 2019
1 parent 7d7f885 commit bb904cb
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 7 deletions.
83 changes: 83 additions & 0 deletions doc/source/boolean.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
.. currentmodule:: pandas

.. _boolean:

**************************
Nullable Boolean Data Type
**************************

.. versionadded:: 1.0.0

.. _boolean.kleene:

Kleene Logic
------------

:class:`arrays.BooleanArray` implements Kleene logic (sometimes called three-valued logic) for
logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or).

Here's a table for ``and``.

========== =========== ============
left value right value output value
========== =========== ============
True       True        True
True       False       False
True       NA          NA
False      False       False
False      NA          False
NA         NA          NA
========== =========== ============


And for ``or``

========== =========== ============
left value right value output value
========== =========== ============
True       True        True
True       False       True
True       NA          True
False      False       False
False      NA          NA
NA         NA          NA
========== =========== ============

And for ``xor``

========== =========== ============
left value right value output value
========== =========== ============
True       True        False
True       False       True
True       NA          NA
False      False       False
False      NA          NA
NA         NA          NA
========== =========== ============

When an ``NA`` is present in an operation, the output value is ``NA`` only if
the result cannot be determined solely based on the other input. For example,
``True | NA`` is ``True``, because both ``True | True`` and ``True | False``
are ``True``. In that case, we don't actually need to consider the value
of the ``NA``.

On the other hand, ``True & NA`` is ``NA``. The result depends on whether
the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``,
but ``True & False`` is ``False``, so we can't determine the output.


This differs from how ``np.nan`` behaves in logical operations. Pandas treats
``np.nan`` as *always false in the output*.

In ``or``

.. ipython:: python

   pd.Series([True, False, np.nan], dtype="object") | True
   pd.Series([True, False, np.nan], dtype="boolean") | True

In ``and``

.. ipython:: python

   pd.Series([True, False, np.nan], dtype="object") & True
   pd.Series([True, False, np.nan], dtype="boolean") & True
1 change: 1 addition & 0 deletions doc/source/index.rst.template
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ See the :ref:`overview` for more detail about what's in the library.
* :doc:`user_guide/missing_data`
* :doc:`user_guide/categorical`
* :doc:`user_guide/integer_na`
* :doc:`user_guide/boolean`
* :doc:`user_guide/visualization`
* :doc:`user_guide/computation`
* :doc:`user_guide/groupby`
Expand Down
40 changes: 33 additions & 7 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
represented by 2 numpy arrays: a boolean array with the data and
a boolean array with the mask (True indicating missing).
BooleanArray implements Kleene logic (sometimes called three-value
logic) for logical operations. See :ref:`boolean` for more.
To construct a BooleanArray from generic array-like input, use
:func:`pandas.array` specifying ``dtype="boolean"`` (see examples
below).
Expand Down Expand Up @@ -560,10 +563,12 @@ def logical_method(self, other):
return NotImplemented

other = lib.item_from_zerodim(other)
mask = None
omask = mask = None
other_is_booleanarray = isinstance(other, BooleanArray)

if isinstance(other, BooleanArray):
other, mask = other._data, other._mask
if other_is_booleanarray:
other, omask = other._data, other._mask
mask = omask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
Expand All @@ -576,17 +581,38 @@ def logical_method(self, other):

# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)
if lib.is_scalar(other) and np.isnan(
other
): # TODO(NA): change to libmissing.NA:
result = self._data
mask = True
else:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask
else:
mask = self._mask | mask

# Kleene-logic adjustments to the mask.
if op.__name__ in {"or_", "ror_"}:
mask[result] = False
elif op.__name__ in {"and_", "rand_"}:
mask[~self._data & ~self._mask] = False
if other_is_booleanarray:
mask[~other & ~omask] = False
elif lib.is_scalar(other) and np.isnan(other): # TODO(NA): change to NA
mask[:] = True
# Do we ever assume that masked values are False?
result[mask] = False
elif op.__name__ in {"xor", "rxor"}:
# Do we ever assume that masked values are False?
result[mask] = False

return BooleanArray(result, mask)

name = "__{name}__".format(name=op.__name__)
Expand Down
88 changes: 88 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,13 +391,101 @@ def test_scalar(self, data, all_logical_operators):

def test_array(self, data, all_logical_operators):
    """Compare ``data`` against all-True operands of several container types.

    The operand is built, in turn, as a "boolean" ``pd.array``, a plain
    ``np.array`` and a "boolean" ``pd.Series``, each the same length as
    ``data``, and handed to ``self._compare_other`` with the operator name.
    """
    op_name = all_logical_operators
    # NOTE(review): this substring check also matches any name containing
    # "xor" -- confirm that skipping those as well is intended.
    if "or" in op_name:
        pytest.skip("confusing")

    def all_true():
        return [True] * len(data)

    operand_factories = (
        lambda: pd.array(all_true(), dtype="boolean"),
        lambda: np.array(all_true()),
        lambda: pd.Series(all_true(), dtype="boolean"),
    )
    # Build each operand right before it is used, one container at a time.
    for build in operand_factories:
        self._compare_other(data, op_name, build())

def test_kleene_or(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a | b
expected = pd.array(
[True, True, True, True, False, None, True, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)

result = b | a
tm.assert_extension_array_equal(result, expected)

def test_kleene_or_scalar(self):
a = pd.array([True, False, None], dtype="boolean")
result = a | np.nan # TODO: pd.NA
expected = pd.array([True, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan | a # TODO: pd.NA
tm.assert_extension_array_equal(result, expected)

@pytest.mark.parametrize(
    "left,right,expected",
    [
        ([True, False, None], True, [True, True, True]),
        ([True, False, None], False, [True, False, None]),
        ([True, False, None], np.nan, [True, None, None]),
        # TODO: pd.NA
    ],
)
def test_kleene_or_cases(self, left, right, expected):
    """Check ``|`` for each parametrized operand pair, in both orders.

    Operands given as lists are converted to "boolean" arrays; anything
    else is used as-is.
    """

    def as_boolean(obj):
        # Only list inputs are wrapped; scalars pass through untouched.
        if isinstance(obj, list):
            return pd.array(obj, dtype="boolean")
        return obj

    left = as_boolean(left)
    right = as_boolean(right)
    expected = pd.array(expected, dtype="boolean")

    for lhs, rhs in ((left, right), (right, left)):
        tm.assert_extension_array_equal(lhs | rhs, expected)

def test_kleene_and(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a & b
expected = pd.array(
[True, False, None, False, False, False, None, False, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)

result = b & a
tm.assert_extension_array_equal(result, expected)

def test_kleene_and_scalar(self):
a = pd.array([True, False, None], dtype="boolean")
result = a & np.nan # TODO: pd.NA
expected = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan & a # TODO: pd.na
tm.assert_extension_array_equal(result, expected)

def test_kleene_xor(self):
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a ^ b
expected = pd.array(
[False, True, None, True, False, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)

result = b ^ a
tm.assert_extension_array_equal(result, expected)

def test_kleene_scalar(self):
a = pd.array([True, False, None], dtype="boolean")
result = a ^ np.nan # TODO: pd.NA
expected = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan ^ a # TODO: pd.NA
tm.assert_extension_array_equal(result, expected)


class TestComparisonOps(BaseOpsUtil):
def _compare_other(self, data, op_name, other):
Expand Down

0 comments on commit bb904cb

Please sign in to comment.