DOC/TST: Indexing with NA raises (#30308)

pandas-dev · Jan 3, 2020 · 59b431f · 59b431f
1 parent 7b35099
commit 59b431f
Show file tree

Hide file tree

Showing 21 changed files with 304 additions and 29 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
         language: python_venv
         additional_dependencies: [flake8-comprehensions>=3.1.0]
 -   repo: https://github.com/pre-commit/mirrors-isort
-    rev: v4.3.20
+    rev: v4.3.21
     hooks:
     -   id: isort
         language: python_venv

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -131,6 +131,7 @@ def setup(self):
         self.col_scalar = columns[10]
         self.bool_indexer = self.df[self.col_scalar] > 0
         self.bool_obj_indexer = self.bool_indexer.astype(object)
+        self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
 
     def time_loc(self):
         self.df.loc[self.idx_scalar, self.col_scalar]
@@ -144,6 +145,9 @@ def time_boolean_rows(self):
     def time_boolean_rows_object(self):
         self.df[self.bool_obj_indexer]
 
+    def time_boolean_rows_boolean(self):
+        self.df[self.boolean_indexer]
+
 
 class DataFrameNumericIndexing:
     def setup(self):

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
@@ -59,3 +59,11 @@ objects.
       api.extensions.ExtensionArray.nbytes
       api.extensions.ExtensionArray.ndim
       api.extensions.ExtensionArray.shape
+
+Additionally, we have some utility methods for ensuring your object
+behaves correctly.
+
+.. autosummary::
+  :toctree: api/
+
+  api.indexers.check_bool_array_indexer
diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst
@@ -14,6 +14,29 @@ Nullable Boolean Data Type
 
 .. versionadded:: 1.0.0
 
+
+.. _boolean.indexing:
+
+Indexing with NA values
+-----------------------
+
+pandas does not allow indexing with NA values. Attempting to do so
+will raise a ``ValueError``.
+
+.. ipython:: python
+   :okexcept:
+
+   s = pd.Series([1, 2, 3])
+   mask = pd.array([True, False, pd.NA], dtype="boolean")
+   s[mask]
+
+The missing values will need to be explicitly filled with True or False prior
+to using the array as a mask.
+
+.. ipython:: python
+
+   s[mask.fillna(False)]
+
 .. _boolean.kleene:
 
 Kleene Logical Operations

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -820,6 +820,7 @@ Datetimelike
 - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
 - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
 - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
+- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
 - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
 - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
 - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`)

diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py
@@ -1,2 +1,3 @@
 """Public API for Rolling Window Indexers"""
+from pandas.core.indexers import check_bool_array_indexer  # noqa: F401
 from pandas.core.window.indexers import BaseIndexer  # noqa: F401
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -29,6 +29,8 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import take
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+import pandas.core.common as com
+from pandas.core.indexers import check_bool_array_indexer
 
 if TYPE_CHECKING:
     from pandas._typing import Scalar
@@ -307,11 +309,22 @@ def _from_factorized(cls, values, original: "BooleanArray"):
     def _formatter(self, boxed=False):
         return str
 
+    @property
+    def _hasna(self) -> bool:
+        # Note: this is expensive right now! The hope is that we can
+        # make this faster by having an optional mask, but not have to change
+        # source code using it..
+        return self._mask.any()
+
     def __getitem__(self, item):
         if is_integer(item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+
+        elif com.is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
@@ -329,7 +342,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
         if dtype is None:
             dtype = object
         if is_bool_dtype(dtype):
-            if not self.isna().any():
+            if not self._hasna:
                 return self._data
             else:
                 raise ValueError(
@@ -503,7 +516,7 @@ def astype(self, dtype, copy=True):
 
         if is_bool_dtype(dtype):
             # astype_nansafe converts np.nan to True
-            if self.isna().any():
+            if self._hasna:
                 raise ValueError("cannot convert float NaN to bool")
             else:
                 return self._data.astype(dtype, copy=copy)
@@ -515,7 +528,7 @@ def astype(self, dtype, copy=True):
             )
         # for integer, error if there are missing values
         if is_integer_dtype(dtype):
-            if self.isna().any():
+            if self._hasna:
                 raise ValueError("cannot convert NA to integer")
         # for float dtype, ensure we use np.nan before casting (numpy cannot
         # deal with pd.NA)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -49,6 +49,7 @@
 from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
+from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.missing import interpolate_2d
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.sorting import nargsort
@@ -1996,10 +1997,13 @@ def __getitem__(self, key):
                 return np.nan
             else:
                 return self.categories[i]
-        else:
-            return self._constructor(
-                values=self._codes[key], dtype=self.dtype, fastpath=True
-            )
+
+        elif com.is_bool_indexer(key):
+            key = check_bool_array_indexer(self, key)
+
+        return self._constructor(
+            values=self._codes[key], dtype=self.dtype, fastpath=True
+        )
 
     def __setitem__(self, key, value):
         """

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -40,6 +40,7 @@
 from pandas.core import missing, nanops
 from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
 import pandas.core.common as com
+from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.ops.invalid import make_invalid_op
 
@@ -436,7 +437,7 @@ def __getitem__(self, key):
             return type(self)(val, dtype=self.dtype)
 
         if com.is_bool_indexer(key):
-            key = np.asarray(key, dtype=bool)
+            key = check_bool_array_indexer(self, key)
             if key.all():
                 key = slice(0, None, None)
             else:

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -26,6 +26,8 @@
 from pandas.core import nanops, ops
 from pandas.core.algorithms import take
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+import pandas.core.common as com
+from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.ops import invalid_comparison
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.tools.numeric import to_numeric
@@ -368,6 +370,10 @@ def __getitem__(self, item):
             if self._mask[item]:
                 return self.dtype.na_value
             return self._data[item]
+
+        elif com.is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         return type(self)(self._data[item], self._mask[item])
 
     def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -17,7 +17,9 @@
 from pandas import compat
 from pandas.core import nanops
 from pandas.core.algorithms import searchsorted, take, unique
+import pandas.core.common as com
 from pandas.core.construction import extract_array
+from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.missing import backfill_1d, pad_1d
 
 from .base import ExtensionArray, ExtensionOpsMixin
@@ -234,6 +236,9 @@ def __getitem__(self, item):
         if isinstance(item, type(self)):
             item = item._ndarray
 
+        elif com.is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
         result = self._ndarray[item]
         if not lib.is_scalar(item):
             result = type(self)(result)

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -738,6 +738,9 @@ def value_counts(self, dropna=True):
     # --------
 
     def __getitem__(self, key):
+        # avoid mypy issues when importing at the top-level
+        from pandas.core.indexing import check_bool_indexer
+
         if isinstance(key, tuple):
             if len(key) > 1:
                 raise IndexError("too many indices for array.")
@@ -766,7 +769,9 @@ def __getitem__(self, key):
                 else:
                     key = np.asarray(key)
 
-            if com.is_bool_indexer(key) and len(self) == len(key):
+            if com.is_bool_indexer(key):
+                key = check_bool_indexer(self, key)
+
                 return self.take(np.arange(len(key), dtype=np.int32)[key])
             elif hasattr(key, "__len__"):
                 return self.take(key)

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -111,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool:
     Returns
     -------
     bool
+        Whether `key` is a valid boolean indexer.
 
     Raises
     ------
     ValueError
         When the array is an object-dtype ndarray or ExtensionArray
         and contains missing values.
+
+    See Also
+    --------
+    check_bool_array_indexer : Check that `key`
+        is a valid mask for an array, and convert to an ndarray.
     """
-    na_msg = "cannot index with vector containing NA / NaN values"
+    na_msg = "cannot mask with array containing NA / NaN values"
     if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
         is_array_like(key) and is_extension_array_dtype(key.dtype)
     ):

diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
@@ -3,6 +3,8 @@
 """
 import numpy as np
 
+from pandas._typing import AnyArrayLike
+
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
 
@@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int:
     elif not is_list_like_indexer(indexer):
         return 1
     raise AssertionError("cannot find the length of the indexer")
+
+
+def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
+    """
+    Check if `mask` is a valid boolean indexer for `array`.
+
+    `array` and `mask` are checked to have the same length, and the
+    dtype is validated.
+
+    .. versionadded:: 1.0.0
+
+    Parameters
+    ----------
+    array : array
+        The array that's being masked.
+    mask : array
+        The boolean array that's masking.
+
+    Returns
+    -------
+    numpy.ndarray
+        The validated boolean mask.
+
+    Raises
+    ------
+    IndexError
+        When the lengths don't match.
+    ValueError
+        When `mask` cannot be converted to a bool-dtype ndarray.
+
+    See Also
+    --------
+    api.extensions.is_bool_indexer : Check if `key` is a boolean indexer.
+
+    Examples
+    --------
+    A boolean ndarray is returned when the arguments are all valid.
+
+    >>> mask = pd.array([True, False])
+    >>> arr = pd.Series([1, 2])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    array([ True, False])
+
+    An IndexError is raised when the lengths don't match.
+
+    >>> mask = pd.array([True, False, True])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    Traceback (most recent call last):
+    ...
+    IndexError: Item wrong length 3 instead of 2.
+
+    A ValueError is raised when the mask cannot be converted to
+    a bool-dtype ndarray.
+
+    >>> mask = pd.array([True, pd.NA])
+    >>> pd.api.extensions.check_bool_array_indexer(arr, mask)
+    Traceback (most recent call last):
+    ...
+    ValueError: cannot convert to bool numpy array in presence of missing values
+    """
+    result = np.asarray(mask, dtype=bool)
+    # GH26658
+    if len(result) != len(array):
+        raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
+    return result
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -22,7 +22,11 @@
 from pandas.core.dtypes.missing import _infer_fill_value, isna
 
 import pandas.core.common as com
-from pandas.core.indexers import is_list_like_indexer, length_of_indexer
+from pandas.core.indexers import (
+    check_bool_array_indexer,
+    is_list_like_indexer,
+    length_of_indexer,
+)
 from pandas.core.indexes.api import Index, InvalidIndexError
 
 
@@ -2309,13 +2313,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
     else:
         if is_sparse(result):
             result = result.to_dense()
-        result = np.asarray(result, dtype=bool)
-
-        # GH26658
-        if len(result) != len(index):
-            raise IndexError(
-                f"Item wrong length {len(result)} instead of {len(index)}."
-            )
+        result = check_bool_array_indexer(index, result)
 
     return result