Skip to content

Commit

Permalink
DOC/TST: Indexing with NA raises (#30308)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and jreback committed Jan 3, 2020
1 parent 7b35099 commit 59b431f
Show file tree
Hide file tree
Showing 21 changed files with 304 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
language: python_venv
additional_dependencies: [flake8-comprehensions>=3.1.0]
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.20
rev: v4.3.21
hooks:
- id: isort
language: python_venv
Expand Down
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def setup(self):
self.col_scalar = columns[10]
self.bool_indexer = self.df[self.col_scalar] > 0
self.bool_obj_indexer = self.bool_indexer.astype(object)
self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")

def time_loc(self):
    """Benchmark a single ``.loc`` lookup using ``idx_scalar`` and ``col_scalar``."""
    self.df.loc[self.idx_scalar, self.col_scalar]
Expand All @@ -144,6 +145,9 @@ def time_boolean_rows(self):
def time_boolean_rows_object(self):
    """Benchmark row selection with the object-dtype copy of the boolean mask."""
    # bool_obj_indexer is bool_indexer cast with .astype(object) in setup.
    self.df[self.bool_obj_indexer]

def time_boolean_rows_boolean(self):
    """Benchmark row selection with a mask cast to the ``"boolean"`` dtype."""
    # boolean_indexer is built in setup via .astype("boolean").
    self.df[self.boolean_indexer]


class DataFrameNumericIndexing:
def setup(self):
Expand Down
8 changes: 8 additions & 0 deletions doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,11 @@ objects.
api.extensions.ExtensionArray.nbytes
api.extensions.ExtensionArray.ndim
api.extensions.ExtensionArray.shape

Additionally, we have some utility methods for ensuring your object
behaves correctly.

.. autosummary::
   :toctree: api/

   api.indexers.check_bool_array_indexer
23 changes: 23 additions & 0 deletions doc/source/user_guide/boolean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,29 @@ Nullable Boolean Data Type

.. versionadded:: 1.0.0


.. _boolean.indexing:

Indexing with NA values
-----------------------

pandas does not allow indexing with NA values. Attempting to do so
will raise a ``ValueError``.

.. ipython:: python
   :okexcept:

   s = pd.Series([1, 2, 3])
   mask = pd.array([True, False, pd.NA], dtype="boolean")
   s[mask]

The missing values will need to be explicitly filled with True or False prior
to using the array as a mask.

.. ipython:: python

   s[mask.fillna(False)]

.. _boolean.kleene:

Kleene Logical Operations
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,7 @@ Datetimelike
- Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
- Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
- Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`)
Expand Down
1 change: 1 addition & 0 deletions pandas/api/indexers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""Public API for rolling window indexers and boolean-mask indexer validation."""
from pandas.core.indexers import check_bool_array_indexer  # noqa: F401
from pandas.core.window.indexers import BaseIndexer  # noqa: F401
19 changes: 16 additions & 3 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer

if TYPE_CHECKING:
from pandas._typing import Scalar
Expand Down Expand Up @@ -307,11 +309,22 @@ def _from_factorized(cls, values, original: "BooleanArray"):
def _formatter(self, boxed=False):
    """Return the builtin ``str`` as the formatter; ``boxed`` is not consulted."""
    return str

@property
def _hasna(self) -> bool:
    """Whether any entry of ``self._mask`` is set, i.e. the array holds an NA."""
    # Note: this is expensive right now! The hope is that we can
    # make this faster by having an optional mask, but not have to change
    # source code using it..
    return self._mask.any()

def __getitem__(self, item):
    """
    Select a scalar or a sub-array.

    An integer ``item`` yields a scalar: ``self.dtype.na_value`` when the
    mask is set at that position, otherwise the stored value. A boolean
    indexer is first validated with ``check_bool_array_indexer``. Every
    non-integer ``item`` then produces a new array of the same type built
    from the correspondingly indexed data and mask.
    """
    if is_integer(item):
        # Scalar access: a masked position is reported as the dtype's NA value.
        return self.dtype.na_value if self._mask[item] else self._data[item]

    if com.is_bool_indexer(item):
        item = check_bool_array_indexer(self, item)

    values = self._data[item]
    mask = self._mask[item]
    return type(self)(values, mask)

def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
Expand All @@ -329,7 +342,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
if dtype is None:
dtype = object
if is_bool_dtype(dtype):
if not self.isna().any():
if not self._hasna:
return self._data
else:
raise ValueError(
Expand Down Expand Up @@ -503,7 +516,7 @@ def astype(self, dtype, copy=True):

if is_bool_dtype(dtype):
# astype_nansafe converts np.nan to True
if self.isna().any():
if self._hasna:
raise ValueError("cannot convert float NaN to bool")
else:
return self._data.astype(dtype, copy=copy)
Expand All @@ -515,7 +528,7 @@ def astype(self, dtype, copy=True):
)
# for integer, error if there are missing values
if is_integer_dtype(dtype):
if self.isna().any():
if self._hasna:
raise ValueError("cannot convert NA to integer")
# for float dtype, ensure we use np.nan before casting (numpy cannot
# deal with pd.NA)
Expand Down
12 changes: 8 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
Expand Down Expand Up @@ -1996,10 +1997,13 @@ def __getitem__(self, key):
return np.nan
else:
return self.categories[i]
else:
return self._constructor(
values=self._codes[key], dtype=self.dtype, fastpath=True
)

elif com.is_bool_indexer(key):
key = check_bool_array_indexer(self, key)

return self._constructor(
values=self._codes[key], dtype=self.dtype, fastpath=True
)

def __setitem__(self, key, value):
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pandas.core import missing, nanops
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.invalid import make_invalid_op

Expand Down Expand Up @@ -436,7 +437,7 @@ def __getitem__(self, key):
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
key = check_bool_array_indexer(self, key)
if key.all():
key = slice(0, None, None)
else:
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric
Expand Down Expand Up @@ -368,6 +370,10 @@ def __getitem__(self, item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from pandas import compat
from pandas.core import nanops
from pandas.core.algorithms import searchsorted, take, unique
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.missing import backfill_1d, pad_1d

from .base import ExtensionArray, ExtensionOpsMixin
Expand Down Expand Up @@ -234,6 +236,9 @@ def __getitem__(self, item):
if isinstance(item, type(self)):
item = item._ndarray

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

result = self._ndarray[item]
if not lib.is_scalar(item):
result = type(self)(result)
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,9 @@ def value_counts(self, dropna=True):
# --------

def __getitem__(self, key):
# avoid mypy issues when importing at the top-level
from pandas.core.indexing import check_bool_indexer

if isinstance(key, tuple):
if len(key) > 1:
raise IndexError("too many indices for array.")
Expand Down Expand Up @@ -766,7 +769,9 @@ def __getitem__(self, key):
else:
key = np.asarray(key)

if com.is_bool_indexer(key) and len(self) == len(key):
if com.is_bool_indexer(key):
key = check_bool_indexer(self, key)

return self.take(np.arange(len(key), dtype=np.int32)[key])
elif hasattr(key, "__len__"):
return self.take(key)
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool:
Returns
-------
bool
Whether `key` is a valid boolean indexer.
Raises
------
ValueError
When the array is an object-dtype ndarray or ExtensionArray
and contains missing values.
See Also
--------
check_bool_array_indexer : Check that `key`
is a valid mask for an array, and convert to an ndarray.
"""
na_msg = "cannot index with vector containing NA / NaN values"
na_msg = "cannot mask with array containing NA / NaN values"
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
is_array_like(key) and is_extension_array_dtype(key.dtype)
):
Expand Down
67 changes: 67 additions & 0 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""
import numpy as np

from pandas._typing import AnyArrayLike

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries

Expand Down Expand Up @@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int:
elif not is_list_like_indexer(indexer):
return 1
raise AssertionError("cannot find the length of the indexer")


def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
    """
    Check if `mask` is a valid boolean indexer for `array`.

    `array` and `mask` are checked to have the same length, and the
    dtype is validated.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    array : array
        The array that's being masked.
    mask : array
        The boolean array that's masking.

    Returns
    -------
    numpy.ndarray
        The validated boolean mask.

    Raises
    ------
    IndexError
        When the lengths don't match.
    ValueError
        When `mask` cannot be converted to a bool-dtype ndarray.

    See Also
    --------
    pandas.core.common.is_bool_indexer : Check if `key` is a boolean indexer.

    Examples
    --------
    A boolean ndarray is returned when the arguments are all valid.

    >>> mask = pd.array([True, False])
    >>> arr = pd.Series([1, 2])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    array([ True, False])

    An IndexError is raised when the lengths don't match.

    >>> mask = pd.array([True, False, True])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    IndexError: Item wrong length 3 instead of 2.

    A ValueError is raised when the mask cannot be converted to
    a bool-dtype ndarray.

    >>> mask = pd.array([True, pd.NA])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to bool numpy array in presence of missing values
    """
    # The conversion doubles as the dtype validation: a mask that cannot be
    # represented as a bool ndarray is expected to raise here, before the
    # length comparison runs.
    result = np.asarray(mask, dtype=bool)
    # GH26658
    if len(result) != len(array):
        raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
    return result
14 changes: 6 additions & 8 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
from pandas.core.dtypes.missing import _infer_fill_value, isna

import pandas.core.common as com
from pandas.core.indexers import is_list_like_indexer, length_of_indexer
from pandas.core.indexers import (
check_bool_array_indexer,
is_list_like_indexer,
length_of_indexer,
)
from pandas.core.indexes.api import Index, InvalidIndexError


Expand Down Expand Up @@ -2309,13 +2313,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
else:
if is_sparse(result):
result = result.to_dense()
result = np.asarray(result, dtype=bool)

# GH26658
if len(result) != len(index):
raise IndexError(
f"Item wrong length {len(result)} instead of {len(index)}."
)
result = check_bool_array_indexer(index, result)

return result

Expand Down
Loading

0 comments on commit 59b431f

Please sign in to comment.