From 2d448045d32e77155d79b005d885eb104a891343 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 19 Apr 2018 19:05:03 +0200 Subject: [PATCH] API: emit warning to raise KeyError in the future for missing keys also for MultiIndex closes #17758 closes #20748 closes #20753 --- doc/source/whatsnew/v0.23.0.txt | 3 + pandas/core/indexes/base.py | 3 + pandas/core/indexing.py | 242 ++++++++++++++------------- pandas/tests/frame/test_indexing.py | 5 +- pandas/tests/indexing/common.py | 31 +++- pandas/tests/indexing/test_floats.py | 18 +- pandas/tests/indexing/test_loc.py | 23 ++- 7 files changed, 194 insertions(+), 131 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bcc442189bf11b..b64f3cbdf0fd71 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -866,6 +866,7 @@ Deprecations - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - ``DataFrame.from_items`` is deprecated. Use :func:`DataFrame.from_dict` instead, or ``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`, :issue:`17312`) +- Indexing a ``MultiIndex`` or a ``FloatIndex`` with a list containing some missing keys will now show a ``FutureWarning``, consistently with other types of indexes (:issue:`17758`). - The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) - The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) @@ -1102,6 +1103,8 @@ Indexing - :func:`Index.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) - :func:`DatetimeIndex.to_series` now accepts ``index`` and ``name`` kwargs (:issue:`18699`) - Bug in indexing non-scalar value from ``Series`` having non-unique ``Index`` will return value flattened (:issue:`17610`) +- Bug in indexing with iterator containing only missing keys, which raised no error (:issue:`20748`) +- Fixed inconsistency in ``.ix`` between list and scalar keys when index has integer dtype and does not include desired key(s) (:issue:`20753`) - Bug in ``__setitem__`` when indexing a :class:`DataFrame` with a 2-d boolean ndarray (:issue:`18582`) - Bug in ``str.extractall`` when there were no matches empty :class:`Index` was returned instead of appropriate :class:`MultiIndex` (:issue:`19034`) - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12bb09e8f8a8ab..3eb7f0d8b5ba75 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4881,6 +4881,9 @@ def _ensure_index(index_like, copy=False): if hasattr(index_like, 'name'): return Index(index_like, name=index_like.name, copy=copy) + if is_iterator(index_like): + index_like = list(index_like) + # must check for exactly list here because of strict type # check in clean_index_list if isinstance(index_like, list): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5240a4703c2423..2d5a762ef52dbb 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -186,7 +186,7 @@ def __setitem__(self, key, value): indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) - def _has_valid_type(self, k, axis): + def _validate_key(self, k, axis): raise NotImplementedError() def _has_valid_tuple(self, key): @@ -194,25 +194,13 @@ def _has_valid_tuple(self, key): for i, k in enumerate(key): if i >= self.obj.ndim: raise IndexingError('Too many indexers') - if not self._has_valid_type(k, i): + try: + self._validate_key(k, i) + except ValueError: raise ValueError("Location based indexing can only have " "[{types}] types" .format(types=self._valid_types)) - def _should_validate_iterable(self, axis=None): - """ return a boolean whether this axes needs validation for a passed - iterable - """ - if axis is None: - axis = self.axis or 0 - ax = self.obj._get_axis(axis) - if isinstance(ax, MultiIndex): - return False - elif ax.is_floating(): - return False - - return True - def _is_nested_tuple_indexer(self, tup): if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) @@ -905,38 +893,31 @@ def _multi_take(self, tup): """ try: o = self.obj - d = {a: self._convert_for_reindex(t, axis=o._get_axis_number(a)) - for t, a in zip(tup, o._AXIS_ORDERS)} - return o.reindex(**d) - except(KeyError, IndexingError): - raise self._exception + d = {} + for key, axis in zip(tup, o._AXIS_ORDERS): + ax = o._get_axis(axis) + indexer, keyarr = ax._convert_listlike_indexer(key, + kind=self.name) + if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis) + d[axis] = (ax[indexer], indexer) + continue + + # If we are trying to get actual keys from empty Series, we + # patiently wait for a KeyError later on - otherwise, convert + if len(ax) or not len(key): + key = self._convert_for_reindex(key, axis) + indexer = ax.get_indexer_for(key) + keyarr = ax.reindex(keyarr)[0] + self._validate_read_indexer(keyarr, indexer, + o._get_axis_number(axis)) + d[axis] = (keyarr, indexer) + return o._reindex_with_indexers(d, copy=True, allow_dups=True) + except(KeyError, IndexingError) as detail: + raise self._exception(detail) def _convert_for_reindex(self, key, axis=None): - if axis is None: - axis = self.axis or 0 - labels = self.obj._get_axis(axis) - - if com.is_bool_indexer(key): - key = check_bool_indexer(labels, key) - return labels[key] - else: - if isinstance(key, Index): - keyarr = labels._convert_index_indexer(key) - else: - # asarray can be unsafe, NumPy strings are weird - keyarr = com._asarray_tuplesafe(key) - - if is_integer_dtype(keyarr): - # Cast the indexer to uint64 if possible so - # that the values returned from indexing are - # also uint64. - keyarr = labels._convert_arr_indexer(keyarr) - - if not labels.is_integer(): - keyarr = _ensure_platform_int(keyarr) - return labels.take(keyarr) - - return keyarr + return key def _handle_lowerdim_multi_index_axis0(self, tup): # we have an axis0 multi-index, handle or raise @@ -1072,8 +1053,9 @@ def _getitem_axis(self, key, axis=None): if axis is None: axis = self.axis or 0 - if self._should_validate_iterable(axis): - self._has_valid_type(key, axis) + if is_iterator(key): + key = list(key) + self._validate_key(key, axis) labels = self.obj._get_axis(axis) if isinstance(key, slice): @@ -1109,8 +1091,7 @@ def _getitem_iterable(self, key, axis=None): if axis is None: axis = self.axis or 0 - if self._should_validate_iterable(axis): - self._has_valid_type(key, axis) + self._validate_key(key, axis) labels = self.obj._get_axis(axis) @@ -1124,19 +1105,17 @@ def _getitem_iterable(self, key, axis=None): indexer, keyarr = labels._convert_listlike_indexer( key, kind=self.name) if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis) return self.obj.take(indexer, axis=axis) + ax = self.obj._get_axis(axis) # existing labels are unique and indexer are unique if labels.is_unique and Index(keyarr).is_unique: + self._validate_read_indexer(key, ax.get_indexer_for(key), axis) - try: - return self.obj.reindex(keyarr, axis=axis) - except AttributeError: - - # Series - if axis != 0: - raise AssertionError('axis must be 0') - return self.obj.reindex(keyarr) + d = {axis: [ax.reindex(keyarr)[0], ax.get_indexer_for(key)]} + return self.obj._reindex_with_indexers(d, copy=True, + allow_dups=True) # existing labels are non-unique else: @@ -1153,15 +1132,53 @@ def _getitem_iterable(self, key, axis=None): result = self.obj._take(indexer[indexer != -1], axis=axis, convert=False) + self._validate_read_indexer(key, new_indexer, axis) result = result._reindex_with_indexers( {axis: [new_target, new_indexer]}, copy=True, allow_dups=True) else: + self._validate_read_indexer(key, indexer, axis) result = self.obj._take(indexer, axis=axis) return result + def _validate_read_indexer(self, key, indexer, axis): + """ + Check that indexer is OK (e.g. at least one element was found, unless + the list of keys was actually empty). + """ + ax = self.obj._get_axis(axis) + # True indicates missing values + if len(key) == 0: + return + + missing = indexer < 0 + + if np.any(missing): + if np.all(missing): + raise KeyError( + u"None of [{key}] are in the [{axis}]".format( + key=key, axis=self.obj._get_axis_name(axis))) + else: + + # we skip the warning on Categorical/Interval + # as this check is actually done (check for + # non-missing values), but a bit later in the + # code, so we want to avoid warning & then + # just raising + + _missing_key_warning = textwrap.dedent(""" + Passing list-likes to .loc or [] with any missing label will raise + KeyError in the future, you can use .reindex() as an alternative. + + See the documentation here: + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""") # noqa + + if not (ax.is_categorical() or ax.is_interval()): + warnings.warn(_missing_key_warning, + FutureWarning, stacklevel=5) + def _convert_to_indexer(self, obj, axis=None, is_setter=False): """ Convert indexing key into something we can use to do actual fancy @@ -1336,7 +1353,7 @@ def __init__(self, name, obj): DeprecationWarning, stacklevel=2) super(_IXIndexer, self).__init__(name, obj) - def _has_valid_type(self, key, axis): + def _validate_key(self, key, axis): if isinstance(key, slice): return True @@ -1352,6 +1369,33 @@ def _has_valid_type(self, key, axis): return True + def _convert_for_reindex(self, key, axis=None): + if axis is None: + axis = self.axis or 0 + labels = self.obj._get_axis(axis) + + if com.is_bool_indexer(key): + key = check_bool_indexer(labels, key) + return labels[key] + else: + if isinstance(key, Index): + keyarr = labels._convert_index_indexer(key) + else: + # asarray can be unsafe, NumPy strings are weird + keyarr = com._asarray_tuplesafe(key) + + if is_integer_dtype(keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. + keyarr = labels._convert_arr_indexer(keyarr) + + if not labels.is_integer(): + keyarr = _ensure_platform_int(keyarr) + return labels.take(keyarr) + + return keyarr + class _LocationIndexer(_NDFrameIndexer): _exception = Exception @@ -1655,7 +1699,7 @@ class _LocIndexer(_LocationIndexer): "index is integers), listlike of labels, boolean") _exception = KeyError - def _has_valid_type(self, key, axis): + def _validate_key(self, key, axis): ax = self.obj._get_axis(axis) # valid for a label where all labels are in the index @@ -1664,48 +1708,12 @@ def _has_valid_type(self, key, axis): # boolean if isinstance(key, slice): - return True + return elif com.is_bool_indexer(key): - return True - - elif is_list_like_indexer(key): - - # mi is just a passthru - if isinstance(key, tuple) and isinstance(ax, MultiIndex): - return True - - if not is_iterator(key) and len(key): + return - # True indicates missing values - missing = ax.get_indexer_for(key) < 0 - - if np.any(missing): - if len(key) == 1 or np.all(missing): - raise KeyError( - u"None of [{key}] are in the [{axis}]".format( - key=key, axis=self.obj._get_axis_name(axis))) - else: - - # we skip the warning on Categorical/Interval - # as this check is actually done (check for - # non-missing values), but a bit later in the - # code, so we want to avoid warning & then - # just raising - _missing_key_warning = textwrap.dedent(""" - Passing list-likes to .loc or [] with any missing label will raise - KeyError in the future, you can use .reindex() as an alternative. - - See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""") # noqa - - if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, - FutureWarning, stacklevel=5) - - return True - - else: + elif not is_list_like_indexer(key): def error(): if isna(key): @@ -1728,8 +1736,6 @@ def error(): except: error() - return True - def _is_scalar_access(self, key): # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat @@ -1788,11 +1794,14 @@ def _getitem_axis(self, key, axis=None): if axis is None: axis = self.axis or 0 + if is_iterator(key): + key = list(key) + labels = self.obj._get_axis(axis) key = self._get_partial_string_timestamp_match_key(key, labels) if isinstance(key, slice): - self._has_valid_type(key, axis) + self._validate_key(key, axis) return self._get_slice_axis(key, axis=axis) elif com.is_bool_indexer(key): return self._getbool_axis(key, axis=axis) @@ -1838,7 +1847,7 @@ def _getitem_axis(self, key, axis=None): return self.obj.iloc[tuple(indexer)] # fall thru to straight lookup - self._has_valid_type(key, axis) + self._validate_key(key, axis) return self._get_label(key, axis=axis) @@ -1870,7 +1879,7 @@ class _iLocIndexer(_LocationIndexer): "point is EXCLUDED), listlike of integers, boolean array") _exception = IndexError - def _has_valid_type(self, key, axis): + def _validate_key(self, key, axis): if com.is_bool_indexer(key): if hasattr(key, 'index') and isinstance(key.index, Index): if key.index.inferred_type == 'integer': @@ -1879,15 +1888,17 @@ def _has_valid_type(self, key, axis): "is not available") raise ValueError("iLocation based boolean indexing cannot use " "an indexable as a mask") - return True + return if isinstance(key, slice): - return True + return elif is_integer(key): - return self._is_valid_integer(key, axis) + assert(self._is_valid_integer(key, axis)) elif is_list_like_indexer(key): - return self._is_valid_list_like(key, axis) - return False + assert(self._is_valid_list_like(key, axis)) + else: + raise ValueError("Can only index by location with " + "a [{types}]".format(types=self._valid_types)) def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) @@ -2016,7 +2027,7 @@ def _getitem_axis(self, key, axis=None): axis = self.axis or 0 if isinstance(key, slice): - self._has_valid_type(key, axis) + self._validate_key(key, axis) return self._get_slice_axis(key, axis=axis) if isinstance(key, list): @@ -2026,7 +2037,7 @@ def _getitem_axis(self, key, axis=None): pass if com.is_bool_indexer(key): - self._has_valid_type(key, axis) + self._validate_key(key, axis) return self._getbool_axis(key, axis=axis) # a list of integers @@ -2058,11 +2069,12 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): elif is_float(obj): return self._convert_scalar_indexer(obj, axis) - elif self._has_valid_type(obj, axis): + try: + self._validate_key(obj, axis) return obj - - raise ValueError("Can only index by location with " - "a [{types}]".format(types=self._valid_types)) + except ValueError: + raise ValueError("Can only index by location with " + "a [{types}]".format(types=self._valid_types)) class _ScalarAccessIndexer(_NDFrameIndexer): diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index a8b81b1b035520..878099a17917f6 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -396,8 +396,9 @@ def test_getitem_setitem_ix_negative_integers(self): assert (self.frame['D'] == 0).all() df = DataFrame(np.random.randn(8, 4)) - with catch_warnings(record=True): - assert isna(df.ix[:, [-1]].values).all() + # ix does label-based indexing when having an integer index + with pytest.raises(KeyError): + df.ix[:, [-1]] # #1942 a = DataFrame(randn(20, 2), index=[chr(x + 65) for x in range(20)]) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index ded16224aedf2a..f7fa0584cc16bd 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -6,7 +6,8 @@ from pandas.compat import lrange from pandas.core.dtypes.common import is_scalar -from pandas import Series, DataFrame, Panel, date_range, UInt64Index +from pandas import (Series, DataFrame, Panel, date_range, UInt64Index, + Float64Index, MultiIndex) from pandas.util import testing as tm from pandas.io.formats.printing import pprint_thing @@ -29,7 +30,7 @@ class Base(object): _objs = set(['series', 'frame', 'panel']) _typs = set(['ints', 'uints', 'labels', 'mixed', - 'ts', 'floats', 'empty', 'ts_rev']) + 'ts', 'floats', 'empty', 'ts_rev', 'multi']) def setup_method(self, method): @@ -54,6 +55,32 @@ def setup_method(self, method): major_axis=UInt64Index(lrange(0, 12, 3)), minor_axis=UInt64Index(lrange(0, 16, 4))) + self.series_floats = Series(np.random.rand(4), + index=Float64Index(range(0, 8, 2))) + self.frame_floats = DataFrame(np.random.randn(4, 4), + index=Float64Index(range(0, 8, 2)), + columns=Float64Index(range(0, 12, 3))) + with catch_warnings(record=True): + self.panel_floats = Panel(np.random.rand(4, 4, 4), + items=Float64Index(range(0, 8, 2)), + major_axis=Float64Index(range(0, 12, 3)), + minor_axis=Float64Index(range(0, 16, 4))) + + m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]), + MultiIndex.from_product([[5, 6], [7, 8]]), + MultiIndex.from_product([[9, 10], [11, 12]])] + + self.series_multi = Series(np.random.rand(4), + index=m_idces[0]) + self.frame_multi = DataFrame(np.random.randn(4, 4), + index=m_idces[0], + columns=m_idces[1]) + with catch_warnings(record=True): + self.panel_multi = Panel(np.random.rand(4, 4, 4), + items=m_idces[0], + major_axis=m_idces[1], + minor_axis=m_idces[2]) + self.series_labels = Series(np.random.randn(4), index=list('abcd')) self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index e3f93924aca0da..32a56aeafc6ad0 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -685,17 +685,23 @@ def test_floating_misc(self): assert_series_equal(result1, result3) assert_series_equal(result1, result4) - result1 = s[[1.6, 5, 10]] - result2 = s.loc[[1.6, 5, 10]] - result3 = s.loc[[1.6, 5, 10]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result1 = s[[1.6, 5, 10]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result2 = s.loc[[1.6, 5, 10]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result3 = s.loc[[1.6, 5, 10]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, Series( [np.nan, 2, 4], index=[1.6, 5, 10])) - result1 = s[[0, 1, 2]] - result2 = s.loc[[0, 1, 2]] - result3 = s.loc[[0, 1, 2]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result1 = s[[0, 1, 2]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result2 = s.loc[[0, 1, 2]] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result3 = s.loc[[0, 1, 2]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, Series( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 86a5a82441ee87..3f0a9d0eea6f9a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -119,7 +119,7 @@ def test_loc_getitem_label_out_of_range(self): typs=['ints', 'uints', 'labels', 'mixed', 'ts'], fails=KeyError) self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['floats'], fails=TypeError) + typs=['floats'], fails=KeyError) self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ints', 'uints', 'mixed'], fails=KeyError) self.check_result('label range', 'loc', 20, 'ix', 20, @@ -127,7 +127,7 @@ def test_loc_getitem_label_out_of_range(self): self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], axes=0, fails=TypeError) self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'], - axes=0, fails=TypeError) + axes=0, fails=KeyError) def test_loc_getitem_label_list(self): @@ -155,13 +155,24 @@ def test_loc_getitem_label_list_with_missing(self): self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], typs=['empty'], fails=KeyError) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [0, 2, 3], 'ix', [0, 2, 3], - typs=['ints', 'uints'], axes=0, fails=KeyError) + self.check_result('list lbl', 'loc', [0, 2, 10], 'ix', [0, 2, 10], + typs=['ints', 'uints', 'floats'], + axes=0, fails=KeyError) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], - typs=['ints', 'uints'], axes=1, fails=KeyError) + typs=['ints', 'uints', 'floats'], + axes=1, fails=KeyError) self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], - typs=['ints', 'uints'], axes=2, fails=KeyError) + typs=['ints', 'uints', 'floats'], + axes=2, fails=KeyError) + + # GH 17758 - MultiIndex and missing keys + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.check_result('list lbl', 'loc', [(1, 3), (1, 4), (2, 5)], + 'ix', [(1, 3), (1, 4), (2, 5)], + typs=['multi'], + axes=0) def test_getitem_label_list_with_missing(self): s = Series(range(3), index=['a', 'b', 'c'])