diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db25bcf8113f50..05bf447dd04ca9 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -110,6 +110,8 @@ Bug Fixes - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc` which did not accept columns keys passed as non-list iterables (:issue:`21294`) +- Bug in :meth:`DataFrame.__getitem__` looking for np.nan in non-unique columns (:issue:`21428`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca572e2e56b6c4..53d5cc45c7b330 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2663,67 +2663,72 @@ def _ixs(self, i, axis=0): def __getitem__(self, key): key = com._apply_if_callable(key, self) - # shortcut if we are an actual column - is_mi_columns = isinstance(self.columns, MultiIndex) + # shortcut if the key is in columns try: - if key in self.columns and not is_mi_columns: - return self._getitem_column(key) - except: + if self.columns.is_unique and key in self.columns: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + return self._get_item_cache(key) + except (ValueError, TypeError): pass - # see if we can slice the rows + # Do we have a slicer (on rows)? indexer = convert_to_index_sliceable(self, key) if indexer is not None: - return self._getitem_slice(indexer) + return self._slice(indexer, axis=0) - if isinstance(key, (Series, np.ndarray, Index, list)): - # either boolean or fancy integer index - return self._getitem_array(key) - elif isinstance(key, DataFrame): + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): return self._getitem_frame(key) - elif is_mi_columns: - return self._getitem_multilevel(key) - else: - return self._getitem_column(key) - def _getitem_column(self, key): - """ return the actual column """ + # Do we have a (boolean) 1d indexer? + if com.is_bool_indexer(key): + return self._getitem_bool_array(key) - # get column - if self.columns.is_unique: - return self._get_item_cache(key) + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + coll_key = is_list_like(key) and (not isinstance(key, tuple) or + self.columns.nlevels > 1) - # duplicate columns & possible reduce dimensionality - result = self._constructor(self._data.get(key)) - if result.columns.is_unique: - result = result[key] + if coll_key: + indexer = self.loc._convert_to_indexer(key, axis=1) + else: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] - return result + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] - def _getitem_slice(self, key): - return self._slice(key, axis=0) + data = self._take(indexer, axis=1) - def _getitem_array(self, key): + if not coll_key: + # This test preserves #9519; the second part preserves #21309 + if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + data = data[key] + + return data + + def _getitem_bool_array(self, key): # also raises Exception if object array with NA values - if com.is_bool_indexer(key): - # warning here just in case -- previously __setitem__ was - # reindexing but __getitem__ was not; it seems more reasonable to - # go with the __setitem__ behavior since that is more consistent - # with all other indexing behavior - if isinstance(key, Series) and not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) - elif len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d.' % - (len(key), len(self.index))) - # check_bool_indexer will throw exception if Series key cannot - # be reindexed to match DataFrame rows - key = check_bool_indexer(self.index, key) - indexer = key.nonzero()[0] - return self._take(indexer, axis=0) - else: - indexer = self.loc._convert_to_indexer(key, axis=1) - return self._take(indexer, axis=1) + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn("Boolean Series key will be reindexed to match " + "DataFrame index.", UserWarning, stacklevel=3) + elif len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d.' % + (len(key), len(self.index))) + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self._take(indexer, axis=0) def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index be37e696ea0a35..1ac8fc13d961d8 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -92,45 +92,46 @@ def test_get(self): result = df.get(None) assert result is None - def test_getitem_iterator(self): + def test_loc_iterable(self): idx = iter(['A', 'B', 'C']) result = self.frame.loc[:, idx] expected = self.frame.loc[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) - idx = iter(['A', 'B', 'C']) - result = self.frame.loc[:, idx] - expected = self.frame.loc[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "idx_type", + [list, iter, Index, set, + lambda l: dict(zip(l, range(len(l)))), + lambda l: dict(zip(l, range(len(l)))).keys()], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"]) + @pytest.mark.parametrize("levels", [1, 2]) + def test_getitem_listlike(self, idx_type, levels): + # GH 21294 + + if levels == 1: + frame, missing = self.frame, 'food' + else: + # MultiIndex columns + frame = DataFrame(randn(8, 3), + columns=Index([('foo', 'bar'), ('baz', 'qux'), + ('peek', 'aboo')], + name=('sth', 'sth2'))) + missing = ('good', 'food') - def test_getitem_list(self): - self.frame.columns.name = 'foo' + keys = [frame.columns[1], frame.columns[0]] + idx = idx_type(keys) + idx_check = list(idx_type(keys)) - result = self.frame[['B', 'A']] - result2 = self.frame[Index(['B', 'A'])] + result = frame[idx] - expected = self.frame.loc[:, ['B', 'A']] - expected.columns.name = 'foo' + expected = frame.loc[:, idx_check] + expected.columns.names = frame.columns.names assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) - assert result.columns.name == 'foo' - - with tm.assert_raises_regex(KeyError, 'not in index'): - self.frame[['B', 'A', 'food']] + idx = idx_type(keys + [missing]) with tm.assert_raises_regex(KeyError, 'not in index'): - self.frame[Index(['B', 'A', 'foo'])] - - # tuples - df = DataFrame(randn(8, 3), - columns=Index([('foo', 'bar'), ('baz', 'qux'), - ('peek', 'aboo')], name=('sth', 'sth2'))) - - result = df[[('foo', 'bar'), ('baz', 'qux')]] - expected = df.iloc[:, :2] - assert_frame_equal(result, expected) - assert result.columns.names == ('sth', 'sth2') + frame[idx] def test_getitem_callable(self): # GH 12533 @@ -3016,8 +3017,7 @@ def test_type_error_multiindex(self): dg = df.pivot_table(index='i', columns='c', values=['x', 'y']) - with tm.assert_raises_regex(TypeError, "is an invalid key"): - str(dg[:, 0]) + assert_frame_equal(dg[:, 0], dg.iloc[:, [0, 2]]) index = Index(range(2), name='i') columns = MultiIndex(levels=[['x', 'y'], [0, 1]],