From ae4188df6e2f7f2c27a312ee145e299b36189b86 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 22 May 2023 12:01:02 +0100 Subject: [PATCH 1/2] Handle some corner-cases in indexing with boolean masks These must be treated specially and not accidentally converted to integers before indexing. - Closes #13015 - Closes #13265 - Closes #13270 --- python/cudf/cudf/core/column_accessor.py | 27 +++++++++++++++-- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/single_column_frame.py | 4 +++ python/cudf/cudf/tests/test_indexing.py | 32 ++++++++++++++++++-- 4 files changed, 60 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 832d5acf2de..5d792a25969 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,6 +19,7 @@ import pandas as pd from packaging.version import Version +from pandas.api.types import is_bool import cudf from cudf.core import column @@ -359,7 +360,8 @@ def get_labels_by_index(self, index: Any) -> tuple: Parameters ---------- - index : integer, integer slice, or list-like of integers + index : integer, integer slice, boolean mask, + or list-like of integers The column indexes. Returns @@ -371,6 +373,14 @@ def get_labels_by_index(self, index: Any) -> tuple: return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) + elif index and all(map(is_bool, index)): + if len(index) != len(self.names): + raise IndexError("Invalid boolean mask for column selection") + if isinstance(index, (pd.Series, cudf.Series)): + # Don't allow iloc indexing with series + raise IndexError("Cannot use Series object for iloc indexing") + # TODO: Doesn't handle on-device columns + return tuple(n for n, keep in zip(self.names, index) if keep) else: return tuple(self.names[i] for i in index) @@ -381,7 +391,8 @@ def select_by_index(self, index: Any) -> ColumnAccessor: Parameters ---------- - key : integer, integer slice, or list-like of integers + key : integer, integer slice, boolean mask, + or list-like of integers Returns ------- @@ -462,7 +473,17 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True): self._clear_cache() def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: - data = {k: self._grouped_data[k] for k in key} + # Might be a generator + key = tuple(key) + # Special-casing for boolean mask + if key and all(map(is_bool, key)): + data = dict( + item + for item, keep in zip(self._grouped_data.items(), key) + if keep + ) + else: + data = {k: self._grouped_data[k] for k in key} if self.multiindex: data = _to_flat_dict(data) return self.__class__( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7141958f62d..fe6aab8c6b0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2904,7 +2904,8 @@ def _apply_boolean_mask(self, boolean_mask): boolean_mask = cudf.core.column.as_column(boolean_mask) if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - + if (bn := len(boolean_mask)) != (n := len(self)): + raise IndexError(f"Boolean index has wrong length: {bn} not {n}") return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 5f07c57fc80..7d3d7333a84 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -403,6 +403,10 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: if is_integer_dtype(arg.dtype): return self._column.take(arg) if is_bool_dtype(arg.dtype): + if (bn := len(arg)) != (n := len(self)): + raise IndexError( + f"Boolean index has wrong length: {bn} not {n}" + ) return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 95936c48b7c..a94a38028ce 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -579,7 +579,6 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): - # create random cudf.Series np.random.seed(12) ps = pd.Series(np.random.sample(nelem)) @@ -1265,7 +1264,6 @@ def test_iloc_categorical_index(index): ) @pytest.mark.parametrize("is_dataframe", [True, False]) def test_loc_datetime_index(sli, is_dataframe): - if is_dataframe is True: pd_data = pd.DataFrame( {"a": [1, 2, 3]}, @@ -1685,3 +1683,33 @@ def test_loc_single_row_from_slice(): pdf = pd.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]}).set_index("a") df = cudf.from_pandas(pdf) assert_eq(pdf.loc[5:10], df.loc[5:10]) + + +@pytest.mark.parametrize("indexer", ["loc", "iloc"]) +@pytest.mark.parametrize( + "mask", + [[False, True], [False, False, True, True, True]], + ids=["too-short", "too-long"], +) +def test_boolean_mask_wrong_length(indexer, mask): + s = pd.Series([1, 2, 3, 4]) + + indexee = getattr(s, indexer) + with pytest.raises(IndexError): + indexee[mask] + + c = cudf.from_pandas(s) + indexee = getattr(c, indexer) + with pytest.raises(IndexError): + indexee[mask] + + +@pytest.mark.parametrize("indexer", ["loc", "iloc"]) +def test_boolean_mask_columns(indexer): + df = pd.DataFrame(np.zeros((3, 3))) + cdf = cudf.from_pandas(df) + + expect = getattr(df, indexer)[:, [True, False, True]] + got = getattr(cdf, indexer)[:, [True, False, True]] + + assert_eq(expect, got) From 163b3fcea23474173a053de65e0faf76bddcd16c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 24 May 2023 18:17:57 +0100 Subject: [PATCH 2/2] A bit more coverage and better error messages --- python/cudf/cudf/core/column_accessor.py | 18 ++++++++--- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 2 +- python/cudf/cudf/tests/test_indexing.py | 34 ++++++++++++++++++-- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 5d792a25969..6022d86e106 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -373,12 +373,16 @@ def get_labels_by_index(self, index: Any) -> tuple: return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) - elif index and all(map(is_bool, index)): - if len(index) != len(self.names): - raise IndexError("Invalid boolean mask for column selection") + elif (bn := len(index)) > 0 and all(map(is_bool, index)): + if bn != (n := len(self.names)): + raise IndexError( + f"Boolean mask has wrong length: {bn} not {n}" + ) if isinstance(index, (pd.Series, cudf.Series)): # Don't allow iloc indexing with series - raise IndexError("Cannot use Series object for iloc indexing") + raise NotImplementedError( + "Cannot use Series object for mask iloc indexing" + ) # TODO: Doesn't handle on-device columns return tuple(n for n, keep in zip(self.names, index) if keep) else: @@ -476,7 +480,11 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: # Might be a generator key = tuple(key) # Special-casing for boolean mask - if key and all(map(is_bool, key)): + if (bn := len(key)) > 0 and all(map(is_bool, key)): + if bn != (n := len(self.names)): + raise IndexError( + f"Boolean mask has wrong length: {bn} not {n}" + ) data = dict( item for item, keep in zip(self._grouped_data.items(), key) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fe6aab8c6b0..bc5ed5afcac 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2905,7 +2905,7 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") if (bn := len(boolean_mask)) != (n := len(self)): - raise IndexError(f"Boolean index has wrong length: {bn} not {n}") + raise IndexError(f"Boolean mask has wrong length: {bn} not {n}") return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7d3d7333a84..0320e404e16 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -405,7 +405,7 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: if is_bool_dtype(arg.dtype): if (bn := len(arg)) != (n := len(self)): raise IndexError( - f"Boolean index has wrong length: {bn} not {n}" + f"Boolean mask has wrong length: {bn} not {n}" ) return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index a94a38028ce..b3ec6cdcf25 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1708,8 +1708,36 @@ def test_boolean_mask_wrong_length(indexer, mask): def test_boolean_mask_columns(indexer): df = pd.DataFrame(np.zeros((3, 3))) cdf = cudf.from_pandas(df) - - expect = getattr(df, indexer)[:, [True, False, True]] - got = getattr(cdf, indexer)[:, [True, False, True]] + mask = [True, False, True] + expect = getattr(df, indexer)[:, mask] + got = getattr(cdf, indexer)[:, mask] assert_eq(expect, got) + + +@pytest.mark.parametrize("indexer", ["loc", "iloc"]) +@pytest.mark.parametrize( + "mask", + [[False, True], [False, False, True, True, True]], + ids=["too-short", "too-long"], +) +def test_boolean_mask_columns_wrong_length(indexer, mask): + df = pd.DataFrame(np.zeros((3, 3))) + cdf = cudf.from_pandas(df) + + with pytest.raises(IndexError): + getattr(df, indexer)[:, mask] + with pytest.raises(IndexError): + getattr(cdf, indexer)[:, mask] + + +def test_boolean_mask_columns_iloc_series(): + df = pd.DataFrame(np.zeros((3, 3))) + cdf = cudf.from_pandas(df) + + mask = pd.Series([True, False, True], dtype=bool) + with pytest.raises(NotImplementedError): + df.iloc[:, mask] + + with pytest.raises(NotImplementedError): + cdf.iloc[:, mask]