Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Handle some corner-cases in indexing with boolean masks
Browse files Browse the repository at this point in the history
These must be treated specially and not accidentally converted to
integers before indexing.

- Closes rapidsai#13015
- Closes rapidsai#13265
- Closes rapidsai#13270
  • Loading branch information
wence- committed May 24, 2023
1 parent 209ab6e commit ae4188d
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 6 deletions.
27 changes: 24 additions & 3 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import pandas as pd
from packaging.version import Version
from pandas.api.types import is_bool

import cudf
from cudf.core import column
Expand Down Expand Up @@ -359,7 +360,8 @@ def get_labels_by_index(self, index: Any) -> tuple:
Parameters
----------
index : integer, integer slice, or list-like of integers
index : integer, integer slice, boolean mask,
or list-like of integers
The column indexes.
Returns
Expand All @@ -371,6 +373,14 @@ def get_labels_by_index(self, index: Any) -> tuple:
return self.names[start:stop:step]
elif pd.api.types.is_integer(index):
return (self.names[index],)
elif index and all(map(is_bool, index)):
if len(index) != len(self.names):
raise IndexError("Invalid boolean mask for column selection")
if isinstance(index, (pd.Series, cudf.Series)):
# Don't allow iloc indexing with series
raise IndexError("Cannot use Series object for iloc indexing")
# TODO: Doesn't handle on-device columns
return tuple(n for n, keep in zip(self.names, index) if keep)
else:
return tuple(self.names[i] for i in index)

Expand All @@ -381,7 +391,8 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
Parameters
----------
key : integer, integer slice, or list-like of integers
key : integer, integer slice, boolean mask,
or list-like of integers
Returns
-------
Expand Down Expand Up @@ -462,7 +473,17 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
self._clear_cache()

def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
data = {k: self._grouped_data[k] for k in key}
# Might be a generator
key = tuple(key)
# Special-casing for boolean mask
if key and all(map(is_bool, key)):
data = dict(
item
for item, keep in zip(self._grouped_data.items(), key)
if keep
)
else:
data = {k: self._grouped_data[k] for k in key}
if self.multiindex:
data = _to_flat_dict(data)
return self.__class__(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2904,7 +2904,8 @@ def _apply_boolean_mask(self, boolean_mask):
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

if (bn := len(boolean_mask)) != (n := len(self)):
raise IndexError(f"Boolean index has wrong length: {bn} not {n}")
return self._from_columns_like_self(
libcudf.stream_compaction.apply_boolean_mask(
list(self._index._columns + self._columns), boolean_mask
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
if is_integer_dtype(arg.dtype):
return self._column.take(arg)
if is_bool_dtype(arg.dtype):
if (bn := len(arg)) != (n := len(self)):
raise IndexError(
f"Boolean index has wrong length: {bn} not {n}"
)
return self._column.apply_boolean_mask(arg)
raise NotImplementedError(f"Unknown indexer {type(arg)}")

Expand Down
32 changes: 30 additions & 2 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,6 @@ def test_dataframe_series_loc_multiindex(obj):

@pytest.mark.parametrize("nelem", [2, 5, 20, 100])
def test_series_iloc(nelem):

# create random cudf.Series
np.random.seed(12)
ps = pd.Series(np.random.sample(nelem))
Expand Down Expand Up @@ -1265,7 +1264,6 @@ def test_iloc_categorical_index(index):
)
@pytest.mark.parametrize("is_dataframe", [True, False])
def test_loc_datetime_index(sli, is_dataframe):

if is_dataframe is True:
pd_data = pd.DataFrame(
{"a": [1, 2, 3]},
Expand Down Expand Up @@ -1685,3 +1683,33 @@ def test_loc_single_row_from_slice():
pdf = pd.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]}).set_index("a")
df = cudf.from_pandas(pdf)
assert_eq(pdf.loc[5:10], df.loc[5:10])


@pytest.mark.parametrize("indexer", ["loc", "iloc"])
@pytest.mark.parametrize(
"mask",
[[False, True], [False, False, True, True, True]],
ids=["too-short", "too-long"],
)
def test_boolean_mask_wrong_length(indexer, mask):
s = pd.Series([1, 2, 3, 4])

indexee = getattr(s, indexer)
with pytest.raises(IndexError):
indexee[mask]

c = cudf.from_pandas(s)
indexee = getattr(c, indexer)
with pytest.raises(IndexError):
indexee[mask]


@pytest.mark.parametrize("indexer", ["loc", "iloc"])
def test_boolean_mask_columns(indexer):
df = pd.DataFrame(np.zeros((3, 3)))
cdf = cudf.from_pandas(df)

expect = getattr(df, indexer)[:, [True, False, True]]
got = getattr(cdf, indexer)[:, [True, False, True]]

assert_eq(expect, got)

0 comments on commit ae4188d

Please sign in to comment.