Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle some corner-cases in indexing with boolean masks #13402

Merged
merged 3 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import pandas as pd
from packaging.version import Version
from pandas.api.types import is_bool

import cudf
from cudf.core import column
Expand Down Expand Up @@ -359,7 +360,8 @@ def get_labels_by_index(self, index: Any) -> tuple:

Parameters
----------
index : integer, integer slice, or list-like of integers
index : integer, integer slice, boolean mask,
or list-like of integers
The column indexes.

Returns
Expand All @@ -371,6 +373,18 @@ def get_labels_by_index(self, index: Any) -> tuple:
return self.names[start:stop:step]
elif pd.api.types.is_integer(index):
return (self.names[index],)
elif (bn := len(index)) > 0 and all(map(is_bool, index)):
if bn != (n := len(self.names)):
raise IndexError(
f"Boolean mask has wrong length: {bn} not {n}"
)
if isinstance(index, (pd.Series, cudf.Series)):
shwina marked this conversation as resolved.
Show resolved Hide resolved
# Don't allow iloc indexing with series
raise NotImplementedError(
"Cannot use Series object for mask iloc indexing"
)
# TODO: Doesn't handle on-device columns
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the old code didn't either...

return tuple(n for n, keep in zip(self.names, index) if keep)
else:
return tuple(self.names[i] for i in index)

Expand All @@ -381,7 +395,8 @@ def select_by_index(self, index: Any) -> ColumnAccessor:

Parameters
----------
key : integer, integer slice, or list-like of integers
key : integer, integer slice, boolean mask,
or list-like of integers

Returns
-------
Expand Down Expand Up @@ -462,7 +477,21 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
self._clear_cache()

def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
data = {k: self._grouped_data[k] for k in key}
# Might be a generator
key = tuple(key)
# Special-casing for boolean mask
if (bn := len(key)) > 0 and all(map(is_bool, key)):
if bn != (n := len(self.names)):
raise IndexError(
f"Boolean mask has wrong length: {bn} not {n}"
)
data = dict(
item
for item, keep in zip(self._grouped_data.items(), key)
if keep
)
else:
data = {k: self._grouped_data[k] for k in key}
if self.multiindex:
data = _to_flat_dict(data)
return self.__class__(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2906,7 +2906,8 @@ def _apply_boolean_mask(self, boolean_mask):
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

if (bn := len(boolean_mask)) != (n := len(self)):
raise IndexError(f"Boolean mask has wrong length: {bn} not {n}")
return self._from_columns_like_self(
libcudf.stream_compaction.apply_boolean_mask(
list(self._index._columns + self._columns), boolean_mask
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
if is_integer_dtype(arg.dtype):
return self._column.take(arg)
if is_bool_dtype(arg.dtype):
if (bn := len(arg)) != (n := len(self)):
raise IndexError(
f"Boolean mask has wrong length: {bn} not {n}"
)
return self._column.apply_boolean_mask(arg)
raise NotImplementedError(f"Unknown indexer {type(arg)}")

Expand Down
60 changes: 58 additions & 2 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,6 @@ def test_dataframe_series_loc_multiindex(obj):

@pytest.mark.parametrize("nelem", [2, 5, 20, 100])
def test_series_iloc(nelem):

# create random cudf.Series
np.random.seed(12)
ps = pd.Series(np.random.sample(nelem))
Expand Down Expand Up @@ -1265,7 +1264,6 @@ def test_iloc_categorical_index(index):
)
@pytest.mark.parametrize("is_dataframe", [True, False])
def test_loc_datetime_index(sli, is_dataframe):

if is_dataframe is True:
pd_data = pd.DataFrame(
{"a": [1, 2, 3]},
Expand Down Expand Up @@ -1685,3 +1683,61 @@ def test_loc_single_row_from_slice():
pdf = pd.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]}).set_index("a")
df = cudf.from_pandas(pdf)
assert_eq(pdf.loc[5:10], df.loc[5:10])


@pytest.mark.parametrize("indexer", ["loc", "iloc"])
@pytest.mark.parametrize(
"mask",
[[False, True], [False, False, True, True, True]],
ids=["too-short", "too-long"],
)
def test_boolean_mask_wrong_length(indexer, mask):
s = pd.Series([1, 2, 3, 4])

indexee = getattr(s, indexer)
with pytest.raises(IndexError):
indexee[mask]

c = cudf.from_pandas(s)
indexee = getattr(c, indexer)
with pytest.raises(IndexError):
indexee[mask]


@pytest.mark.parametrize("indexer", ["loc", "iloc"])
def test_boolean_mask_columns(indexer):
df = pd.DataFrame(np.zeros((3, 3)))
cdf = cudf.from_pandas(df)
mask = [True, False, True]
expect = getattr(df, indexer)[:, mask]
got = getattr(cdf, indexer)[:, mask]

assert_eq(expect, got)


@pytest.mark.parametrize("indexer", ["loc", "iloc"])
@pytest.mark.parametrize(
"mask",
[[False, True], [False, False, True, True, True]],
ids=["too-short", "too-long"],
)
def test_boolean_mask_columns_wrong_length(indexer, mask):
df = pd.DataFrame(np.zeros((3, 3)))
cdf = cudf.from_pandas(df)

with pytest.raises(IndexError):
getattr(df, indexer)[:, mask]
with pytest.raises(IndexError):
getattr(cdf, indexer)[:, mask]


def test_boolean_mask_columns_iloc_series():
df = pd.DataFrame(np.zeros((3, 3)))
cdf = cudf.from_pandas(df)

mask = pd.Series([True, False, True], dtype=bool)
with pytest.raises(NotImplementedError):
df.iloc[:, mask]

with pytest.raises(NotImplementedError):
cdf.iloc[:, mask]