Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor isin implementations #10165

Merged
merged 12 commits into from
Feb 18, 2022
9 changes: 9 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,15 @@ def isin(self, values):
array([ True, False, False])
"""

# Even though only list-like objects are supposed to be passed, only
# scalars raise an error. Other types (like dicts) do not raise and
# simply return False (see the implementation of ColumnBase.isin).
vyasr marked this conversation as resolved.
Show resolved Hide resolved
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a [{type(values).__name__}]"
)

return self._values.isin(values).values

@classmethod
Expand Down
142 changes: 72 additions & 70 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from cudf.core import column, df_protocol, reshape
from cudf.core.abc import Serializable
from cudf.core.column import (
CategoricalColumn,
as_column,
build_categorical_column,
build_column,
Expand Down Expand Up @@ -5040,82 +5041,83 @@ def isin(self, values):
falcon True True
dog False False
"""
if isinstance(values, dict):

result_df = DataFrame()

for col in self._data.names:
if col in values:
val = values[col]
result_df[col] = self._data[col].isin(val)
else:
result_df[col] = column.full(
size=len(self), fill_value=False, dtype="bool"
)

result_df.index = self.index
return result_df
elif isinstance(values, Series):
# TODO: propagate nulls through isin
# https://github.com/rapidsai/cudf/issues/7556

def make_false_column_like_self():
return column.full(size=len(self), fill_value=False, dtype="bool")
vyasr marked this conversation as resolved.
Show resolved Hide resolved

# Preprocess different input types into a mapping from column names to
# a list of values to check.
result = {}
if isinstance(values, (Series, DataFrame)):
vyasr marked this conversation as resolved.
Show resolved Hide resolved
obj_dtype = cudf.dtype("object")

# Note: In the case where values is a Series, computing some
# information about the values column outside the loop may result
# in performance gains. However, since categorical conversion
# depends on the current column in the loop, using the correct
# precomputed variables inside the loop requires nontrivial logic.
# This optimization could be attempted if `isin` ever becomes a
# bottleneck.
values = values.reindex(self.index)
other_cols = (
values._data
if isinstance(values, DataFrame)
else {name: values._column for name in self._data}
)
for col, self_col in self._data.items():
if col in other_cols:
other_col = other_cols[col]
self_is_cat = isinstance(self_col, CategoricalColumn)
other_is_cat = isinstance(other_col, CategoricalColumn)

if self_is_cat != other_is_cat:
# It is valid to compare the levels of a categorical
# column to a non-categorical column.
if self_is_cat:
self_col = self_col._get_decategorized_column()
else:
other_col = other_col._get_decategorized_column()

result = DataFrame()
# TODO: propagate nulls through isin
# https://github.com/rapidsai/cudf/issues/7556
for col in self._data.names:
if isinstance(
self[col]._column, cudf.core.column.CategoricalColumn
) and isinstance(
values._column, cudf.core.column.CategoricalColumn
):
res = (self._data[col] == values._column).fillna(False)
result[col] = res
elif (
isinstance(
self[col]._column, cudf.core.column.CategoricalColumn
)
or np.issubdtype(self[col].dtype, cudf.dtype("object"))
) or (
isinstance(
values._column, cudf.core.column.CategoricalColumn
)
or np.issubdtype(values.dtype, cudf.dtype("object"))
):
result[col] = utils.scalar_broadcast_to(False, len(self))
else:
result[col] = (self._data[col] == values._column).fillna(
False
)

result.index = self.index
return result
elif isinstance(values, DataFrame):
values = values.reindex(self.index)
# Use the categorical checks computed _before_ the conversion above:
# if exactly one of the columns was categorical, it has already been
# decategorized, so we still have to check whether they are strings.
if self_is_cat and other_is_cat:
self_is_obj = other_is_obj = False
else:
# These checks must happen after the conversions above
# since numpy can't handle categorical dtypes.
self_is_obj = np.issubdtype(self_col.dtype, obj_dtype)
vyasr marked this conversation as resolved.
Show resolved Hide resolved
other_is_obj = np.issubdtype(
other_col.dtype, obj_dtype
)

result = DataFrame()
for col in self._data.names:
if col in values.columns:
result[col] = (
self._data[col] == values[col]._column
).fillna(False)
if self_is_obj != other_is_obj:
# Strings can only be compared with other strings.
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
result[col] = make_false_column_like_self()
else:
result[col] = (self_col == other_col).fillna(False)
else:
result[col] = utils.scalar_broadcast_to(False, len(self))
result.index = self.index
return result
result[col] = make_false_column_like_self()
elif is_dict_like(values):
for name, col in self._data.items():
if name in values:
result[name] = col.isin(values[name])
else:
result[name] = make_false_column_like_self()
elif is_list_like(values):
for name, col in self._data.items():
result[name] = col.isin(values)
else:
if not is_list_like(values):
raise TypeError(
f"only list-like or dict-like objects are "
f"allowed to be passed to DataFrame.isin(), "
f"you passed a "
f"'{type(values).__name__}'"
)

result_df = DataFrame()
raise TypeError(
"only list-like or dict-like objects are "
"allowed to be passed to DataFrame.isin(), "
"you passed a "
f"'{type(values).__name__}'"
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
)

for col in self._data.names:
result_df[col] = self._data[col].isin(values)
result_df.index = self.index
return result_df
return DataFrame._from_data(result, self.index)

#
# Stats
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,14 +2638,17 @@ def isin(self, values):
dtype: bool
"""

# Even though only list-like objects are supposed to be passed, only
# scalars raise an error. Other types (like dicts) do not raise and
# simply return False (see the implementation of ColumnBase.isin).
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a [{type(values).__name__}]"
)

return Series(
self._column.isin(values), index=self.index, name=self.name
return Series._from_data(
{self.name: self._column.isin(values)}, index=self.index
)

def unique(self):
Expand Down
Loading