Skip to content

Commit

Permalink
Refactor isin implementations (#10165)
Browse files: browse the repository at this point in the history
This PR fixes a number of error cases around the implementation of `isin`, particularly involving categorical dtypes and index alignment when called on a `DataFrame`. It also makes significant changes to simplify and improve the performance of `DataFrame.isin`, resulting in a 10-40% speedup when called with a `DataFrame` or `Series` as the argument (depending on the data sizes).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/10165
  • Loading branch information
vyasr authored Feb 18, 2022
1 parent a362c65 commit 858ab83
Show file tree
Hide file tree
Showing 7 changed files with 354 additions and 371 deletions.
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,6 +1347,16 @@ def isin(self, values):
array([ True, False, False])
"""

# To match pandas behavior, even though only list-like objects are
# supposed to be passed, only scalars throw errors. Other types (like
# dicts) just transparently return False (see the implementation of
# ColumnBase.isin).
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a {type(values).__name__}"
)

return self._values.isin(values).values

@classmethod
Expand Down
140 changes: 70 additions & 70 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from cudf.core import column, df_protocol, reshape
from cudf.core.abc import Serializable
from cudf.core.column import (
CategoricalColumn,
as_column,
build_categorical_column,
build_column,
Expand Down Expand Up @@ -5169,82 +5170,81 @@ def isin(self, values):
falcon True True
dog False False
"""
if isinstance(values, dict):

result_df = DataFrame()

for col in self._data.names:
if col in values:
val = values[col]
result_df[col] = self._data[col].isin(val)
else:
result_df[col] = column.full(
size=len(self), fill_value=False, dtype="bool"
)

result_df.index = self.index
return result_df
elif isinstance(values, Series):
# TODO: propagate nulls through isin
# https://github.com/rapidsai/cudf/issues/7556

fill_value = cudf.Scalar(False)

def make_false_column_like_self():
return column.full(len(self), fill_value, "bool")

# Preprocess different input types into a mapping from column names to
# a list of values to check.
result = {}
if isinstance(values, IndexedFrame):
# Note: In the case where values is a Series, computing some
# information about the values column outside the loop may result
# in performance gains. However, since categorical conversion
# depends on the current column in the loop, using the correct
# precomputed variables inside the loop requires nontrivial logic.
# This optimization could be attempted if `isin` ever becomes a
# bottleneck.
values = values.reindex(self.index)
other_cols = (
values._data
if isinstance(values, DataFrame)
else {name: values._column for name in self._data}
)
for col, self_col in self._data.items():
if col in other_cols:
other_col = other_cols[col]
self_is_cat = isinstance(self_col, CategoricalColumn)
other_is_cat = isinstance(other_col, CategoricalColumn)

if self_is_cat != other_is_cat:
# It is valid to compare the levels of a categorical
# column to a non-categorical column.
if self_is_cat:
self_col = self_col._get_decategorized_column()
else:
other_col = other_col._get_decategorized_column()

result = DataFrame()
# TODO: propagate nulls through isin
# https://github.com/rapidsai/cudf/issues/7556
for col in self._data.names:
if isinstance(
self[col]._column, cudf.core.column.CategoricalColumn
) and isinstance(
values._column, cudf.core.column.CategoricalColumn
):
res = (self._data[col] == values._column).fillna(False)
result[col] = res
elif (
isinstance(
self[col]._column, cudf.core.column.CategoricalColumn
)
or np.issubdtype(self[col].dtype, cudf.dtype("object"))
) or (
isinstance(
values._column, cudf.core.column.CategoricalColumn
)
or np.issubdtype(values.dtype, cudf.dtype("object"))
):
result[col] = utils.scalar_broadcast_to(False, len(self))
# We use the type checks from _before_ the conversion
# because if only one was categorical then it's already
# been converted and we have to check if they're strings.
if self_is_cat and other_is_cat:
self_is_str = other_is_str = False
else:
# These checks must happen after the conversions above
# since numpy can't handle categorical dtypes.
self_is_str = is_string_dtype(self_col.dtype)
other_is_str = is_string_dtype(other_col.dtype)

if self_is_str != other_is_str:
# Strings can't compare to anything else.
result[col] = make_false_column_like_self()
else:
result[col] = (self_col == other_col).fillna(False)
else:
result[col] = (self._data[col] == values._column).fillna(
False
)

result.index = self.index
return result
elif isinstance(values, DataFrame):
values = values.reindex(self.index)

result = DataFrame()
for col in self._data.names:
if col in values.columns:
result[col] = (
self._data[col] == values[col]._column
).fillna(False)
result[col] = make_false_column_like_self()
elif is_dict_like(values):
for name, col in self._data.items():
if name in values:
result[name] = col.isin(values[name])
else:
result[col] = utils.scalar_broadcast_to(False, len(self))
result.index = self.index
return result
result[name] = make_false_column_like_self()
elif is_list_like(values):
for name, col in self._data.items():
result[name] = col.isin(values)
else:
if not is_list_like(values):
raise TypeError(
f"only list-like or dict-like objects are "
f"allowed to be passed to DataFrame.isin(), "
f"you passed a "
f"'{type(values).__name__}'"
)

result_df = DataFrame()
raise TypeError(
"only list-like or dict-like objects are "
"allowed to be passed to DataFrame.isin(), "
"you passed a "
f"'{type(values).__name__}'"
)

for col in self._data.names:
result_df[col] = self._data[col].isin(values)
result_df.index = self.index
return result_df
return DataFrame._from_data(result, self.index)

#
# Stats
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2658,14 +2658,17 @@ def isin(self, values):
dtype: bool
"""

# Even though only list-like objects are supposed to be passed, only
# scalars throw errors. Other types (like dicts) just transparently
# return False (see the implementation of ColumnBase.isin).
if is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a [{type(values).__name__}]"
)

return Series(
self._column.isin(values), index=self.index, name=self.name
return Series._from_data(
{self.name: self._column.isin(values)}, index=self.index
)

def unique(self):
Expand Down
Loading

0 comments on commit 858ab83

Please sign in to comment.