Skip to content

Commit

Permalink
Match Pandas logic for comparing two objects with nulls (#7490)
Browse files Browse the repository at this point in the history
Fixes #7066

Authors:
  - @brandon-b-miller

Approvers:
  - Ashwin Srinath (@shwina)
  - Christopher Harris (@cwharris)
  - Keith Kraus (@kkraus14)

URL: #7490
  • Loading branch information
brandon-b-miller authored Mar 16, 2021
1 parent 05bb2f0 commit 561f68a
Show file tree
Hide file tree
Showing 16 changed files with 182 additions and 88 deletions.
20 changes: 4 additions & 16 deletions python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ class BinaryOperation(IntEnum):
GENERIC_BINARY = (
<underlying_type_t_binary_operator> binary_operator.GENERIC_BINARY
)
NULL_EQUALS = (
<underlying_type_t_binary_operator> binary_operator.NULL_EQUALS
)


cdef binaryop_v_v(Column lhs, Column rhs,
Expand Down Expand Up @@ -154,17 +157,6 @@ cdef binaryop_s_v(DeviceScalar lhs, Column rhs,
return Column.from_unique_ptr(move(c_result))


def handle_null_for_string_column(Column input_col, op):
if op in ('eq', 'lt', 'le', 'gt', 'ge'):
return replace_nulls(input_col, DeviceScalar(False, 'bool'))

elif op == 'ne':
return replace_nulls(input_col, DeviceScalar(True, 'bool'))

# Nothing needs to be done
return input_col


def binaryop(lhs, rhs, op, dtype):
"""
Dispatches a binary op call to the appropriate libcudf function:
Expand Down Expand Up @@ -205,11 +197,7 @@ def binaryop(lhs, rhs, op, dtype):
c_op,
c_dtype
)

if is_string_col is True:
return handle_null_for_string_column(result, op.name.lower())
else:
return result
return result


def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/cpp/binaryop.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
GREATER "cudf::binary_operator::GREATER"
LESS_EQUAL "cudf::binary_operator::LESS_EQUAL"
GREATER_EQUAL "cudf::binary_operator::GREATER_EQUAL"
NULL_EQUALS "cudf::binary_operator::NULL_EQUALS"
BITWISE_AND "cudf::binary_operator::BITWISE_AND"
BITWISE_OR "cudf::binary_operator::BITWISE_OR"
BITWISE_XOR "cudf::binary_operator::BITWISE_XOR"
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/reduce.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
return incol.dtype.type(0)
if reduction_op == 'product':
return incol.dtype.type(1)
if reduction_op == "any":
return False

return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)

Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,11 @@ def slice(
def binary_operator(
self, op: str, rhs, reflect: bool = False
) -> ColumnBase:
if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"):
if not (self.ordered and rhs.ordered) and op not in (
"eq",
"ne",
"NULL_EQUALS",
):
if op in ("lt", "gt", "le", "ge"):
raise TypeError(
"Unordered Categoricals can only compare equality or not"
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,11 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
if check_dtypes:
if self.dtype != other.dtype:
return False
return (self == other).min()
null_equals = self._null_equals(other)
return null_equals.all()

def _null_equals(self, other: ColumnBase) -> ColumnBase:
return self.binary_operator("NULL_EQUALS", other)

def all(self) -> bool:
return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def binary_operator(
if isinstance(rhs, cudf.DateOffset):
return binop_offset(self, rhs, op)
lhs, rhs = self, rhs
if op in ("eq", "ne", "lt", "gt", "le", "ge"):
if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"):
out_dtype = np.dtype(np.bool_) # type: Dtype
elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype(
Expand Down
13 changes: 9 additions & 4 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,16 +700,21 @@ def _numeric_column_binop(
if reflect:
lhs, rhs = rhs, lhs

is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
is_op_comparison = op in [
"lt",
"gt",
"le",
"ge",
"eq",
"ne",
"NULL_EQUALS",
]

if is_op_comparison:
out_dtype = "bool"

out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)

if is_op_comparison:
out = out.fillna(op == "ne")

return out


Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,6 @@ def cat(self, others=None, sep=None, na_rep=None):
3 dD
dtype: object
"""

if sep is None:
sep = ""

Expand Down Expand Up @@ -5109,7 +5108,7 @@ def binary_operator(
if isinstance(rhs, (StringColumn, str, cudf.Scalar)):
if op == "add":
return cast("column.ColumnBase", lhs.str().cat(others=rhs))
elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"):
return _string_column_binop(self, rhs, op=op, out_dtype="bool")

raise TypeError(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def binary_operator(

if op in ("eq", "ne"):
out_dtype = self._binary_op_eq_ne(rhs)
elif op in ("lt", "gt", "le", "ge"):
elif op in ("lt", "gt", "le", "ge", "NULL_EQUALS"):
out_dtype = self._binary_op_lt_gt_le_ge(rhs)
elif op == "mul":
out_dtype = self._binary_op_mul(rhs)
Expand Down
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6031,7 +6031,6 @@ def isin(self, values):
falcon True True
dog False False
"""

if isinstance(values, dict):

result_df = DataFrame()
Expand All @@ -6051,14 +6050,15 @@ def isin(self, values):
values = values.reindex(self.index)

result = DataFrame()

# TODO: propagate nulls through isin
# https://github.com/rapidsai/cudf/issues/7556
for col in self._data.names:
if isinstance(
self[col]._column, cudf.core.column.CategoricalColumn
) and isinstance(
values._column, cudf.core.column.CategoricalColumn
):
res = self._data[col] == values._column
res = (self._data[col] == values._column).fillna(False)
result[col] = res
elif (
isinstance(
Expand All @@ -6073,7 +6073,9 @@ def isin(self, values):
):
result[col] = utils.scalar_broadcast_to(False, len(self))
else:
result[col] = self._data[col] == values._column
result[col] = (self._data[col] == values._column).fillna(
False
)

result.index = self.index
return result
Expand All @@ -6083,7 +6085,9 @@ def isin(self, values):
result = DataFrame()
for col in self._data.names:
if col in values.columns:
result[col] = self._data[col] == values[col]._column
result[col] = (
self._data[col] == values[col]._column
).fillna(False)
else:
result[col] = utils.scalar_broadcast_to(False, len(self))
result.index = self.index
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1566,10 +1566,7 @@ def _apply_boolean_mask(self, boolean_mask):
rows corresponding to `False` is dropped
"""
boolean_mask = as_column(boolean_mask)
if boolean_mask.has_nulls:
raise ValueError(
"cannot mask with boolean_mask containing null values"
)

result = self.__class__._from_table(
libcudf.stream_compaction.apply_boolean_mask(
self, as_column(boolean_mask)
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3120,8 +3120,10 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
"bool_only parameter is not implemented yet"
)

if self.empty:
return False
skipna = False if skipna is None else skipna

if skipna is False and self.has_nulls:
return True

if skipna:
result_series = self.nans_to_nulls()
Expand Down
Loading

0 comments on commit 561f68a

Please sign in to comment.