Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Match Pandas logic for comparing two objects with nulls #7490

Merged
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
9927d88
write a test
brandon-b-miller Feb 3, 2021
a707d78
remove code that fills nas in numerical and string
brandon-b-miller Feb 4, 2021
1bdf1fc
update test_string_series_compare
brandon-b-miller Feb 4, 2021
d5d97be
make a class that handles nulls for floats correctly
brandon-b-miller Feb 8, 2021
0c65c90
progress
brandon-b-miller Feb 9, 2021
138eaf8
finally have nan logic figured out
brandon-b-miller Feb 10, 2021
290d443
improve NullableFloatSeriesCompare
brandon-b-miller Feb 10, 2021
8d8b7d2
make it so we can compare a scalar and fix tests
brandon-b-miller Feb 10, 2021
0a591ec
binops tests pass
brandon-b-miller Feb 16, 2021
38ef833
discover issue 7385 thanks to my own mistake
brandon-b-miller Feb 16, 2021
e49d6c5
Merge branch 'branch-0.19' into fea-7066-null-comparisons
brandon-b-miller Feb 23, 2021
4e87f75
revert test
brandon-b-miller Feb 25, 2021
8b4e3a0
Merge branch 'branch-0.19' into fea-7066-null-comparisons
brandon-b-miller Mar 1, 2021
cbe80dd
cleanup binaryop.pyx
brandon-b-miller Mar 1, 2021
cc73371
remove my own comparison object
brandon-b-miller Mar 1, 2021
0416671
plumb and fix a few tests
brandon-b-miller Mar 1, 2021
3b87bfa
finally passing
brandon-b-miller Mar 1, 2021
5b0230c
fix test_operator_func_series_and_scalar_logical
brandon-b-miller Mar 1, 2021
e652106
update DataFrame.isin
brandon-b-miller Mar 1, 2021
636c03a
remove handle_null_for_string_column
brandon-b-miller Mar 3, 2021
ea66fe9
style
brandon-b-miller Mar 3, 2021
def3bd7
cython style
brandon-b-miller Mar 3, 2021
9f0e121
fix more bugs
brandon-b-miller Mar 7, 2021
18c0eab
fix small bugs
brandon-b-miller Mar 8, 2021
470975b
xfail tests for now
brandon-b-miller Mar 9, 2021
fbed297
remove probably legacy code
brandon-b-miller Mar 9, 2021
7f7c345
correct previous commit
brandon-b-miller Mar 9, 2021
c6150b7
add checking code back into string for now
brandon-b-miller Mar 9, 2021
bd23b19
remove na filling in ColumnBase.__setitem__
brandon-b-miller Mar 9, 2021
d383ba0
add issue comments
brandon-b-miller Mar 11, 2021
952750c
fix any reduction
brandon-b-miller Mar 11, 2021
6d7ffda
style
brandon-b-miller Mar 12, 2021
cf725ef
Merge branch 'branch-0.19' into fea-7066-null-comparisons
brandon-b-miller Mar 12, 2021
417d220
updte test_equality_ops_index_mismatch
brandon-b-miller Mar 12, 2021
01876d2
plumb NULL_EQUALS
brandon-b-miller Mar 13, 2021
63da3bf
plumb NULL_EQUALS through binary_operator, add test
brandon-b-miller Mar 15, 2021
981457d
style
brandon-b-miller Mar 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,7 @@ def binaryop(lhs, rhs, op, dtype):
c_op,
c_dtype
)

if is_string_col is True:
return handle_null_for_string_column(result, op.name.lower())
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
else:
return result
return result


def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
if check_dtypes:
if self.dtype != other.dtype:
return False
return (self == other).min()
return (self == other).fillna(True).min()

def all(self) -> bool:
return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))
Expand Down
3 changes: 0 additions & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,9 +694,6 @@ def _numeric_column_binop(

out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)

if is_op_comparison:
out = out.fillna(op == "ne")

return out


Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5714,7 +5714,6 @@ def isin(self, values):
DataFrame of booleans showing whether each element in
the DataFrame is contained in values.
"""

if isinstance(values, dict):

result_df = DataFrame()
Expand All @@ -5741,7 +5740,7 @@ def isin(self, values):
) and isinstance(
values._column, cudf.core.column.CategoricalColumn
):
res = self._data[col] == values._column
res = (self._data[col] == values._column).fillna(False)
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
result[col] = res
elif (
isinstance(
Expand All @@ -5756,7 +5755,7 @@ def isin(self, values):
):
result[col] = utils.scalar_broadcast_to(False, len(self))
else:
result[col] = self._data[col] == values._column
result[col] = (self._data[col] == values._column).fillna(False)

result.index = self.index
return result
Expand All @@ -5766,7 +5765,7 @@ def isin(self, values):
result = DataFrame()
for col in self._data.names:
if col in values.columns:
result[col] = self._data[col] == values[col]._column
result[col] = (self._data[col] == values[col]._column).fillna(False)
else:
result[col] = utils.scalar_broadcast_to(False, len(self))
result.index = self.index
Expand Down
85 changes: 64 additions & 21 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
INTEGER_TYPES,
NUMERIC_TYPES,
TIMEDELTA_TYPES,
ALL_TYPES,
cudf_dtypes_to_pandas_dtypes
)

STRING_TYPES = {"str"}
Expand Down Expand Up @@ -205,13 +207,43 @@ def test_series_compare(cmpop, obj_class, dtype):
np.testing.assert_equal(result2.to_array(), cmpop(arr2, arr2))
np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2))

def _series_compare_nulls_typegen():
tests = []
tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
tests += list(product(STRING_TYPES, STRING_TYPES))

return tests

@pytest.mark.parametrize("cmpop", _cmpops)
@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen())
def test_series_compare_nulls(cmpop, dtypes):
ltype, rtype = dtypes

ldata = [1, 2, None, None, 5]
rdata = [2, 1, None, 4, None]

lser = Series(ldata, dtype=ltype)
rser = Series(rdata, dtype=rtype)

lmask = ~lser.isnull()
rmask = ~rser.isnull()

expect_mask = np.logical_and(lmask, rmask)
expect = cudf.Series([None] * 5, dtype='bool')
expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])

got = cmpop(lser, rser)
utils.assert_eq(expect, got)


@pytest.mark.parametrize(
"obj", [pd.Series(["a", "b", None, "d", "e", None]), "a"]
"obj", [pd.Series(["a", "b", None, "d", "e", None], dtype='string'), "a"]
)
@pytest.mark.parametrize("cmpop", _cmpops)
@pytest.mark.parametrize(
"cmp_obj", [pd.Series(["b", "a", None, "d", "f", None]), "a"]
"cmp_obj", [pd.Series(["b", "a", None, "d", "f", None], dtype='string'), "a"]
)
def test_string_series_compare(obj, cmpop, cmp_obj):

Expand All @@ -221,10 +253,12 @@ def test_string_series_compare(obj, cmpop, cmp_obj):
g_cmp_obj = cmp_obj
if isinstance(g_cmp_obj, pd.Series):
g_cmp_obj = Series.from_pandas(g_cmp_obj)

got = cmpop(g_obj, g_cmp_obj)
expected = cmpop(obj, cmp_obj)

if isinstance(expected, pd.Series):
expected = cudf.from_pandas(expected)

utils.assert_eq(expected, got)


Expand Down Expand Up @@ -694,28 +728,35 @@ def test_operator_func_series_and_scalar(
def test_operator_func_between_series_logical(
dtype, func, scalar_a, scalar_b, fill_value
):
gdf_series_a = Series([scalar_a]).astype(dtype)
gdf_series_b = Series([scalar_b]).astype(dtype)
pdf_series_a = gdf_series_a.to_pandas()
pdf_series_b = gdf_series_b.to_pandas()
pandas_dtype = cudf_dtypes_to_pandas_dtypes[np.dtype(dtype)]

gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype)
gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype)

pdf_series_a = gdf_series_a.to_pandas(nullable=True)
pdf_series_b = gdf_series_b.to_pandas(nullable=True)


gdf_series_result = getattr(gdf_series_a, func)(
gdf_series_b, fill_value=fill_value
)
pdf_series_result = getattr(pdf_series_a, func)(
pdf_series_b, fill_value=fill_value
)

if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]:
# cudf binary operations will return `None` when both left- and right-
# side values are `None`. It will return `np.nan` when either side is
# `np.nan`. As a consequence, when we convert our gdf => pdf during
# assert_eq, we get a pdf with dtype='object' (all inputs are none).
# to account for this, we use fillna.
gdf_series_result.fillna(func == "ne", inplace=True)

utils.assert_eq(pdf_series_result, gdf_series_result)

expect = pdf_series_result
got = gdf_series_result.to_pandas(nullable=True)

# If fill_value is np.nan, things break down a bit,
# because setting a NaN into a pandas nullable float
# array still gets transformed to <NA>. As such,
# pd_series_with_nulls.fillna(np.nan) has no effect.
if (
pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum()
) and np.isscalar(fill_value) and np.isnan(fill_value):
with pytest.raises(AssertionError):
utils.assert_eq(expect, got)
return
utils.assert_eq(expect, got)

@pytest.mark.parametrize("dtype", ["float32", "float64"])
@pytest.mark.parametrize("func", _operators_comparison)
Expand All @@ -729,8 +770,7 @@ def test_operator_func_series_and_scalar_logical(
gdf_series = utils.gen_rand_series(
dtype, 1000, has_nulls=has_nulls, stride=10000
)
pdf_series = gdf_series.to_pandas()

pdf_series = gdf_series.to_pandas(nullable=True)
gdf_series_result = getattr(gdf_series, func)(
cudf.Scalar(scalar) if use_cudf_scalar else scalar,
fill_value=fill_value,
Expand All @@ -739,7 +779,10 @@ def test_operator_func_series_and_scalar_logical(
scalar, fill_value=fill_value
)

utils.assert_eq(pdf_series_result, gdf_series_result)
expect = pdf_series_result
got = gdf_series_result.to_pandas(nullable=True)

utils.assert_eq(expect, got)


@pytest.mark.parametrize("func", _operators_arithmetic)
Expand Down