rapidsai · rapids-bot · Mar 16, 2021 · Feb 3, 2021 · Feb 4, 2021 · Feb 4, 2021
@@ -212,11 +212,7 @@ def binaryop(lhs, rhs, op, dtype):
             c_op,
             c_dtype
         )
-
-    if is_string_col is True:
-        return handle_null_for_string_column(result, op.name.lower())
-    else:
-        return result
+    return result
 
 
 def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):

@@ -172,7 +172,7 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
         if check_dtypes:
             if self.dtype != other.dtype:
                 return False
-        return (self == other).min()
+        return (self == other).fillna(True).min()
 
     def all(self) -> bool:
         return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))

@@ -694,9 +694,6 @@ def _numeric_column_binop(
 
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
-    if is_op_comparison:
-        out = out.fillna(op == "ne")
-
     return out
 
 

@@ -5714,7 +5714,6 @@ def isin(self, values):
             DataFrame of booleans showing whether each element in
             the DataFrame is contained in values.
         """
-
         if isinstance(values, dict):
 
             result_df = DataFrame()
@@ -5741,7 +5740,7 @@ def isin(self, values):
                 ) and isinstance(
                     values._column, cudf.core.column.CategoricalColumn
                 ):
-                    res = self._data[col] == values._column
+                    res = (self._data[col] == values._column).fillna(False)
                     result[col] = res
                 elif (
                     isinstance(
@@ -5756,7 +5755,7 @@ def isin(self, values):
                 ):
                     result[col] = utils.scalar_broadcast_to(False, len(self))
                 else:
-                    result[col] = self._data[col] == values._column
+                    result[col] = (self._data[col] == values._column).fillna(False)
 
             result.index = self.index
             return result
@@ -5766,7 +5765,7 @@ def isin(self, values):
             result = DataFrame()
             for col in self._data.names:
                 if col in values.columns:
-                    result[col] = self._data[col] == values[col]._column
+                    result[col] = (self._data[col] == values[col]._column).fillna(False)
                 else:
                     result[col] = utils.scalar_broadcast_to(False, len(self))
             result.index = self.index

@@ -22,6 +22,8 @@
     INTEGER_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
+    ALL_TYPES,
+    cudf_dtypes_to_pandas_dtypes
 )
 
 STRING_TYPES = {"str"}
@@ -205,13 +207,43 @@ def test_series_compare(cmpop, obj_class, dtype):
     np.testing.assert_equal(result2.to_array(), cmpop(arr2, arr2))
     np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2))
 
+def _series_compare_nulls_typegen():
+    tests = []
+    tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
+    tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
+    tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
+    tests += list(product(STRING_TYPES, STRING_TYPES))
+
+    return tests
+
+@pytest.mark.parametrize("cmpop", _cmpops)
+@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen())
+def test_series_compare_nulls(cmpop, dtypes):
+    ltype, rtype = dtypes
+
+    ldata = [1, 2, None, None, 5]
+    rdata = [2, 1, None, 4, None]
+
+    lser = Series(ldata, dtype=ltype)
+    rser = Series(rdata, dtype=rtype)
+
+    lmask = ~lser.isnull()
+    rmask = ~rser.isnull()
+
+    expect_mask = np.logical_and(lmask, rmask)
+    expect = cudf.Series([None] * 5, dtype='bool')
+    expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])
+
+    got = cmpop(lser, rser)
+    utils.assert_eq(expect, got)
+
 
 @pytest.mark.parametrize(
-    "obj", [pd.Series(["a", "b", None, "d", "e", None]), "a"]
+    "obj", [pd.Series(["a", "b", None, "d", "e", None], dtype='string'), "a"]
 )
 @pytest.mark.parametrize("cmpop", _cmpops)
 @pytest.mark.parametrize(
-    "cmp_obj", [pd.Series(["b", "a", None, "d", "f", None]), "a"]
+    "cmp_obj", [pd.Series(["b", "a", None, "d", "f", None], dtype='string'), "a"]
 )
 def test_string_series_compare(obj, cmpop, cmp_obj):
 
@@ -221,10 +253,12 @@ def test_string_series_compare(obj, cmpop, cmp_obj):
     g_cmp_obj = cmp_obj
     if isinstance(g_cmp_obj, pd.Series):
         g_cmp_obj = Series.from_pandas(g_cmp_obj)
-
     got = cmpop(g_obj, g_cmp_obj)
     expected = cmpop(obj, cmp_obj)
 
+    if isinstance(expected, pd.Series):
+        expected = cudf.from_pandas(expected)
+
     utils.assert_eq(expected, got)
 
 
@@ -694,28 +728,35 @@ def test_operator_func_series_and_scalar(
 def test_operator_func_between_series_logical(
     dtype, func, scalar_a, scalar_b, fill_value
 ):
-    gdf_series_a = Series([scalar_a]).astype(dtype)
-    gdf_series_b = Series([scalar_b]).astype(dtype)
-    pdf_series_a = gdf_series_a.to_pandas()
-    pdf_series_b = gdf_series_b.to_pandas()
+    pandas_dtype = cudf_dtypes_to_pandas_dtypes[np.dtype(dtype)]
+
+    gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype)
+    gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype)
+
+    pdf_series_a = gdf_series_a.to_pandas(nullable=True)
+    pdf_series_b = gdf_series_b.to_pandas(nullable=True)
+
 
     gdf_series_result = getattr(gdf_series_a, func)(
         gdf_series_b, fill_value=fill_value
     )
     pdf_series_result = getattr(pdf_series_a, func)(
         pdf_series_b, fill_value=fill_value
     )
-
-    if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]:
-        # cudf binary operations will return `None` when both left- and right-
-        # side values are `None`. It will return `np.nan` when either side is
-        # `np.nan`. As a consequence, when we convert our gdf => pdf during
-        # assert_eq, we get a pdf with dtype='object' (all inputs are none).
-        # to account for this, we use fillna.
-        gdf_series_result.fillna(func == "ne", inplace=True)
-
-    utils.assert_eq(pdf_series_result, gdf_series_result)
-
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    # If fill_value is np.nan, things break down a bit,
+    # because setting a NaN into a pandas nullable float
+    # array still gets transformed to <NA>. As such,
+    # pd_series_with_nulls.fillna(np.nan) has no effect.
+    if (
+        pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum()
+    ) and np.isscalar(fill_value) and np.isnan(fill_value):
+        with pytest.raises(AssertionError):
+            utils.assert_eq(expect, got)
+        return
+    utils.assert_eq(expect, got)
 
 @pytest.mark.parametrize("dtype", ["float32", "float64"])
 @pytest.mark.parametrize("func", _operators_comparison)
@@ -729,8 +770,7 @@ def test_operator_func_series_and_scalar_logical(
     gdf_series = utils.gen_rand_series(
         dtype, 1000, has_nulls=has_nulls, stride=10000
     )
-    pdf_series = gdf_series.to_pandas()
-
+    pdf_series = gdf_series.to_pandas(nullable=True)
     gdf_series_result = getattr(gdf_series, func)(
         cudf.Scalar(scalar) if use_cudf_scalar else scalar,
         fill_value=fill_value,
@@ -739,7 +779,10 @@ def test_operator_func_series_and_scalar_logical(
         scalar, fill_value=fill_value
     )
 
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    utils.assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)