rapidsai · rapids-bot · May 17, 2023 · May 10, 2023 · May 10, 2023 · May 10, 2023
@@ -569,18 +569,21 @@ def read_parquet(
     return _apply_post_filters(df, filters)
 
 
-def _handle_in(column, value):
+def _handle_in(column, value, *, negate):
     if not isinstance(value, (list, set, tuple)):
         raise TypeError(
-            "Value of 'in' filter must be a " "list, set, or tuple."
+            "Value of 'in' or 'not in' filter must be a list, set, or tuple."
         )
-    return reduce(operator.or_, (operator.eq(column, v) for v in value))
+    if negate:  # "not in": column != v must hold for every v
+        return reduce(operator.and_, (operator.ne(column, v) for v in value))
+    else:  # "in": column == v must hold for at least one v
+        return reduce(operator.or_, (operator.eq(column, v) for v in value))
 
 
 def _handle_is(column, value, *, negate):
     if value not in {np.nan, None}:
         raise TypeError(
-            "Value of 'is' or 'is not' filter " "must be np.nan or None."
+            "Value of 'is' or 'is not' filter must be np.nan or None."
         )
     return ~column.isna() if negate else column.isna()
 
@@ -605,7 +608,8 @@ def _apply_post_filters(df, filters):
         "<=": operator.le,
         ">": operator.gt,
         ">=": operator.ge,
-        "in": _handle_in,
+        "in": partial(_handle_in, negate=False),
+        "not in": partial(_handle_in, negate=True),
         "is": partial(_handle_is, negate=False),
         "is not": partial(_handle_is, negate=True),
     }

@@ -545,6 +545,7 @@ def test_parquet_read_filtered_multiple_files(tmpdir):
         ([("y", "==", "c"), ("x", ">", 8)], 0),
         ([("y", "==", "c"), ("x", ">=", 5)], 1),
         ([[("y", "==", "c")], [("x", "<", 3)]], 5),
+        ([[("x", "not in", (0, 9)), ("z", "not in", (4, 5))]], 6),
         ([[("y", "==", "c")], [("x", "in", (0, 9)), ("z", "in", (0, 9))]], 4),
         ([[("x", "==", 0)], [("x", "==", 1)], [("x", "==", 2)]], 3),
         ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1),

@@ -254,6 +254,41 @@ def test_filters(tmpdir):
     assert not len(c)
 
 
+@pytest.mark.parametrize("numeric", [True, False])
+@pytest.mark.parametrize("null", [np.nan, None])
+def test_isna_filters(tmpdir, null, numeric):
+    # Round-trip a frame with nulls through parquet, then filter on them.
+    tmp_path = str(tmpdir)
+    df = pd.DataFrame(
+        {
+            "x": range(10),
+            "y": list("aabbccddee"),
+            "i": [0] * 4 + [np.nan] * 2 + [0] * 4,  # NaN at rows 4-5
+            "j": [""] * 4 + [None] * 2 + [""] * 4,  # None at rows 4-5
+        }
+    )
+    ddf = dd.from_pandas(df, npartitions=5)
+    assert ddf.npartitions == 5
+    ddf.to_parquet(tmp_path, engine="pyarrow")
+
+    # Test "is": only the two missing-value rows (x == 4, 5) should remain
+    col = "i" if numeric else "j"
+    filters = [(col, "is", null)]
+    out = dask_cudf.read_parquet(
+        tmp_path, filters=filters, split_row_groups=True
+    )
+    assert len(out) == 2
+    assert list(out.x.compute().values) == [4, 5]
+
+    # Test "is not": the eight rows without a missing value should remain
+    filters = [(col, "is not", null)]
+    out = dask_cudf.read_parquet(
+        tmp_path, filters=filters, split_row_groups=True
+    )
+    assert len(out) == 8
+    assert list(out.x.compute().values) == [0, 1, 2, 3, 6, 7, 8, 9]
+
+
 def test_filters_at_row_group_level(tmpdir):
 
     tmp_path = str(tmpdir)