Add xfailing tests for many indexing corner cases

Each test is tagged with its upstream issue, which gives us observability over behaviour changes as we fix things.
rapidsai · Jun 6, 2023 · 46f17f7 · 46f17f7
1 parent 4b7b6d5
commit 46f17f7
Show file tree

Hide file tree

Showing 2 changed files with 373 additions and 1 deletion.
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2023, NVIDIA CORPORATION.
 
+from datetime import datetime
 from itertools import combinations
 
 import cupy
@@ -1741,3 +1742,279 @@ def test_boolean_mask_columns_iloc_series():
 
     with pytest.raises(NotImplementedError):
         cdf.iloc[:, mask]
+
+
+@pytest.mark.parametrize("index_type", ["single", "slice"])
+def test_loc_timestamp_8585(index_type):
+    start = pd.Timestamp(
+        datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")
+    )
+    end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M"))
+    timestamps = pd.date_range(start, end, periods=12)
+    value = np.random.normal(size=12)
+    df = pd.DataFrame(value, index=timestamps, columns=["value"])
+    cdf = cudf.from_pandas(df)
+    if index_type == "single":
+        index = pd.Timestamp(
+            datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")
+        )
+    elif index_type == "slice":
+        index = slice(start, end, None)
+    else:
+        raise ValueError("Invalid index type")
+    expect = df.loc[index]
+    actual = cdf.loc[index]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.parametrize(
+    "index_type",
+    [
+        "single",
+        pytest.param(
+            "slice",
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/8585"
+            ),
+        ),
+        pytest.param(
+            "date_range",
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/8585"
+            ),
+        ),
+    ],
+)
+def test_loc_multiindex_timestamp_8585(index_type):
+    start = pd.Timestamp(
+        datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M")
+    )
+    end = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M"))
+    timestamps = pd.date_range(start, end, periods=4)
+    labels = ["A", "B", "C"]
+    index = pd.MultiIndex.from_product(
+        [timestamps, labels], names=["timestamp", "label"]
+    )
+    value = np.random.normal(size=12)
+    df = pd.DataFrame(value, index=index, columns=["value"])
+    cdf = cudf.from_pandas(df)
+    start = pd.Timestamp(
+        datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M")
+    )
+    end = pd.Timestamp(datetime.strptime("2021-03-12 02:00", "%Y-%m-%d %H:%M"))
+    if index_type == "single":
+        index = pd.Timestamp(
+            datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")
+        )
+    elif index_type == "slice":
+        index = slice(start, end, None)
+    elif index_type == "date_range":
+        index = pd.date_range(start, end, periods=2)
+    else:
+        raise ValueError("Invalid index type")
+    expect = df.loc[index]
+    actual = cdf.loc[index]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/8693")
+def test_loc_8693():
+    s = pd.Series([1, 2, 3, 4], index=[0, 1, 1, 2])
+    cs = cudf.from_pandas(s)
+    expect = s.loc[1]
+    actual = cs.loc[1]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13268")
+@pytest.mark.parametrize(
+    "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"]
+)
+def test_loc_13268(indexer):
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+
+    expect = df.loc[indexer]
+    actual = cdf.loc[indexer]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(
+    reason="https://github.com/rapidsai/cudf/issues/13269 "
+    "and https://github.com/rapidsai/cudf/issues/13273"
+)
+def test_iloc_13269():
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+    # NOTE(review): despite "iloc" in the name, this exercises .loc
+    expect = df.loc[:, [0, 1, 0]]
+    actual = cdf.loc[:, [0, 1, 0]]
+    assert_eq(expect, actual)
+
+
+def test_loc_13270():
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+    expect = df.loc[:, [True, True]]
+    actual = cdf.loc[:, [True, True]]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13013")
+@pytest.mark.parametrize("indexer", [[1], [0, 2]])
+def test_iloc_13013(indexer):
+    s = pd.Series([0, 1, 2])
+    index = pd.Categorical(indexer)
+    expect = s.iloc[index]
+    c = cudf.from_pandas(s)
+    actual = c.iloc[index]
+    assert_eq(expect, actual)
+
+
+def test_iloc_13015():
+    s = pd.Series([0, 1, 2])
+    with pytest.raises(IndexError):
+        s.iloc[[True, False]]
+    c = cudf.from_pandas(s)
+    with pytest.raises(IndexError):
+        c.iloc[[True, False]]
+
+
+def test_iloc_13265():
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+    expect = df.iloc[:, [True, True]]
+    actual = cdf.iloc[:, [True, True]]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(
+    reason="https://github.com/rapidsai/cudf/issues/13266 "
+    "and https://github.com/rapidsai/cudf/issues/13273"
+)
+def test_iloc_13266():
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+
+    expect = df.iloc[:, [0, 1, 0]]
+    actual = cdf.iloc[:, [0, 1, 0]]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13267")
+@pytest.mark.parametrize(
+    "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"]
+)
+def test_iloc_13267(indexer):
+    df = pd.DataFrame(np.arange(4).reshape(2, 2))
+    cdf = cudf.from_pandas(df)
+
+    expect = df.iloc[indexer]
+    actual = cdf.iloc[indexer]
+    assert_eq(expect, actual)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12833")
+def test_loc_12833_unordered():
+    df = pd.DataFrame({"a": [1, 2, 3]}, index=[7, 0, 4])
+    cdf = cudf.from_pandas(df)
+
+    # Guard: confirm pandas itself still raises KeyError for this slice
+    with pytest.raises(KeyError):
+        df.loc[1:5]
+    # cudf is expected to raise the same error for the same slice
+    with pytest.raises(KeyError):
+        cdf.loc[1:5]
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13379")
+@pytest.mark.parametrize("index", [range(5), list(range(5))])
+def test_loc_13379_keyerror_missing(index):
+    df = pd.DataFrame({"a": index}, index=index)
+    cdf = cudf.from_pandas(df)
+    # Guard: confirm pandas itself still raises KeyError for this lookup
+    with pytest.raises(KeyError):
+        df.loc[[0, 5]]
+    # cudf is expected to raise the same error for the same lookup
+    with pytest.raises(KeyError):
+        cdf.loc[[0, 5]]
+
+
+class TestLocIndexWithOrder:
+    @pytest.fixture(params=["increasing", "decreasing", "neither"])
+    def order(self, request):
+        return request.param
+
+    @pytest.fixture(params=[-1, 1], ids=["reverse", "forward"])
+    def take_order(self, request):
+        return request.param
+
+    @pytest.fixture(params=["float", "int", "string"])
+    def dtype(self, request):
+        return request.param
+
+    @pytest.fixture
+    def index(self, order, dtype):
+        if dtype == "string":
+            index = ["a", "h", "f", "z"]
+        elif dtype == "int":
+            index = [-1, 10, 7, 14]
+        elif dtype == "float":
+            index = [-1.5, 7.10, 2.4, 11.2]
+        else:
+            raise ValueError(f"Unhandled index dtype {dtype}")
+        if order == "decreasing":
+            return sorted(index, reverse=True)
+        elif order == "increasing":
+            return sorted(index)
+        elif order == "neither":
+            return index
+        else:
+            raise ValueError(f"Unhandled index order {order}")
+
+    @pytest.fixture
+    def df(self, index):
+        return cudf.DataFrame({"a": range(len(index))}, index=index)
+
+    def test_loc_index_inindex_slice(self, df, take_order):
+        pdf = df.to_pandas()
+        lo = pdf.index[1]
+        hi = pdf.index[-2]
+        expect = pdf.loc[lo:hi:take_order]
+        actual = df.loc[lo:hi:take_order]
+        assert_eq(expect, actual)
+
+    def test_loc_index_inindex_subset(self, df, take_order):
+        pdf = df.to_pandas()
+        vals = [pdf.index[0], pdf.index[2]][::take_order]
+        expect = pdf.loc[vals]
+        actual = df.loc[vals]
+        assert_eq(expect, actual)
+
+    def test_loc_index_notinindex_slice(
+        self, request, df, order, dtype, take_order
+    ):
+        if not (order == "increasing" and dtype in {"int", "float"}):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="https://github.com/rapidsai/cudf/issues/12833"
+                )
+            )
+        pdf = df.to_pandas()
+        lo = pdf.index[1]
+        hi = pdf.index[-2]
+        if isinstance(lo, str):
+            lo = chr(ord(lo) - 1)
+            hi = chr(ord(hi) + 1)
+        else:
+            lo -= 1
+            hi += 1
+        if order == "neither":
+            with pytest.raises(KeyError):
+                pdf.loc[lo:hi:take_order]
+            with pytest.raises(KeyError):
+                df.loc[lo:hi:take_order]
+        else:
+            expect = pdf.loc[lo:hi:take_order]
+            actual = df.loc[lo:hi:take_order]
+            assert_eq(expect, actual)
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
@@ -153,7 +153,6 @@ def test_setitem_dataframe_series_inplace(df):
     ],
 )
 def test_series_set_equal_length_object_by_mask(replace_data):
-
     psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
     gsr = cudf.from_pandas(psr)
 
@@ -368,3 +367,99 @@ def test_setitem_str_trailing_null(n):
     assert s[0] == ""
     s[0] = "\x00"
     assert s[0] == "\x00"
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/7448")
+def test_iloc_setitem_7448():
+    index = pd.MultiIndex.from_product([(1, 2), (3, 4)])
+    expect = pd.Series([1, 2, 3, 4], index=index)  # pandas reference
+    actual = cudf.from_pandas(expect)  # cudf object under test
+    expect[(1, 3)] = 101
+    actual[(1, 3)] = 101
+    assert_eq(expect, actual)
+
+
+@pytest.mark.parametrize(
+    "value",
+    [
+        "7",
+        pytest.param(
+            ["7", "8"],
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/11298"
+            ),
+        ),
+    ],
+)
+def test_loc_setitem_string_11298(value):
+    df = pd.DataFrame({"a": ["a", "b", "c"]})
+    cdf = cudf.from_pandas(df)
+
+    df.loc[:1, "a"] = value
+
+    cdf.loc[:1, "a"] = value
+
+    assert_eq(df, cdf)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944")
+def test_loc_setitem_list_11944():
+    df = pd.DataFrame(
+        data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]}
+    )
+    cdf = cudf.from_pandas(df)
+    df.loc[df.a == "yes", "b"] = [["hello"]]
+    cdf.loc[cdf.a == "yes", "b"] = [["hello"]]  # mask built from cdf itself
+    assert_eq(df, cdf)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12504")
+def test_loc_setitem_extend_empty_12504():
+    df = pd.DataFrame(columns=["a"])
+    cdf = cudf.from_pandas(df)
+
+    df.loc[0] = [1]
+
+    cdf.loc[0] = [1]
+
+    assert_eq(df, cdf)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12505")
+def test_loc_setitem_extend_existing_12505():
+    df = pd.DataFrame({"a": [0]})
+    cdf = cudf.from_pandas(df)
+
+    df.loc[1] = 1
+
+    cdf.loc[1] = 1
+
+    assert_eq(df, cdf)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801")
+def test_loc_setitem_add_column_partial_12801():
+    df = pd.DataFrame({"a": [0, 1, 2]})
+    cdf = cudf.from_pandas(df)
+
+    df.loc[df.a < 2, "b"] = 1
+
+    cdf.loc[cdf.a < 2, "b"] = 1
+
+    assert_eq(df, cdf)
+
+
+@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13031")
+@pytest.mark.parametrize("other_index", [["1", "3", "2"], [1, 2, 3]])
+def test_loc_setitem_series_index_alignment_13031(other_index):
+    s = pd.Series([1, 2, 3], index=["1", "2", "3"])
+    other = pd.Series([5, 6, 7], index=other_index)
+
+    cs = cudf.from_pandas(s)
+    cother = cudf.from_pandas(other)
+
+    s.loc[["1", "3"]] = other
+
+    cs.loc[["1", "3"]] = cother
+
+    assert_eq(s, cs)