Skip to content

Commit

Permalink
Add xfailing tests for many indexing corner cases
Browse files Browse the repository at this point in the history
This gives us observability over changes as we fix things.
  • Loading branch information
wence- committed Jun 6, 2023
1 parent 4b7b6d5 commit 46f17f7
Show file tree
Hide file tree
Showing 2 changed files with 373 additions and 1 deletion.
277 changes: 277 additions & 0 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

from datetime import datetime
from itertools import combinations

import cupy
Expand Down Expand Up @@ -1741,3 +1742,279 @@ def test_boolean_mask_columns_iloc_series():

with pytest.raises(NotImplementedError):
cdf.iloc[:, mask]


@pytest.mark.parametrize("index_type", ["single", "slice"])
def test_loc_timestamp_8585(index_type):
    """Compare ``.loc`` on a datetime index between pandas and cudf.

    Exercises a single-timestamp key and a start/end timestamp slice
    against a 12-row, single-column frame.
    """
    fmt = "%Y-%m-%d %H:%M"
    first = pd.Timestamp(datetime.strptime("2021-03-12 00:00", fmt))
    last = pd.Timestamp(datetime.strptime("2021-03-12 11:00", fmt))

    pdf = pd.DataFrame(
        np.random.normal(size=12),
        index=pd.date_range(first, last, periods=12),
        columns=["value"],
    )
    gdf = cudf.from_pandas(pdf)

    if index_type == "single":
        key = pd.Timestamp(datetime.strptime("2021-03-12 03:00", fmt))
    elif index_type == "slice":
        key = slice(first, last, None)
    else:
        raise ValueError("Invalid index type")

    assert_eq(pdf.loc[key], gdf.loc[key])


@pytest.mark.parametrize(
    "index_type",
    [
        "single",
        pytest.param(
            "slice",
            marks=pytest.mark.xfail(
                reason="https://github.com/rapidsai/cudf/issues/8585"
            ),
        ),
        pytest.param(
            "date_range",
            marks=pytest.mark.xfail(
                reason="https://github.com/rapidsai/cudf/issues/8585"
            ),
        ),
    ],
)
def test_loc_multiindex_timestamp_8585(index_type):
    """Compare ``.loc`` on a (timestamp, label) MultiIndex frame.

    The key is a single timestamp, a timestamp slice, or a
    ``pd.date_range``; the latter two are marked as expected failures.
    """

    def stamp(text):
        # Parse via datetime.strptime and wrap in pd.Timestamp.
        return pd.Timestamp(datetime.strptime(text, "%Y-%m-%d %H:%M"))

    # 4 timestamps x 3 labels -> 12 rows.
    hours = pd.date_range(
        stamp("2021-03-12 00:00"), stamp("2021-03-12 03:00"), periods=4
    )
    multi = pd.MultiIndex.from_product(
        [hours, ["A", "B", "C"]], names=["timestamp", "label"]
    )
    pdf = pd.DataFrame(
        np.random.normal(size=12), index=multi, columns=["value"]
    )
    gdf = cudf.from_pandas(pdf)

    # Bounds used by the range-like keys.
    lower = stamp("2021-03-12 01:00")
    upper = stamp("2021-03-12 02:00")
    if index_type == "single":
        key = stamp("2021-03-12 03:00")
    elif index_type == "slice":
        key = slice(lower, upper, None)
    elif index_type == "date_range":
        key = pd.date_range(lower, upper, periods=2)
    else:
        raise ValueError("Invalid index type")

    assert_eq(pdf.loc[key], gdf.loc[key])


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/8693")
def test_loc_8693():
    """``Series.loc`` with a label that occurs twice in the index."""
    pser = pd.Series([1, 2, 3, 4], index=[0, 1, 1, 2])
    gser = cudf.from_pandas(pser)
    assert_eq(pser.loc[1], gser.loc[1])


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13268")
@pytest.mark.parametrize(
    "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"]
)
def test_loc_13268(indexer):
    """``DataFrame.loc`` with an Ellipsis in the row or column position."""
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.loc[indexer], gdf.loc[indexer])


@pytest.mark.xfail(
    reason="https://github.com/rapidsai/cudf/issues/13269 "
    "and https://github.com/rapidsai/cudf/issues/13273"
)
def test_iloc_13269():
    """``DataFrame.loc`` selecting a column label more than once.

    NOTE(review): the function name says ``iloc`` but the body exercises
    ``.loc``; the name is kept so existing test ids stay stable.
    """
    columns = [0, 1, 0]
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.loc[:, columns], gdf.loc[:, columns])


def test_loc_13270():
    """``DataFrame.loc`` with a boolean list as the column indexer."""
    mask = [True, True]
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.loc[:, mask], gdf.loc[:, mask])


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13013")
@pytest.mark.parametrize("indexer", [[1], [0, 2]])
def test_iloc_13013(indexer):
    """``Series.iloc`` with a ``pd.Categorical`` as the positional key."""
    pser = pd.Series([0, 1, 2])
    gser = cudf.from_pandas(pser)
    key = pd.Categorical(indexer)
    assert_eq(pser.iloc[key], gser.iloc[key])


def test_iloc_13015():
    """A boolean mask shorter than the series must raise ``IndexError``.

    Checked for both the pandas series and its cudf counterpart.
    """
    short_mask = [True, False]
    pser = pd.Series([0, 1, 2])
    gser = cudf.from_pandas(pser)
    for series in (pser, gser):
        with pytest.raises(IndexError):
            series.iloc[short_mask]


def test_iloc_13265():
    """``DataFrame.iloc`` with a boolean list as the column indexer."""
    mask = [True, True]
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.iloc[:, mask], gdf.iloc[:, mask])


@pytest.mark.xfail(
    reason="https://github.com/rapidsai/cudf/issues/13266 "
    "and https://github.com/rapidsai/cudf/issues/13273"
)
def test_iloc_13266():
    """``DataFrame.iloc`` selecting a column position more than once."""
    positions = [0, 1, 0]
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.iloc[:, positions], gdf.iloc[:, positions])


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13267")
@pytest.mark.parametrize(
    "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"]
)
def test_iloc_13267(indexer):
    """``DataFrame.iloc`` with an Ellipsis in the row or column position."""
    pdf = pd.DataFrame(np.arange(4).reshape(2, 2))
    gdf = cudf.from_pandas(pdf)
    assert_eq(pdf.iloc[indexer], gdf.iloc[indexer])


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12833")
def test_loc_12833_unordered():
    """Slicing an unsorted index by absent labels must raise ``KeyError``."""
    pdf = pd.DataFrame({"a": [1, 2, 3]}, index=[7, 0, 4])
    gdf = cudf.from_pandas(pdf)

    # Guard against pandas changing its behaviour before checking cudf.
    with pytest.raises(KeyError):
        pdf.loc[1:5]

    with pytest.raises(KeyError):
        gdf.loc[1:5]


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13379")
@pytest.mark.parametrize("index", [range(5), list(range(5))])
def test_loc_13379_keyerror_missing(index):
    """A label list containing a missing label must raise ``KeyError``."""
    labels = [0, 5]  # 5 is not one of the index labels 0..4
    pdf = pd.DataFrame({"a": index}, index=index)
    gdf = cudf.from_pandas(pdf)

    # Guard against pandas changing its behaviour before checking cudf.
    with pytest.raises(KeyError):
        pdf.loc[labels]

    with pytest.raises(KeyError):
        gdf.loc[labels]


class TestLocIndexWithOrder:
    """``.loc`` slicing and subsetting on indexes of varying sortedness.

    The fixtures combine three index orderings, three label dtypes and
    two step directions; each test compares cudf against pandas.
    """

    @pytest.fixture(params=["increasing", "decreasing", "neither"])
    def order(self, request):
        """Sortedness of the index under test."""
        return request.param

    @pytest.fixture(params=[-1, 1], ids=["reverse", "forward"])
    def take_order(self, request):
        """Step used when slicing or reversing the requested labels."""
        return request.param

    @pytest.fixture(params=["float", "int", "string"])
    def dtype(self, request):
        """Kind of label stored in the index."""
        return request.param

    @pytest.fixture
    def index(self, order, dtype):
        """Four labels of the requested dtype, arranged per ``order``."""
        if dtype == "float":
            labels = [-1.5, 7.10, 2.4, 11.2]
        elif dtype == "int":
            labels = [-1, 10, 7, 14]
        elif dtype == "string":
            labels = ["a", "h", "f", "z"]
        else:
            raise ValueError(f"Unhandled index dtype {dtype}")

        if order == "neither":
            # The literal lists above are deliberately unsorted.
            return labels
        if order == "increasing":
            return sorted(labels)
        if order == "decreasing":
            return sorted(labels, reverse=True)
        raise ValueError(f"Unhandled index order {order}")

    @pytest.fixture
    def df(self, index):
        """cudf frame with one column ``a`` counting rows over ``index``."""
        nrows = len(index)
        return cudf.DataFrame({"a": range(nrows)}, index=index)

    def test_loc_index_inindex_slice(self, df, take_order):
        """Slice between two labels that are present in the index."""
        pdf = df.to_pandas()
        bounds = slice(pdf.index[1], pdf.index[-2], take_order)
        assert_eq(pdf.loc[bounds], df.loc[bounds])

    def test_loc_index_inindex_subset(self, df, take_order):
        """Select a list of present labels, optionally reversed."""
        pdf = df.to_pandas()
        wanted = [pdf.index[0], pdf.index[2]]
        wanted = wanted[::take_order]
        assert_eq(pdf.loc[wanted], df.loc[wanted])

    def test_loc_index_notinindex_slice(
        self, request, df, order, dtype, take_order
    ):
        """Slice between bounds nudged one step outside two index labels.

        Every combination other than an increasing numeric index is
        marked as an expected failure.
        """
        if order != "increasing" or dtype not in {"int", "float"}:
            request.applymarker(
                pytest.mark.xfail(
                    reason="https://github.com/rapidsai/cudf/issues/12833"
                )
            )
        pdf = df.to_pandas()
        lo = pdf.index[1]
        hi = pdf.index[-2]
        # Move each bound one step outward from the label it came from.
        if isinstance(lo, str):
            lo, hi = chr(ord(lo) - 1), chr(ord(hi) + 1)
        else:
            lo, hi = lo - 1, hi + 1

        bounds = slice(lo, hi, take_order)
        if order != "neither":
            assert_eq(pdf.loc[bounds], df.loc[bounds])
            return

        # For the unsorted index both libraries are expected to raise.
        with pytest.raises(KeyError):
            pdf.loc[bounds]
        with pytest.raises(KeyError):
            df.loc[bounds]
97 changes: 96 additions & 1 deletion python/cudf/cudf/tests/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def test_setitem_dataframe_series_inplace(df):
],
)
def test_series_set_equal_length_object_by_mask(replace_data):

psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
gsr = cudf.from_pandas(psr)

Expand Down Expand Up @@ -368,3 +367,99 @@ def test_setitem_str_trailing_null(n):
assert s[0] == ""
s[0] = "\x00"
assert s[0] == "\x00"


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/7448")
def test_iloc_setitem_7448():
    """Assign through a tuple key on a MultiIndex series.

    The expected result comes from pandas and the actual result from the
    cudf copy of the same series, so the comparison checks cudf against
    pandas.
    """
    index = pd.MultiIndex.from_product([(1, 2), (3, 4)])
    # Build the reference in pandas: previously this was a cudf.Series
    # that was then handed to cudf.from_pandas, so no pandas object was
    # ever involved in the comparison.
    expect = pd.Series([1, 2, 3, 4], index=index)
    actual = cudf.from_pandas(expect)
    expect[(1, 3)] = 101
    actual[(1, 3)] = 101
    assert_eq(expect, actual)


@pytest.mark.parametrize(
    "value",
    [
        "7",
        pytest.param(
            ["7", "8"],
            marks=pytest.mark.xfail(
                reason="https://github.com/rapidsai/cudf/issues/11298"
            ),
        ),
    ],
)
def test_loc_setitem_string_11298(value):
    """Assign a string scalar or list into a string column via ``.loc``."""
    pdf = pd.DataFrame({"a": ["a", "b", "c"]})
    gdf = cudf.from_pandas(pdf)

    # Apply the same assignment to the pandas frame, then the cudf frame.
    for frame in (pdf, gdf):
        frame.loc[:1, "a"] = value

    assert_eq(pdf, gdf)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944")
def test_loc_setitem_list_11944():
    """Assign a nested list into a list column through a boolean mask.

    Each frame is indexed with a mask computed from that same frame, so
    the cudf assignment runs on a cudf mask instead of a pandas one.
    """
    df = pd.DataFrame(
        data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]}
    )
    cdf = cudf.from_pandas(df)
    df.loc[df.a == "yes", "b"] = [["hello"]]
    # Previously masked with the pandas series ``df.a == "yes"``.
    cdf.loc[cdf.a == "yes", "b"] = [["hello"]]
    assert_eq(df, cdf)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12504")
def test_loc_setitem_extend_empty_12504():
    """Assign a row to label 0 of a frame that starts with no rows."""
    pdf = pd.DataFrame(columns=["a"])
    gdf = cudf.from_pandas(pdf)

    # Apply the same assignment to the pandas frame, then the cudf frame.
    for frame in (pdf, gdf):
        frame.loc[0] = [1]

    assert_eq(pdf, gdf)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12505")
def test_loc_setitem_extend_existing_12505():
    """Assign to a row label that is absent from a one-row frame."""
    pdf = pd.DataFrame({"a": [0]})
    gdf = cudf.from_pandas(pdf)

    # Apply the same assignment to the pandas frame, then the cudf frame.
    for frame in (pdf, gdf):
        frame.loc[1] = 1

    assert_eq(pdf, gdf)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801")
def test_loc_setitem_add_column_partial_12801():
    """Assign to a not-yet-existing column ``b`` through a row mask."""
    pdf = pd.DataFrame({"a": [0, 1, 2]})
    gdf = cudf.from_pandas(pdf)

    # Each frame is masked with a condition computed from itself.
    for frame in (pdf, gdf):
        frame.loc[frame.a < 2, "b"] = 1

    assert_eq(pdf, gdf)


@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13031")
@pytest.mark.parametrize("other_index", [["1", "3", "2"], [1, 2, 3]])
def test_loc_setitem_series_index_alignment_13031(other_index):
    """Assign a Series whose index differs from the target via ``.loc``."""
    target_labels = ["1", "3"]
    pser = pd.Series([1, 2, 3], index=["1", "2", "3"])
    pother = pd.Series([5, 6, 7], index=other_index)

    gser = cudf.from_pandas(pser)
    gother = cudf.from_pandas(pother)

    pser.loc[target_labels] = pother
    gser.loc[target_labels] = gother

    assert_eq(pser, gser)

0 comments on commit 46f17f7

Please sign in to comment.