diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 447b2b3c4f5..161b245953b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -11,26 +11,6 @@ from cudf import concat from cudf.testing._utils import assert_eq, assert_exceptions_equal -# TODO: PANDAS 1.0 support -# Revisit drop_duplicates() tests to update parameters like ignore_index. - - -def assert_df(g, p): - # assert_eq() with sorted index of dataframes - g = g.sort_index() - p = p.sort_index() - return assert_eq(g, p) - - -def assert_df2(g, p): - assert g.index.dtype == p.index.dtype - np.testing.assert_equal(g.index.to_numpy(), p.index) - assert tuple(g.columns) == tuple(p.columns) - for k in g.columns: - assert g[k].dtype == p[k].dtype - np.testing.assert_equal(g[k].to_numpy(), p[k]) - - # most tests are similar to pandas drop_duplicates @@ -48,6 +28,7 @@ def test_duplicated_with_misspelled_column_name(subset): @pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( "data", [ @@ -57,14 +38,18 @@ def test_duplicated_with_misspelled_column_name(subset): pd.Series(["aaa"] * 10, dtype="object"), ], ) -def test_drop_duplicates_series(data, keep): +def test_drop_duplicates_series(data, keep, ignore_index): pds = pd.Series(data) gds = cudf.from_pandas(pds) - assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) - pds.drop_duplicates(keep=keep, inplace=True) - gds.drop_duplicates(keep=keep, inplace=True) - assert_df(pds, gds) + assert_eq( + pds.drop_duplicates(keep=keep, ignore_index=ignore_index), + gds.drop_duplicates(keep=keep, ignore_index=ignore_index), + ) + + pds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) + gds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) + assert_eq(pds, gds) def test_drop_duplicates(): @@ -82,31 +67,31 @@ def test_drop_duplicates(): result.drop_duplicates("AAA", inplace=True) expected = pdf.copy() expected.drop_duplicates("AAA", inplace=True) - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("AAA", keep="last") expected = pdf.drop_duplicates("AAA", keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("AAA", keep=False) expected = pdf.drop_duplicates("AAA", keep=False) - assert_df(result, expected) + assert_eq(result, expected) assert len(result) == 0 # multi column expected = pdf.loc[[0, 1, 2, 3]] result = gdf.drop_duplicates(np.array(["AAA", "B"])) - assert_df(result, expected) + assert_eq(result, expected) result = pdf.drop_duplicates(np.array(["AAA", "B"])) - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(("AAA", "B"), keep="last") expected = pdf.drop_duplicates(("AAA", "B"), keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(("AAA", "B"), keep=False) expected = pdf.drop_duplicates(("AAA", "B"), keep=False) - assert_df(result, expected) + assert_eq(result, expected) # consider everything df2 = gdf.loc[:, ["AAA", "B", "C"]] @@ -114,60 +99,60 @@ def test_drop_duplicates(): result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(["AAA", "B"]) - assert_df(result, expected) + assert_eq(result, expected) result = df2.drop_duplicates(keep="last") expected = df2.drop_duplicates(["AAA", "B"], keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(["AAA", "B"], keep=False) - assert_df(result, expected) + assert_eq(result, expected) # integers result = gdf.drop_duplicates("C") expected = pdf.drop_duplicates("C") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("C", keep="last") expected = pdf.drop_duplicates("C", keep="last") - assert_df(result, expected) + assert_eq(result, expected) gdf["E"] = gdf["C"].astype("int8") result = gdf.drop_duplicates("E") pdf["E"] = pdf["C"].astype("int8") expected = pdf.drop_duplicates("E") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("E", keep="last") expected = pdf.drop_duplicates("E", keep="last") - assert_df(result, expected) + assert_eq(result, expected) pdf = pd.DataFrame( {"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]} ) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = pd.DataFrame([[1, 0], [0, 2]]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = pd.DataFrame([[-2, 0], [0, -4]]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) x = np.iinfo(np.int64).max / 3 * 2 pdf = pd.DataFrame([[-x, x], [0, x + 4]]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = pd.DataFrame([[-x, x], [x, x + 4]]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = pd.DataFrame([i] * 9 for i in range(16)) pdf = pd.concat([pdf, pd.DataFrame([[1] + [0] * 8])], ignore_index=True) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) @pytest.mark.skip(reason="cudf does not support duplicate column names yet") @@ -178,11 +163,11 @@ def test_drop_duplicates_with_duplicate_column_names(): df = cudf.DataFrame.from_pandas(df) result0 = df.drop_duplicates() - assert_df(result0, df) + assert_eq(result0, df) result1 = df.drop_duplicates("a") expected1 = df[:2] - assert_df(result1, expected1) + assert_eq(result1, expected1) def test_drop_duplicates_for_take_all(): @@ -198,28 +183,28 @@ def test_drop_duplicates_for_take_all(): # single column result = gdf.drop_duplicates("AAA") expected = pdf.drop_duplicates("AAA") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("AAA", keep="last") expected = pdf.drop_duplicates("AAA", keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates("AAA", keep=False) expected = pdf.drop_duplicates("AAA", keep=False) - assert_df(result, expected) + assert_eq(result, expected) # multiple columns result = gdf.drop_duplicates(["AAA", "B"]) expected = pdf.drop_duplicates(["AAA", "B"]) - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(["AAA", "B"], keep="last") expected = pdf.drop_duplicates(["AAA", "B"], keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(["AAA", "B"], keep=False) expected = pdf.drop_duplicates(["AAA", "B"], keep=False) - assert_df(result, expected) + assert_eq(result, expected) def test_drop_duplicates_tuple(): @@ -244,21 +229,21 @@ def test_drop_duplicates_tuple(): # single column result = gdf.drop_duplicates(("AA", "AB")) expected = pdf.drop_duplicates(("AA", "AB")) - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(("AA", "AB"), keep="last") expected = pdf.drop_duplicates(("AA", "AB"), keep="last") - assert_df(result, expected) + assert_eq(result, expected) result = gdf.drop_duplicates(("AA", "AB"), keep=False) expected = pdf.drop_duplicates(("AA", "AB"), keep=False) # empty df assert len(result) == 0 - assert_df(result, expected) + assert_eq(result, expected) # multi column expected = pdf.drop_duplicates((("AA", "AB"), "B")) result = gdf.drop_duplicates((("AA", "AB"), "B")) - assert_df(result, expected) + assert_eq(result, expected) @pytest.mark.parametrize( @@ -274,11 +259,11 @@ def test_drop_duplicates_tuple(): def test_drop_duplicates_empty(df): df = cudf.DataFrame.from_pandas(df) result = df.drop_duplicates() - assert_df(result, df) + assert_eq(result, df) result = df.copy() result.drop_duplicates(inplace=True) - assert_df(result, df) + assert_eq(result, df) @pytest.mark.parametrize("num_columns", [3, 4, 5]) @@ -296,18 +281,18 @@ def get_pdf(n_dup): for i in range(5): pdf = get_pdf(i) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) # subset columns, single columns - assert_df( + assert_eq( gdf.drop_duplicates(pdf.columns[:-1]), pdf.drop_duplicates(pdf.columns[:-1]), ) - assert_df( + assert_eq( gdf.drop_duplicates(pdf.columns[-1]), pdf.drop_duplicates(pdf.columns[-1]), ) - assert_df( + assert_eq( gdf.drop_duplicates(pdf.columns[0]), pdf.drop_duplicates(pdf.columns[0]), ) @@ -315,12 +300,12 @@ def get_pdf(n_dup): # subset columns shuffled cols = list(pdf.columns) random.Random(3).shuffle(cols) - assert_df(gdf.drop_duplicates(cols), pdf.drop_duplicates(cols)) + assert_eq(gdf.drop_duplicates(cols), pdf.drop_duplicates(cols)) random.Random(3).shuffle(cols) - assert_df(gdf.drop_duplicates(cols[:-1]), pdf.drop_duplicates(cols[:-1])) + assert_eq(gdf.drop_duplicates(cols[:-1]), pdf.drop_duplicates(cols[:-1])) random.Random(3).shuffle(cols) - assert_df(gdf.drop_duplicates(cols[-1]), pdf.drop_duplicates(cols[-1])) - assert_df( + assert_eq(gdf.drop_duplicates(cols[-1]), pdf.drop_duplicates(cols[-1])) + assert_eq( gdf.drop_duplicates(cols, keep="last"), pdf.drop_duplicates(cols, keep="last"), ) @@ -332,7 +317,7 @@ def test_dataframe_drop_duplicates_method(): columns=["n1", "n2", "s1"], ) gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) assert_eq( gdf.drop_duplicates("n1")["n1"].reset_index(drop=True), @@ -355,13 +340,13 @@ def test_dataframe_drop_duplicates_method(): assert gdf.drop_duplicates("s1", inplace=True) is None gdf = cudf.DataFrame.from_pandas(pdf) - assert_df(gdf.drop_duplicates("n1"), pdf.drop_duplicates("n1")) - assert_df(gdf.drop_duplicates("n2"), pdf.drop_duplicates("n2")) - assert_df(gdf.drop_duplicates("s1"), pdf.drop_duplicates("s1")) - assert_df( + assert_eq(gdf.drop_duplicates("n1"), pdf.drop_duplicates("n1")) + assert_eq(gdf.drop_duplicates("n2"), pdf.drop_duplicates("n2")) + assert_eq(gdf.drop_duplicates("s1"), pdf.drop_duplicates("s1")) + assert_eq( gdf.drop_duplicates(["n1", "n2"]), pdf.drop_duplicates(["n1", "n2"]) ) - assert_df( + assert_eq( gdf.drop_duplicates(["n1", "s1"]), pdf.drop_duplicates(["n1", "s1"]) ) @@ -387,13 +372,13 @@ def test_datetime_drop_duplicates(): date_df["value"] = np.random.sample(len(date_df)) df = concat([date_df, date_df[:4]]) - assert_df(df[:-4], df.drop_duplicates()) + assert_eq(df[:-4], df.drop_duplicates()) df2 = df.reset_index() - assert_df(df2[:-4], df2.drop_duplicates()) + assert_eq(df2[:-4], df2.drop_duplicates()) df3 = df.set_index("date") - assert_df(df3[:-4], df3.drop_duplicates()) + assert_eq(df3[:-4], df3.drop_duplicates()) def test_drop_duplicates_NA(): @@ -410,29 +395,29 @@ def test_drop_duplicates_NA(): # single column result = df.drop_duplicates("A") expected = df.to_pandas().loc[[0, 2, 3]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates("A", keep="last") expected = df.to_pandas().loc[[1, 6, 7]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates("A", keep=False) expected = df.to_pandas().loc[[]] # empty df - assert_df(result, expected) + assert_eq(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(["A", "B"]) expected = df.to_pandas().loc[[0, 2, 3, 6]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates(["A", "B"], keep="last") expected = df.to_pandas().loc[[1, 5, 6, 7]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates(["A", "B"], keep=False) expected = df.to_pandas().loc[[6]] - assert_df(result, expected) + assert_eq(result, expected) # nan df = pd.DataFrame( @@ -447,83 +432,77 @@ def test_drop_duplicates_NA(): # single column result = df.drop_duplicates("C") expected = df[:2] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates("C", keep="last") expected = df.to_pandas().loc[[3, 7]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates("C", keep=False) expected = df.to_pandas().loc[[]] # empty df - assert_df(result, expected) + assert_eq(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(["C", "B"]) expected = df.to_pandas().loc[[0, 1, 2, 4]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates(["C", "B"], keep="last") expected = df.to_pandas().loc[[1, 3, 6, 7]] - assert_df(result, expected) + assert_eq(result, expected) result = df.drop_duplicates(["C", "B"], keep=False) expected = df.to_pandas().loc[[1]] - assert_df(result, expected) + assert_eq(result, expected) def test_drop_duplicates_NA_for_take_all(): - # TODO: PANDAS 1.0 support - add ignore_index for - # pandas drop_duplicates calls in this function. - - # none pdf = pd.DataFrame( { "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], } ) - df = cudf.DataFrame.from_pandas(pdf) - # single column + + # single column with None result = df.drop_duplicates("A") - expected = pdf.iloc[[0, 2, 3, 5, 7]] - assert_df(result, expected) - assert_df( + expected = pdf.drop_duplicates("A") + assert_eq(result, expected) + assert_eq( df.drop_duplicates("A", ignore_index=True), - result.reset_index(drop=True), + pdf.drop_duplicates("A", ignore_index=True), ) result = df.drop_duplicates("A", keep="last") - expected = pdf.iloc[[1, 4, 5, 6, 7]] - assert_df(result, expected) - assert_df( + expected = pdf.drop_duplicates("A", keep="last") + assert_eq(result, expected) + assert_eq( df.drop_duplicates("A", ignore_index=True, keep="last"), - result.reset_index(drop=True), + pdf.drop_duplicates("A", ignore_index=True, keep="last"), ) result = df.drop_duplicates("A", keep=False) - expected = pdf.iloc[[5, 7]] - assert_df(result, expected) - assert_df( + expected = pdf.drop_duplicates("A", keep=False) + assert_eq(result, expected) + assert_eq( df.drop_duplicates("A", ignore_index=True, keep=False), - result.reset_index(drop=True), + pdf.drop_duplicates("A", ignore_index=True, keep=False), ) - # nan - - # single column + # single column with nan result = df.drop_duplicates("C") - expected = pdf.iloc[[0, 1, 5, 6]] - assert_df(result, expected) + expected = pdf.drop_duplicates("C") + assert_eq(result, expected) result = df.drop_duplicates("C", keep="last") - expected = pdf.iloc[[3, 5, 6, 7]] - assert_df(result, expected) + expected = pdf.drop_duplicates("C", keep="last") + assert_eq(result, expected) result = df.drop_duplicates("C", keep=False) - expected = pdf.iloc[[5, 6]] - assert_df(result, expected) + expected = pdf.drop_duplicates("C", keep=False) + assert_eq(result, expected) def test_drop_duplicates_inplace(): @@ -541,19 +520,19 @@ def test_drop_duplicates_inplace(): df.drop_duplicates("A", inplace=True) expected = orig[:2] result = df - assert_df(result, expected) + assert_eq(result, expected) df = orig.copy() df.drop_duplicates("A", keep="last", inplace=True) expected = orig.loc[[6, 7]] result = df - assert_df(result, expected) + assert_eq(result, expected) df = orig.copy() df.drop_duplicates("A", keep=False, inplace=True) expected = orig.loc[[]] result = df - assert_df(result, expected) + assert_eq(result, expected) assert len(df) == 0 # multi column @@ -561,19 +540,19 @@ def test_drop_duplicates_inplace(): df.drop_duplicates(["A", "B"], inplace=True) expected = orig.loc[[0, 1, 2, 3]] result = df - assert_df(result, expected) + assert_eq(result, expected) df = orig.copy() df.drop_duplicates(["A", "B"], keep="last", inplace=True) expected = orig.loc[[0, 5, 6, 7]] result = df - assert_df(result, expected) + assert_eq(result, expected) df = orig.copy() df.drop_duplicates(["A", "B"], keep=False, inplace=True) expected = orig.loc[[0]] result = df - assert_df(result, expected) + assert_eq(result, expected) # consider everything orig2 = orig.loc[:, ["A", "B", "C"]].copy() @@ -583,19 +562,19 @@ def test_drop_duplicates_inplace(): # in this case only expected = orig2.drop_duplicates(["A", "B"]) result = df2 - assert_df(result, expected) + assert_eq(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep="last", inplace=True) expected = orig2.drop_duplicates(["A", "B"], keep="last") result = df2 - assert_df(result, expected) + assert_eq(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep=False, inplace=True) expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 - assert_df(result, expected) + assert_eq(result, expected) def test_drop_duplicates_multi_index(): @@ -610,11 +589,11 @@ def test_drop_duplicates_multi_index(): expected = pdf.drop_duplicates() result = gdf.drop_duplicates() - assert_df(result.to_pandas(), expected) + assert_eq(result.to_pandas(), expected) # FIXME: to_pandas needed until sort_index support for MultiIndex for col in gdf.columns: - assert_df( + assert_eq( gdf[col].drop_duplicates().to_pandas(), pdf[col].drop_duplicates(), ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index b2bf687ba06..a9ba80a395d 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -972,8 +972,7 @@ def test_string_split_re(data, pat, n, expand): ps = pd.Series(data, dtype="str") gs = cudf.Series(data, dtype="str") - # Pandas does not support the regex parameter until 1.4.0 - expect = ps.str.split(pat=pat, n=n, expand=expand) + expect = ps.str.split(pat=pat, n=n, expand=expand, regex=True) got = gs.str.split(pat=pat, n=n, expand=expand, regex=True) assert_eq(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index afe2a050695..0211f5ee61a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pytest -from packaging import version import dask from dask import dataframe as dd @@ -596,14 +595,6 @@ def test_unary_ops(func, gdf, gddf): p = func(gdf) g = func(gddf) - # Fixed in https://github.com/dask/dask/pull/4657 - if isinstance(p, cudf.Index): - if version.parse(dask.__version__) < version.parse("1.1.6"): - pytest.skip( - "dask.dataframe assert_eq index check hardcoded to " - "pandas prior to 1.1.6 release" - ) - dd.assert_eq(p, g, check_names=False)