diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a96db59dee3..5b2b3e72481 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -27,6 +27,16 @@ _index_type_aggs = {"count", "idxmin", "idxmax"} +def assert_groupby_results_equal(expect, got, sort=True, **kwargs): + # Because we don't sort by index by default in groupby, + # sort expect and got by index before comparing + if sort: + expect = expect.sort_index() + got = got.sort_index() + else: + assert_eq(expect.sort_index(), got.sort_index(), **kwargs) + + def make_frame( dataframe_class, nelem, @@ -73,17 +83,11 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .mean() - ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .mean() + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @@ -92,60 +96,60 @@ def test_groupby_mean_3level(nelem): bys = list("xyz") got_df = ( make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys, sort=True) + .groupby(bys) .mean() ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys, sort=True) + .groupby(bys) .mean() ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_mean_min(nelem): got_df = ( make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg(["mean", "min"]) ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg(["mean", "min"]) ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictargs(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": "min", "b": "max"}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": "min", "b": "max"}) ) - assert_eq(expect_df, got_df) + assert_groupby_results_equal(expect_df, got_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictlist(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @@ -155,24 +159,18 @@ def test_groupby_agg_min_max_dictlist(nelem): def test_groupby_2keys_agg(nelem, func): # gdf (Note: lack of multiIndex) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = False if func in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) + assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) - assert_eq(pdf, gdf) + gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf) @pytest.mark.parametrize("as_index", [True, False]) @@ -198,36 +196,33 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y", sort=True).agg({"x": "mean"}) - pdf = pdf.groupby("y", sort=True).agg({"x": "mean"}) - assert_eq(pdf, gdf) + gdf = gdf.groupby("y").agg({"x": "mean"}) + pdf = pdf.groupby("y").agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf) def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True, sort=True).sum() - pdf = pdf.groupby("y", group_keys=True, sort=True).sum() - assert_eq(pdf, gdf) + gdf = gdf.groupby("y", group_keys=True).sum() + pdf = pdf.groupby("y", group_keys=True).sum() + assert_groupby_results_equal(pdf, gdf) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("x", sort=True)["y"].sum(), - gdf.groupby("x", sort=True)["y"].sum(), + assert_groupby_results_equal( + pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum(), ) - assert_eq( - pdf.groupby("x", sort=True).y.sum(), - gdf.groupby("x", sort=True).y.sum(), + assert_groupby_results_equal( + pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum(), ) - assert_eq( - pdf.groupby("x", sort=True)[["y"]].sum(), - gdf.groupby("x", sort=True)[["y"]].sum(), + assert_groupby_results_equal( + pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum(), ) - assert_eq( - pdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), - gdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), + assert_groupby_results_equal( + pdf.groupby(["x", "y"], as_index=as_index).sum(), + gdf.groupby(["x", "y"], as_index=as_index).sum(), ) @@ -277,10 +272,8 @@ def test_groupby_apply(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, sort=True - ) - got_grpby = df.groupby(["key1", "key2"], sort=True) + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) def foo(df): df["out"] = df["val1"] + df["val2"] @@ -288,7 +281,7 @@ def foo(df): expect = expect_grpby.apply(foo) got = got_grpby.apply(foo) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_apply_grouped(): @@ -300,10 +293,8 @@ def test_groupby_apply_grouped(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, sort=True - ) - got_grpby = df.groupby(["key1", "key2"], sort=True) + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) def foo(key1, val1, com1, com2): for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): @@ -328,7 +319,7 @@ def emulate(df): expect = expect_grpby.apply(emulate) expect = expect.sort_values(["key1", "key2"]) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) @pytest.mark.parametrize("nelem", [100, 500]) @@ -337,20 +328,14 @@ def emulate(df): ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_cudf_2keys_agg(nelem, func): - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) # pandas expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) ) check_dtype = False if func in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) + assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -364,7 +349,7 @@ def test_series_groupby(agg): sa = getattr(sg, agg)() ga = getattr(gg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(sa, ga, check_dtype=check_dtype) + assert_groupby_results_equal(sa, ga, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -376,7 +361,7 @@ def test_series_groupby_agg(agg): sg = s.groupby(s // 2).agg(agg) gg = g.groupby(g // 2).agg(agg) check_dtype = False if agg in _index_type_aggs else True - assert_eq(sg, gg, check_dtype=check_dtype) + assert_groupby_results_equal(sg, gg, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -405,7 +390,7 @@ def test_groupby_level_zero(agg): pdresult = getattr(pdg, agg)() gdresult = getattr(gdg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(pdresult, gdresult, check_dtype=check_dtype) + assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -434,59 +419,59 @@ def test_groupby_series_level_zero(agg): pdresult = getattr(pdg, agg)() gdresult = getattr(gdg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(pdresult, gdresult, check_dtype=check_dtype) + assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) def test_groupby_column_name(): pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy", sort=True) - p = pdf.groupby("yy", sort=True) + g = gdf.groupby("yy") + p = pdf.groupby("yy") gxx = g["xx"].sum() pxx = p["xx"].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].count() pxx = p["xx"].count() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].min() pxx = p["xx"].min() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].max() pxx = p["xx"].max() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].idxmin() pxx = p["xx"].idxmin() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].idxmax() pxx = p["xx"].idxmax() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].mean() pxx = p["xx"].mean() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) def test_groupby_column_numeral(): pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1, sort=True) - g = gdf.groupby(1, sort=True) + p = pdf.groupby(1) + g = gdf.groupby(1) pxx = p[0].sum() gxx = g[0].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5, sort=True) - g = gdf.groupby(1.5, sort=True) + p = pdf.groupby(1.5) + g = gdf.groupby(1.5) pxx = p[0.5].sum() gxx = g[0.5].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize( @@ -496,18 +481,18 @@ def test_groupby_column_numeral(): def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() - gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() - assert_eq(pxx, gxx) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize("series", [[0.0, 1.0], [1.0, 1.0, 1.0, 1.0]]) def test_groupby_external_series_incorrect_length(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() - gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() - assert_eq(pxx, gxx) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize( @@ -517,52 +502,51 @@ def test_groupby_levels(level): idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby(level=level, sort=True).sum(), - gdf.groupby(level=level, sort=True).sum(), + assert_groupby_results_equal( + pdf.groupby(level=level).sum(), gdf.groupby(level=level).sum(), ) def test_advanced_groupby_levels(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - assert_eq(pdg, gdg) - pdh = pdg.groupby(level=1, sort=True).sum() - gdh = gdg.groupby(level=1, sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y", "z"], sort=True).sum() - gdg = gdf.groupby(["x", "y", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["z"], sort=True).sum() - gdg = gdf.groupby(["z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["y", "z"], sort=True).sum() - gdg = gdf.groupby(["y", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["x", "z"], sort=True).sum() - gdg = gdf.groupby(["x", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["y"], sort=True).sum() - gdg = gdf.groupby(["y"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["x"], sort=True).sum() - gdg = gdf.groupby(["x"], sort=True).sum() - assert_eq(pdg, gdg) - pdh = pdg.groupby(level=0, sort=True).sum() - gdh = gdg.groupby(level=0, sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - pdh = pdg.groupby(level=[0, 1], sort=True).sum() - gdh = gdg.groupby(level=[0, 1], sort=True).sum() - assert_eq(pdh, gdh) - pdh = pdg.groupby(level=[1, 0], sort=True).sum() - gdh = gdg.groupby(level=[1, 0], sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=1).sum() + gdh = gdg.groupby(level=1).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y", "z"]).sum() + gdg = gdf.groupby(["x", "y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["z"]).sum() + gdg = gdf.groupby(["z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y", "z"]).sum() + gdg = gdf.groupby(["y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x", "z"]).sum() + gdg = gdf.groupby(["x", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y"]).sum() + gdg = gdf.groupby(["y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x"]).sum() + gdg = gdf.groupby(["x"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=0).sum() + gdh = gdg.groupby(level=0).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + pdh = pdg.groupby(level=[0, 1]).sum() + gdh = gdg.groupby(level=[0, 1]).sum() + assert_groupby_results_equal(pdh, gdh) + pdh = pdg.groupby(level=[1, 0]).sum() + gdh = gdg.groupby(level=[1, 0]).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() assert_exceptions_equal( lfunc=pdg.groupby, @@ -595,7 +579,7 @@ def test_advanced_groupby_levels(): def test_empty_groupby(func): pdf = pd.DataFrame({"x": [], "y": [], "z": []}) gdf = cudf.from_pandas(pdf) - assert_eq(func(pdf), func(gdf), check_index_type=False) + assert_groupby_results_equal(func(pdf), func(gdf), check_index_type=False) def test_groupby_unsupported_columns(): @@ -613,21 +597,21 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True).sum() - gdg = gdf.groupby("x", sort=True).sum() - assert_eq(pdg, gdg) + pdg = pdf.groupby("x").sum() + gdg = gdf.groupby("x").sum() + assert_groupby_results_equal(pdg, gdg) def test_list_of_series(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x], sort=True).y.sum() - gdg = gdf.groupby([gdf.x], sort=True).y.sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y], sort=True).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y], sort=True).y.sum() + pdg = pdf.groupby([pdf.x]).y.sum() + gdg = gdf.groupby([gdf.x]).y.sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() + gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() pytest.skip() - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_use_agg_column_as_index(): @@ -637,7 +621,7 @@ def test_groupby_use_agg_column_as_index(): gdf["a"] = [1, 1, 1, 3, 5] pdg = pdf.groupby("a").agg({"a": "count"}) gdg = gdf.groupby("a").agg({"a": "count"}) - assert_eq(pdg, gdg, check_dtype=False) + assert_groupby_results_equal(pdg, gdg, check_dtype=False) def test_groupby_list_then_string(): @@ -646,13 +630,13 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg( + gdg = gdf.groupby("a", as_index=True).agg( {"b": ["min", "max"], "c": "max"} ) - pdg = pdf.groupby("a", as_index=True, sort=True).agg( + pdg = pdf.groupby("a", as_index=True).agg( {"b": ["min", "max"], "c": "max"} ) - assert_eq(gdg, pdg) + assert_groupby_results_equal(gdg, pdg) def test_groupby_different_unequal_length_column_aggregations(): @@ -661,13 +645,13 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg( + gdg = gdf.groupby("a", as_index=True).agg( {"b": "min", "c": ["max", "min"]} ) - pdg = pdf.groupby("a", as_index=True, sort=True).agg( + pdg = pdf.groupby("a", as_index=True).agg( {"b": "min", "c": ["max", "min"]} ) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_single_var_two_aggs(): @@ -676,9 +660,9 @@ def test_groupby_single_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) - assert_eq(pdg, gdg) + gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) def test_groupby_double_var_two_aggs(): @@ -687,13 +671,9 @@ def test_groupby_double_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True, sort=True).agg( - {"c": ["min", "max"]} - ) - pdg = pdf.groupby(["a", "b"], as_index=True, sort=True).agg( - {"c": ["min", "max"]} - ) - assert_eq(pdg, gdg) + gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) def test_groupby_apply_basic_agg_single_column(): @@ -703,9 +683,9 @@ def test_groupby_apply_basic_agg_single_column(): gdf["mult"] = gdf["key"] * gdf["val"] pdf = gdf.to_pandas() - gdg = gdf.groupby(["key", "val"], sort=True).mult.sum() - pdg = pdf.groupby(["key", "val"], sort=True).mult.sum() - assert_eq(pdg, gdg) + gdg = gdf.groupby(["key", "val"]).mult.sum() + pdg = pdf.groupby(["key", "val"]).mult.sum() + assert_groupby_results_equal(pdg, gdg) def test_groupby_multi_agg_single_groupby_series(): @@ -716,10 +696,10 @@ def test_groupby_multi_agg_single_groupby_series(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True).y.agg(["sum", "max"]) - gdg = gdf.groupby("x", sort=True).y.agg(["sum", "max"]) + pdg = pdf.groupby("x").y.agg(["sum", "max"]) + gdg = gdf.groupby("x").y.agg(["sum", "max"]) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_multi_agg_multi_groupby(): @@ -732,9 +712,9 @@ def test_groupby_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - assert_eq(pdg, gdg) + pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + assert_groupby_results_equal(pdg, gdg) def test_groupby_datetime_multi_agg_multi_groupby(): @@ -751,10 +731,10 @@ def test_groupby_datetime_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) @pytest.mark.parametrize( @@ -778,9 +758,9 @@ def test_groupby_multi_agg_hash_groupby(agg): ).reset_index(drop=True) pdf = gdf.to_pandas() check_dtype = False if "count" in agg else True - pdg = pdf.groupby("id", sort=True).agg(agg) - gdg = gdf.groupby("id", sort=True).agg(agg) - assert_eq(pdg, gdg, check_dtype=check_dtype) + pdg = pdf.groupby("id").agg(agg) + gdg = gdf.groupby("id").agg(agg) + assert_groupby_results_equal(pdg, gdg, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -791,9 +771,9 @@ def test_groupby_nulls_basic(agg): pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) gdf = cudf.from_pandas(pdf) - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)(), - getattr(gdf.groupby("a", sort=True), agg)(), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)(), + getattr(gdf.groupby("a"), agg)(), check_dtype=check_dtype, ) @@ -805,9 +785,9 @@ def test_groupby_nulls_basic(agg): } ) gdf = cudf.from_pandas(pdf) - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)(), - getattr(gdf.groupby("a", sort=True), agg)(), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)(), + getattr(gdf.groupby("a"), agg)(), check_dtype=check_dtype, ) @@ -822,9 +802,9 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)().fillna(0), - getattr(gdf.groupby("a", sort=True), agg)().fillna(0), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)().fillna(0), + getattr(gdf.groupby("a"), agg)().fillna(0), check_dtype=check_dtype, ) @@ -833,7 +813,9 @@ def test_groupby_nulls_in_index(): pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) def test_groupby_all_nulls_index(): @@ -844,13 +826,17 @@ def test_groupby_all_nulls_index(): } ) pdf = gdf.to_pandas() - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) gdf = cudf.DataFrame( {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} ) pdf = gdf.to_pandas() - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) @pytest.mark.parametrize("sort", [True, False]) @@ -902,10 +888,8 @@ def test_groupby_cat(): {"a": [1, 1, 2], "b": pd.Series(["b", "b", "a"], dtype="category")} ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).count(), - gdf.groupby("a", sort=True).count(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby("a").count(), gdf.groupby("a").count(), check_dtype=False, ) @@ -947,7 +931,7 @@ def test_groupby_quantile(interpolation, q): "Pandas NaN Rounding will fail nearest interpolation at 0.5" ) - assert_eq(pdresult, gdresult) + assert_groupby_results_equal(pdresult, gdresult) def test_groupby_std(): @@ -957,16 +941,12 @@ def test_groupby_std(): } pdf = pd.DataFrame(raw_data) gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True) - gdg = gdf.groupby("x", sort=True) + pdg = pdf.groupby("x") + gdg = gdf.groupby("x") pdresult = pdg.std() gdresult = gdg.std() - # There's a lot left to add to python bindings like index name - # so this is a temporary workaround - pdresult = pdresult["y"].reset_index(drop=True) - gdresult = gdresult["y"].reset_index(drop=True) - assert_eq(pdresult, gdresult) + assert_groupby_results_equal(pdresult, gdresult) def test_groupby_size(): @@ -979,23 +959,19 @@ def test_groupby_size(): ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).size(), - gdf.groupby("a", sort=True).size(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby("a").size(), gdf.groupby("a").size(), check_dtype=False, ) - assert_eq( - pdf.groupby(["a", "b", "c"], sort=True).size(), - gdf.groupby(["a", "b", "c"], sort=True).size(), + assert_groupby_results_equal( + pdf.groupby(["a", "b", "c"]).size(), + gdf.groupby(["a", "b", "c"]).size(), check_dtype=False, ) sr = pd.Series(range(len(pdf))) - assert_eq( - pdf.groupby(sr, sort=True).size(), - gdf.groupby(sr, sort=True).size(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby(sr).size(), gdf.groupby(sr).size(), check_dtype=False, ) @@ -1010,15 +986,15 @@ def test_groupby_datetime(nelem, as_index, agg): check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index, sort=True) - gdg = gdf.groupby("datetime", as_index=as_index, sort=True) + pdg = pdf.groupby("datetime", as_index=as_index) + gdg = gdf.groupby("datetime", as_index=as_index) if as_index is False: pdres = getattr(pdg, agg)() gdres = getattr(gdg, agg)() else: pdres = pdg.agg({"datetime": agg}) gdres = gdg.agg({"datetime": agg}) - assert_eq(pdres, gdres, check_dtype=check_dtype) + assert_groupby_results_equal(pdres, gdres, check_dtype=check_dtype) def test_groupby_dropna(): @@ -1026,8 +1002,8 @@ def test_groupby_dropna(): expect = cudf.DataFrame( {"b": [3, 3]}, index=cudf.Series([1, None], name="a") ) - got = df.groupby("a", dropna=False, sort=True).sum() - assert_eq(expect, got) + got = df.groupby("a", dropna=False).sum() + assert_groupby_results_equal(expect, got) df = cudf.DataFrame( {"a": [1, 1, 1, None], "b": [1, None, 1, None], "c": [1, 2, 3, 4]} @@ -1037,22 +1013,22 @@ def test_groupby_dropna(): names=["a", "b"], ) expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False, sort=True).sum() + got = df.groupby(["a", "b"], dropna=False).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_dropna_getattr(): df = cudf.DataFrame() df["id"] = [0, 1, 1, None, None, 3, 3] df["val"] = [0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False, sort=True).val.sum() + got = df.groupby("id", dropna=False).val.sum() expect = cudf.Series( [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") ) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_categorical_from_string(): @@ -1060,9 +1036,9 @@ def test_groupby_categorical_from_string(): gdf["id"] = ["a", "b", "c"] gdf["val"] = [0, 1, 2] gdf["id"] = gdf["id"].astype("category") - assert_eq( + assert_groupby_results_equal( cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id", sort=True).sum(), + gdf.groupby("id").sum(), ) @@ -1076,7 +1052,7 @@ def test_groupby_arbitrary_length_series(): expect = pdf.groupby(psr).sum() got = gdf.groupby(gsr).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_series_same_name_as_dataframe_column(): @@ -1089,7 +1065,7 @@ def test_groupby_series_same_name_as_dataframe_column(): expect = pdf.groupby(psr).sum() got = gdf.groupby(gsr).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_group_by_series_and_column_name_in_by(): @@ -1106,7 +1082,7 @@ def test_group_by_series_and_column_name_in_by(): expect = pdf.groupby(["x", psr0, psr1]).sum() got = gdf.groupby(["x", gsr0, gsr1]).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) @pytest.mark.parametrize( @@ -1147,10 +1123,10 @@ def test_groupby_count(agg, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by, sort=True).agg(agg) - got = gdf.groupby(by, sort=True).agg(agg) + expect = pdf.groupby(by).agg(agg) + got = gdf.groupby(by).agg(agg) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize("agg", [lambda x: x.median(), "median"]) @@ -1164,7 +1140,7 @@ def test_groupby_median(agg, by): expect = pdf.groupby(by).agg(agg) got = gdf.groupby(by).agg(agg) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) @@ -1180,7 +1156,7 @@ def test_groupby_nunique(agg, by): expect = pdf.groupby(by).nunique() got = gdf.groupby(by).nunique() - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize( @@ -1198,10 +1174,10 @@ def test_groupby_nth(n, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by, sort=True).nth(n) - got = gdf.groupby(by, sort=True).nth(n) + expect = pdf.groupby(by).nth(n) + got = gdf.groupby(by).nth(n) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) def test_raise_data_error(): @@ -1217,7 +1193,7 @@ def test_drop_unsupported_multi_agg(): gdf = cudf.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} ) - assert_eq( + assert_groupby_results_equal( gdf.groupby("a").agg(["count", "mean"]), gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), ) @@ -1245,9 +1221,9 @@ def test_groupby_agg_combinations(agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).agg(agg), - gdf.groupby("a", sort=True).agg(agg), + assert_groupby_results_equal( + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), check_dtype=False, ) @@ -1257,7 +1233,7 @@ def test_groupby_apply_noempty_group(): {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a") .apply(lambda x: x.iloc[[0, 1]]) .reset_index(drop=True), @@ -1272,9 +1248,9 @@ def test_reset_index_after_empty_groupby(): pdf = pd.DataFrame({"a": [1, 2, 3]}) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).sum().reset_index(), - gdf.groupby("a", sort=True).sum().reset_index(), + assert_groupby_results_equal( + pdf.groupby("a").sum().reset_index(), + gdf.groupby("a").sum().reset_index(), ) @@ -1354,7 +1330,7 @@ def test_groupby_nunique_series(): pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a")["b"].nunique(), gdf.groupby("a")["b"].nunique(), check_dtype=False, @@ -1366,7 +1342,7 @@ def test_groupby_list_simple(list_agg): pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, None, 4, 5, 6]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1383,7 +1359,7 @@ def test_groupby_list_of_lists(list_agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1395,7 +1371,7 @@ def test_groupby_list_single_element(list_agg): pdf = pd.DataFrame({"a": [1, 2], "b": [3, None]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1415,7 +1391,7 @@ def test_groupby_list_strings(agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg(agg), gdf.groupby("a").agg(agg), check_dtype=False, @@ -1432,11 +1408,11 @@ def test_groupby_list_columns_excluded(): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").mean(), gdf.groupby("a").mean(), check_dtype=False ) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg("mean"), gdf.groupby("a").agg("mean"), check_dtype=False, @@ -1450,7 +1426,7 @@ def test_groupby_pipe(): expected = pdf.groupby("A").pipe(lambda x: x.max() - x.min()) actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) def test_groupby_apply_return_scalars(): @@ -1482,7 +1458,7 @@ def custom_map_func(x): expected = pdf.groupby("A").apply(lambda x: custom_map_func(x)) actual = gdf.groupby("A").apply(lambda x: custom_map_func(x)) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) @pytest.mark.parametrize( @@ -1498,7 +1474,7 @@ def test_groupby_apply_return_series_dataframe(cust_func): expected = pdf.groupby(["key"]).apply(cust_func) actual = gdf.groupby(["key"]).apply(cust_func) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) @pytest.mark.parametrize( @@ -1507,7 +1483,7 @@ def test_groupby_apply_return_series_dataframe(cust_func): ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, @@ -1521,7 +1497,7 @@ def test_groupby_no_keys(pdf): ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby([]).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), ) @@ -1560,4 +1536,4 @@ def test_groupby_unique(by, data, dtype): expect = pdf.groupby("by")["data"].unique() got = gdf.groupby("by")["data"].unique() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got)