From 45cda024ec950cafbdbf44e929c480f14820c78e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 1 Apr 2021 12:58:43 -0400 Subject: [PATCH] Sort by index in groupby tests more consistently (#7802) Pandas sorts the result of a groupby by its index by default, whereas we do not. For testing purposes, we currently call groupby() with `sort=True` in many of our tests. But some tests (notably newer ones) don't do that. These tests have been passing silently. [Until now](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/1842/CUDA=10.2,GPU_LABEL=gpu,OS=centos7,PYTHON=3.8/testReport/junit/cudf.tests/test_groupby/test_series_groupby_agg_max_/). * adds a `assert_groupby_results_equal` helper in `test_groupby.py`, which sorts its inputs by index before comparing them * removes the manual `sort=True` we are doing in each of our groupby tests Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/7802 --- python/cudf/cudf/tests/test_groupby.py | 480 ++++++++++++------------- 1 file changed, 231 insertions(+), 249 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 84b52b1befb..4dbe608af82 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2,13 +2,15 @@ import datetime import itertools +from decimal import Decimal import numpy as np import pandas as pd import pytest from numba import cuda from numpy.testing import assert_array_equal -from decimal import Decimal + +import rmm import cudf from cudf.core import DataFrame, Series @@ -21,8 +23,6 @@ assert_exceptions_equal, ) -import rmm - _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") _now = np.int64(_now.astype("datetime64[ns]")) @@ -30,6 +30,16 @@ _index_type_aggs = {"count", "idxmin", "idxmax"} +def assert_groupby_results_equal(expect, got, sort=True, **kwargs): + # Because we don't sort by index by default in groupby, + # sort expect and got by index before comparing + if sort: + expect = expect.sort_index() + got = got.sort_index() + else: + assert_eq(expect.sort_index(), got.sort_index(), **kwargs) + + def make_frame( dataframe_class, nelem, @@ -76,17 +86,11 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .mean() - ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .mean() + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @@ -95,67 +99,67 @@ def test_groupby_mean_3level(nelem): bys = list("xyz") got_df = ( make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys, sort=True) + .groupby(bys) .mean() ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys, sort=True) + .groupby(bys) .mean() ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_mean_min(nelem): got_df = ( make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg(["mean", "min"]) ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg(["mean", "min"]) ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictargs(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": "min", "b": "max"}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": "min", "b": "max"}) ) - assert_eq(expect_df, got_df) + assert_groupby_results_equal(expect_df, got_df) @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictlist(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"], sort=True) + .groupby(["x", "y"]) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) - assert_eq(got_df, expect_df) + assert_groupby_results_equal(got_df, expect_df) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) - assert_eq(pdf, gdf) + gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf) @pytest.mark.parametrize("as_index", [True, False]) @@ -181,36 +185,33 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y", sort=True).agg({"x": "mean"}) - pdf = pdf.groupby("y", sort=True).agg({"x": "mean"}) - assert_eq(pdf, gdf) + gdf = gdf.groupby("y").agg({"x": "mean"}) + pdf = pdf.groupby("y").agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf) def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True, sort=True).sum() - pdf = pdf.groupby("y", group_keys=True, sort=True).sum() - assert_eq(pdf, gdf) + gdf = gdf.groupby("y", group_keys=True).sum() + pdf = pdf.groupby("y", group_keys=True).sum() + assert_groupby_results_equal(pdf, gdf) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("x", sort=True)["y"].sum(), - gdf.groupby("x", sort=True)["y"].sum(), + assert_groupby_results_equal( + pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum(), ) - assert_eq( - pdf.groupby("x", sort=True).y.sum(), - gdf.groupby("x", sort=True).y.sum(), + assert_groupby_results_equal( + pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum(), ) - assert_eq( - pdf.groupby("x", sort=True)[["y"]].sum(), - gdf.groupby("x", sort=True)[["y"]].sum(), + assert_groupby_results_equal( + pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum(), ) - assert_eq( - pdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), - gdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), + assert_groupby_results_equal( + pdf.groupby(["x", "y"], as_index=as_index).sum(), + gdf.groupby(["x", "y"], as_index=as_index).sum(), ) @@ -260,10 +261,8 @@ def test_groupby_apply(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, sort=True - ) - got_grpby = df.groupby(["key1", "key2"], sort=True) + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) def foo(df): df["out"] = df["val1"] + df["val2"] @@ -271,7 +270,7 @@ def foo(df): expect = expect_grpby.apply(foo) got = got_grpby.apply(foo) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_apply_grouped(): @@ -283,10 +282,8 @@ def test_groupby_apply_grouped(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, sort=True - ) - got_grpby = df.groupby(["key1", "key2"], sort=True) + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) def foo(key1, val1, com1, com2): for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): @@ -311,7 +308,7 @@ def emulate(df): expect = expect_grpby.apply(emulate) expect = expect.sort_values(["key1", "key2"]) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @@ -322,18 +319,12 @@ def emulate(df): def test_groupby_2keys_agg(nelem, func): # gdf (Note: lack of multiIndex) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) + make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) ) + got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = False if func in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) + assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) @pytest.mark.parametrize("num_groups", [2, 3, 10, 50, 100]) @@ -409,7 +400,7 @@ def test_series_groupby(agg): sa = getattr(sg, agg)() ga = getattr(gg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(sa, ga, check_dtype=check_dtype) + assert_groupby_results_equal(sa, ga, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -421,7 +412,7 @@ def test_series_groupby_agg(agg): sg = s.groupby(s // 2).agg(agg) gg = g.groupby(g // 2).agg(agg) check_dtype = False if agg in _index_type_aggs else True - assert_eq(sg, gg, check_dtype=check_dtype) + assert_groupby_results_equal(sg, gg, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -450,7 +441,7 @@ def test_groupby_level_zero(agg): pdresult = getattr(pdg, agg)() gdresult = getattr(gdg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(pdresult, gdresult, check_dtype=check_dtype) + assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -479,59 +470,59 @@ def test_groupby_series_level_zero(agg): pdresult = getattr(pdg, agg)() gdresult = getattr(gdg, agg)() check_dtype = False if agg in _index_type_aggs else True - assert_eq(pdresult, gdresult, check_dtype=check_dtype) + assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) def test_groupby_column_name(): pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy", sort=True) - p = pdf.groupby("yy", sort=True) + g = gdf.groupby("yy") + p = pdf.groupby("yy") gxx = g["xx"].sum() pxx = p["xx"].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].count() pxx = p["xx"].count() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].min() pxx = p["xx"].min() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].max() pxx = p["xx"].max() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) gxx = g["xx"].idxmin() pxx = p["xx"].idxmin() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].idxmax() pxx = p["xx"].idxmax() - assert_eq(pxx, gxx, check_dtype=False) + assert_groupby_results_equal(pxx, gxx, check_dtype=False) gxx = g["xx"].mean() pxx = p["xx"].mean() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) def test_groupby_column_numeral(): pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1, sort=True) - g = gdf.groupby(1, sort=True) + p = pdf.groupby(1) + g = gdf.groupby(1) pxx = p[0].sum() gxx = g[0].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5, sort=True) - g = gdf.groupby(1.5, sort=True) + p = pdf.groupby(1.5) + g = gdf.groupby(1.5) pxx = p[0.5].sum() gxx = g[0.5].sum() - assert_eq(pxx, gxx) + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize( @@ -541,18 +532,18 @@ def test_groupby_column_numeral(): def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() - gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() - assert_eq(pxx, gxx) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize("series", [[0.0, 1.0], [1.0, 1.0, 1.0, 1.0]]) def test_groupby_external_series_incorrect_length(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() - gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() - assert_eq(pxx, gxx) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) @pytest.mark.parametrize( @@ -562,52 +553,51 @@ def test_groupby_levels(level): idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby(level=level, sort=True).sum(), - gdf.groupby(level=level, sort=True).sum(), + assert_groupby_results_equal( + pdf.groupby(level=level).sum(), gdf.groupby(level=level).sum(), ) def test_advanced_groupby_levels(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - assert_eq(pdg, gdg) - pdh = pdg.groupby(level=1, sort=True).sum() - gdh = gdg.groupby(level=1, sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y", "z"], sort=True).sum() - gdg = gdf.groupby(["x", "y", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["z"], sort=True).sum() - gdg = gdf.groupby(["z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["y", "z"], sort=True).sum() - gdg = gdf.groupby(["y", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["x", "z"], sort=True).sum() - gdg = gdf.groupby(["x", "z"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["y"], sort=True).sum() - gdg = gdf.groupby(["y"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["x"], sort=True).sum() - gdg = gdf.groupby(["x"], sort=True).sum() - assert_eq(pdg, gdg) - pdh = pdg.groupby(level=0, sort=True).sum() - gdh = gdg.groupby(level=0, sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - pdh = pdg.groupby(level=[0, 1], sort=True).sum() - gdh = gdg.groupby(level=[0, 1], sort=True).sum() - assert_eq(pdh, gdh) - pdh = pdg.groupby(level=[1, 0], sort=True).sum() - gdh = gdg.groupby(level=[1, 0], sort=True).sum() - assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"], sort=True).sum() - gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=1).sum() + gdh = gdg.groupby(level=1).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y", "z"]).sum() + gdg = gdf.groupby(["x", "y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["z"]).sum() + gdg = gdf.groupby(["z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y", "z"]).sum() + gdg = gdf.groupby(["y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x", "z"]).sum() + gdg = gdf.groupby(["x", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y"]).sum() + gdg = gdf.groupby(["y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x"]).sum() + gdg = gdf.groupby(["x"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=0).sum() + gdh = gdg.groupby(level=0).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + pdh = pdg.groupby(level=[0, 1]).sum() + gdh = gdg.groupby(level=[0, 1]).sum() + assert_groupby_results_equal(pdh, gdh) + pdh = pdg.groupby(level=[1, 0]).sum() + gdh = gdg.groupby(level=[1, 0]).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() assert_exceptions_equal( lfunc=pdg.groupby, @@ -640,7 +630,7 @@ def test_advanced_groupby_levels(): def test_empty_groupby(func): pdf = pd.DataFrame({"x": [], "y": [], "z": []}) gdf = cudf.from_pandas(pdf) - assert_eq(func(pdf), func(gdf), check_index_type=False) + assert_groupby_results_equal(func(pdf), func(gdf), check_index_type=False) def test_groupby_unsupported_columns(): @@ -658,21 +648,21 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True).sum() - gdg = gdf.groupby("x", sort=True).sum() - assert_eq(pdg, gdg) + pdg = pdf.groupby("x").sum() + gdg = gdf.groupby("x").sum() + assert_groupby_results_equal(pdg, gdg) def test_list_of_series(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x], sort=True).y.sum() - gdg = gdf.groupby([gdf.x], sort=True).y.sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y], sort=True).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y], sort=True).y.sum() + pdg = pdf.groupby([pdf.x]).y.sum() + gdg = gdf.groupby([gdf.x]).y.sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() + gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() pytest.skip() - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_use_agg_column_as_index(): @@ -682,7 +672,7 @@ def test_groupby_use_agg_column_as_index(): gdf["a"] = [1, 1, 1, 3, 5] pdg = pdf.groupby("a").agg({"a": "count"}) gdg = gdf.groupby("a").agg({"a": "count"}) - assert_eq(pdg, gdg, check_dtype=False) + assert_groupby_results_equal(pdg, gdg, check_dtype=False) def test_groupby_list_then_string(): @@ -691,13 +681,13 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg( + gdg = gdf.groupby("a", as_index=True).agg( {"b": ["min", "max"], "c": "max"} ) - pdg = pdf.groupby("a", as_index=True, sort=True).agg( + pdg = pdf.groupby("a", as_index=True).agg( {"b": ["min", "max"], "c": "max"} ) - assert_eq(gdg, pdg) + assert_groupby_results_equal(gdg, pdg) def test_groupby_different_unequal_length_column_aggregations(): @@ -706,13 +696,13 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg( + gdg = gdf.groupby("a", as_index=True).agg( {"b": "min", "c": ["max", "min"]} ) - pdg = pdf.groupby("a", as_index=True, sort=True).agg( + pdg = pdf.groupby("a", as_index=True).agg( {"b": "min", "c": ["max", "min"]} ) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_single_var_two_aggs(): @@ -721,9 +711,9 @@ def test_groupby_single_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) - assert_eq(pdg, gdg) + gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) def test_groupby_double_var_two_aggs(): @@ -732,13 +722,9 @@ def test_groupby_double_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True, sort=True).agg( - {"c": ["min", "max"]} - ) - pdg = pdf.groupby(["a", "b"], as_index=True, sort=True).agg( - {"c": ["min", "max"]} - ) - assert_eq(pdg, gdg) + gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) def test_groupby_apply_basic_agg_single_column(): @@ -748,9 +734,9 @@ def test_groupby_apply_basic_agg_single_column(): gdf["mult"] = gdf["key"] * gdf["val"] pdf = gdf.to_pandas() - gdg = gdf.groupby(["key", "val"], sort=True).mult.sum() - pdg = pdf.groupby(["key", "val"], sort=True).mult.sum() - assert_eq(pdg, gdg) + gdg = gdf.groupby(["key", "val"]).mult.sum() + pdg = pdf.groupby(["key", "val"]).mult.sum() + assert_groupby_results_equal(pdg, gdg) def test_groupby_multi_agg_single_groupby_series(): @@ -761,10 +747,10 @@ def test_groupby_multi_agg_single_groupby_series(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True).y.agg(["sum", "max"]) - gdg = gdf.groupby("x", sort=True).y.agg(["sum", "max"]) + pdg = pdf.groupby("x").y.agg(["sum", "max"]) + gdg = gdf.groupby("x").y.agg(["sum", "max"]) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) def test_groupby_multi_agg_multi_groupby(): @@ -777,9 +763,9 @@ def test_groupby_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - assert_eq(pdg, gdg) + pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + assert_groupby_results_equal(pdg, gdg) def test_groupby_datetime_multi_agg_multi_groupby(): @@ -796,10 +782,10 @@ def test_groupby_datetime_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - assert_eq(pdg, gdg) + assert_groupby_results_equal(pdg, gdg) @pytest.mark.parametrize( @@ -823,9 +809,9 @@ def test_groupby_multi_agg_hash_groupby(agg): ).reset_index(drop=True) pdf = gdf.to_pandas() check_dtype = False if "count" in agg else True - pdg = pdf.groupby("id", sort=True).agg(agg) - gdg = gdf.groupby("id", sort=True).agg(agg) - assert_eq(pdg, gdg, check_dtype=check_dtype) + pdg = pdf.groupby("id").agg(agg) + gdg = gdf.groupby("id").agg(agg) + assert_groupby_results_equal(pdg, gdg, check_dtype=check_dtype) @pytest.mark.parametrize( @@ -836,9 +822,9 @@ def test_groupby_nulls_basic(agg): pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) gdf = cudf.from_pandas(pdf) - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)(), - getattr(gdf.groupby("a", sort=True), agg)(), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)(), + getattr(gdf.groupby("a"), agg)(), check_dtype=check_dtype, ) @@ -850,9 +836,9 @@ def test_groupby_nulls_basic(agg): } ) gdf = cudf.from_pandas(pdf) - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)(), - getattr(gdf.groupby("a", sort=True), agg)(), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)(), + getattr(gdf.groupby("a"), agg)(), check_dtype=check_dtype, ) @@ -867,9 +853,9 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? - assert_eq( - getattr(pdf.groupby("a", sort=True), agg)().fillna(0), - getattr(gdf.groupby("a", sort=True), agg)().fillna(0), + assert_groupby_results_equal( + getattr(pdf.groupby("a"), agg)().fillna(0), + getattr(gdf.groupby("a"), agg)().fillna(0), check_dtype=check_dtype, ) @@ -878,7 +864,9 @@ def test_groupby_nulls_in_index(): pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) def test_groupby_all_nulls_index(): @@ -889,13 +877,17 @@ def test_groupby_all_nulls_index(): } ) pdf = gdf.to_pandas() - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) gdf = cudf.DataFrame( {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} ) pdf = gdf.to_pandas() - assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) @pytest.mark.parametrize("sort", [True, False]) @@ -947,10 +939,8 @@ def test_groupby_cat(): {"a": [1, 1, 2], "b": pd.Series(["b", "b", "a"], dtype="category")} ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).count(), - gdf.groupby("a", sort=True).count(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby("a").count(), gdf.groupby("a").count(), check_dtype=False, ) @@ -992,7 +982,7 @@ def test_groupby_quantile(interpolation, q): "Pandas NaN Rounding will fail nearest interpolation at 0.5" ) - assert_eq(pdresult, gdresult) + assert_groupby_results_equal(pdresult, gdresult) def test_groupby_std(): @@ -1002,16 +992,12 @@ def test_groupby_std(): } pdf = pd.DataFrame(raw_data) gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x", sort=True) - gdg = gdf.groupby("x", sort=True) + pdg = pdf.groupby("x") + gdg = gdf.groupby("x") pdresult = pdg.std() gdresult = gdg.std() - # There's a lot left to add to python bindings like index name - # so this is a temporary workaround - pdresult = pdresult["y"].reset_index(drop=True) - gdresult = gdresult["y"].reset_index(drop=True) - assert_eq(pdresult, gdresult) + assert_groupby_results_equal(pdresult, gdresult) def test_groupby_size(): @@ -1024,23 +1010,19 @@ def test_groupby_size(): ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).size(), - gdf.groupby("a", sort=True).size(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby("a").size(), gdf.groupby("a").size(), check_dtype=False, ) - assert_eq( - pdf.groupby(["a", "b", "c"], sort=True).size(), - gdf.groupby(["a", "b", "c"], sort=True).size(), + assert_groupby_results_equal( + pdf.groupby(["a", "b", "c"]).size(), + gdf.groupby(["a", "b", "c"]).size(), check_dtype=False, ) sr = pd.Series(range(len(pdf))) - assert_eq( - pdf.groupby(sr, sort=True).size(), - gdf.groupby(sr, sort=True).size(), - check_dtype=False, + assert_groupby_results_equal( + pdf.groupby(sr).size(), gdf.groupby(sr).size(), check_dtype=False, ) @@ -1055,15 +1037,15 @@ def test_groupby_datetime(nelem, as_index, agg): check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index, sort=True) - gdg = gdf.groupby("datetime", as_index=as_index, sort=True) + pdg = pdf.groupby("datetime", as_index=as_index) + gdg = gdf.groupby("datetime", as_index=as_index) if as_index is False: pdres = getattr(pdg, agg)() gdres = getattr(gdg, agg)() else: pdres = pdg.agg({"datetime": agg}) gdres = gdg.agg({"datetime": agg}) - assert_eq(pdres, gdres, check_dtype=check_dtype) + assert_groupby_results_equal(pdres, gdres, check_dtype=check_dtype) def test_groupby_dropna(): @@ -1071,8 +1053,8 @@ def test_groupby_dropna(): expect = cudf.DataFrame( {"b": [3, 3]}, index=cudf.Series([1, None], name="a") ) - got = df.groupby("a", dropna=False, sort=True).sum() - assert_eq(expect, got) + got = df.groupby("a", dropna=False).sum() + assert_groupby_results_equal(expect, got) df = cudf.DataFrame( {"a": [1, 1, 1, None], "b": [1, None, 1, None], "c": [1, 2, 3, 4]} @@ -1082,22 +1064,22 @@ def test_groupby_dropna(): names=["a", "b"], ) expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False, sort=True).sum() + got = df.groupby(["a", "b"], dropna=False).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_dropna_getattr(): df = cudf.DataFrame() df["id"] = [0, 1, 1, None, None, 3, 3] df["val"] = [0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False, sort=True).val.sum() + got = df.groupby("id", dropna=False).val.sum() expect = cudf.Series( [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") ) - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_categorical_from_string(): @@ -1105,9 +1087,9 @@ def test_groupby_categorical_from_string(): gdf["id"] = ["a", "b", "c"] gdf["val"] = [0, 1, 2] gdf["id"] = gdf["id"].astype("category") - assert_eq( + assert_groupby_results_equal( cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id", sort=True).sum(), + gdf.groupby("id").sum(), ) @@ -1121,7 +1103,7 @@ def test_groupby_arbitrary_length_series(): expect = pdf.groupby(psr).sum() got = gdf.groupby(gsr).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_groupby_series_same_name_as_dataframe_column(): @@ -1134,7 +1116,7 @@ def test_groupby_series_same_name_as_dataframe_column(): expect = pdf.groupby(psr).sum() got = gdf.groupby(gsr).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) def test_group_by_series_and_column_name_in_by(): @@ -1151,7 +1133,7 @@ def test_group_by_series_and_column_name_in_by(): expect = pdf.groupby(["x", psr0, psr1]).sum() got = gdf.groupby(["x", gsr0, gsr1]).sum() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got) @pytest.mark.parametrize( @@ -1192,10 +1174,10 @@ def test_groupby_count(agg, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by, sort=True).agg(agg) - got = gdf.groupby(by, sort=True).agg(agg) + expect = pdf.groupby(by).agg(agg) + got = gdf.groupby(by).agg(agg) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize("agg", [lambda x: x.median(), "median"]) @@ -1209,7 +1191,7 @@ def test_groupby_median(agg, by): expect = pdf.groupby(by).agg(agg) got = gdf.groupby(by).agg(agg) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) @@ -1225,7 +1207,7 @@ def test_groupby_nunique(agg, by): expect = pdf.groupby(by).nunique() got = gdf.groupby(by).nunique() - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) @pytest.mark.parametrize( @@ -1243,10 +1225,10 @@ def test_groupby_nth(n, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by, sort=True).nth(n) - got = gdf.groupby(by, sort=True).nth(n) + expect = pdf.groupby(by).nth(n) + got = gdf.groupby(by).nth(n) - assert_eq(expect, got, check_dtype=False) + assert_groupby_results_equal(expect, got, check_dtype=False) def test_raise_data_error(): @@ -1262,7 +1244,7 @@ def test_drop_unsupported_multi_agg(): gdf = cudf.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} ) - assert_eq( + assert_groupby_results_equal( gdf.groupby("a").agg(["count", "mean"]), gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), ) @@ -1290,9 +1272,9 @@ def test_groupby_agg_combinations(agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).agg(agg), - gdf.groupby("a", sort=True).agg(agg), + assert_groupby_results_equal( + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), check_dtype=False, ) @@ -1302,7 +1284,7 @@ def test_groupby_apply_noempty_group(): {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a") .apply(lambda x: x.iloc[[0, 1]]) .reset_index(drop=True), @@ -1317,9 +1299,9 @@ def test_reset_index_after_empty_groupby(): pdf = pd.DataFrame({"a": [1, 2, 3]}) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby("a", sort=True).sum().reset_index(), - gdf.groupby("a", sort=True).sum().reset_index(), + assert_groupby_results_equal( + pdf.groupby("a").sum().reset_index(), + gdf.groupby("a").sum().reset_index(), ) @@ -1399,7 +1381,7 @@ def test_groupby_nunique_series(): pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a")["b"].nunique(), gdf.groupby("a")["b"].nunique(), check_dtype=False, @@ -1411,7 +1393,7 @@ def test_groupby_list_simple(list_agg): pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, None, 4, 5, 6]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1428,7 +1410,7 @@ def test_groupby_list_of_lists(list_agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1440,7 +1422,7 @@ def test_groupby_list_single_element(list_agg): pdf = pd.DataFrame({"a": [1, 2], "b": [3, None]}) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg({"b": list}), gdf.groupby("a").agg({"b": list_agg}), check_dtype=False, @@ -1460,7 +1442,7 @@ def test_groupby_list_strings(agg): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg(agg), gdf.groupby("a").agg(agg), check_dtype=False, @@ -1477,11 +1459,11 @@ def test_groupby_list_columns_excluded(): ) gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").mean(), gdf.groupby("a").mean(), check_dtype=False ) - assert_eq( + assert_groupby_results_equal( pdf.groupby("a").agg("mean"), gdf.groupby("a").agg("mean"), check_dtype=False, @@ -1495,7 +1477,7 @@ def test_groupby_pipe(): expected = pdf.groupby("A").pipe(lambda x: x.max() - x.min()) actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) def test_groupby_apply_return_scalars(): @@ -1527,7 +1509,7 @@ def custom_map_func(x): expected = pdf.groupby("A").apply(lambda x: custom_map_func(x)) actual = gdf.groupby("A").apply(lambda x: custom_map_func(x)) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) @pytest.mark.parametrize( @@ -1543,7 +1525,7 @@ def test_groupby_apply_return_series_dataframe(cust_func): expected = pdf.groupby(["key"]).apply(cust_func) actual = gdf.groupby(["key"]).apply(cust_func) - assert_eq(expected, actual) + assert_groupby_results_equal(expected, actual) @pytest.mark.parametrize( @@ -1552,7 +1534,7 @@ def test_groupby_apply_return_series_dataframe(cust_func): ) def test_groupby_no_keys(pdf): gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby([]).max(), gdf.groupby([]).max(), check_dtype=False, @@ -1566,7 +1548,7 @@ def test_groupby_no_keys(pdf): ) def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) - assert_eq( + assert_groupby_results_equal( pdf.groupby([]).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), ) @@ -1605,4 +1587,4 @@ def test_groupby_unique(by, data, dtype): expect = pdf.groupby("by")["data"].unique() got = gdf.groupby("by")["data"].unique() - assert_eq(expect, got) + assert_groupby_results_equal(expect, got)