Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix order-preservation in pandas-compat unsorted groupby #16942

Merged
merged 1 commit into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import GatherMap
from cudf.core.join._join_helpers import _match_join_keys
from cudf.core.mixins import Reducible, Scannable
from cudf.core.multiindex import MultiIndex
Expand Down Expand Up @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
left_cols = list(self.grouping.keys.drop_duplicates()._columns)
right_cols = list(result_index._columns)
join_keys = [
_match_join_keys(lcol, rcol, "left")
_match_join_keys(lcol, rcol, "inner")
for lcol, rcol in zip(left_cols, right_cols)
]
# TODO: In future, see if we can centralize
# logic else where that has similar patterns.
join_keys = map(list, zip(*join_keys))
_, indices = libcudf.join.join(
*join_keys,
how="left",
# By construction, left and right keys are related by
# a permutation, so we can use an inner join.
left_order, right_order = libcudf.join.join(
*join_keys, how="inner"
)
# left order is some permutation of the ordering we
# want, and right order is a matching gather map for
# the result table. Get the correct order by sorting
# the right gather map.
(right_order,) = libcudf.sort.sort_by_key(
[right_order],
[left_order],
[True],
["first"],
stable=False,
)
result = result._gather(
GatherMap.from_column_unchecked(
right_order, len(result), nullify=False
)
)
result = result.take(indices)

if not self._as_index:
result = result.reset_index()
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import numpy as np
import pytest

import cudf
from cudf.testing import assert_eq


@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"])
def with_nulls(request):
return request.param


@pytest.mark.parametrize("nrows", [30, 300, 300_000])
@pytest.mark.parametrize("nkeys", [1, 2, 4])
def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
key_names = [f"key{key}" for key in range(nkeys)]
key_values = [np.random.randint(100, size=nrows) for _ in key_names]
value = np.random.randint(-100, 100, size=nrows)
df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
if with_nulls:
for key in key_names:
df.loc[df[key] == 1, key] = None
with cudf.option_context("mode.pandas_compatible", True):
got = df.groupby(key_names, sort=False).agg({"value": "sum"})
expect = (
df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"})
)
assert_eq(expect, got, check_index_type=not with_nulls)
Loading