Skip to content
/ cudf Public
forked from rapidsai/cudf

Commit

Permalink
Fix order-preservation in pandas-compat unsorted groupby
Browse files Browse the repository at this point in the history
When sort=False is requested in groupby aggregations and pandas
compatibility mode is enabled, we are on the hook to produce the
grouped aggregation result in an order which matches the input key
order. We previously nearly did this, but the reordering relied on
the (incorrect) assumption that when joining two tables with a left
join, the resulting gather map for the left table is the identity.
This is not the case.

To fix this, we must permute the right (result) table gather map by
the ordering that makes the left map the identity (AKA, sort by key
with the left map as keys) before permuting the result.

While here, replace the (bounds-checking) IndexedFrame.take call with
usage of the internal (non-bounds-checking) _gather method. This
avoids a redundant reduction over the indices, since by construction
they are in bounds.

- Closes rapidsai#16908
  • Loading branch information
wence- committed Sep 27, 2024
1 parent 0632538 commit cc1f3b5
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 5 deletions.
27 changes: 22 additions & 5 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import GatherMap
from cudf.core.join._join_helpers import _match_join_keys
from cudf.core.mixins import Reducible, Scannable
from cudf.core.multiindex import MultiIndex
Expand Down Expand Up @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
left_cols = list(self.grouping.keys.drop_duplicates()._columns)
right_cols = list(result_index._columns)
join_keys = [
_match_join_keys(lcol, rcol, "left")
_match_join_keys(lcol, rcol, "inner")
for lcol, rcol in zip(left_cols, right_cols)
]
# TODO: In future, see if we can centralize
# logic else where that has similar patterns.
join_keys = map(list, zip(*join_keys))
_, indices = libcudf.join.join(
*join_keys,
how="left",
# By construction, left and right keys are related by
# a permutation, so we can use an inner join.
left_order, right_order = libcudf.join.join(
*join_keys, how="inner"
)
# left order is some permutation of the ordering we
# want, and right order is a matching gather map for
# the result table. Get the correct order by sorting
# the right gather map.
(right_order,) = libcudf.sort.sort_by_key(
[right_order],
[left_order],
[True],
["first"],
stable=False,
)
result = result._gather(
GatherMap.from_column_unchecked(
right_order, len(result), nullify=False
)
)
result = result.take(indices)

if not self._as_index:
result = result.reset_index()
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import numpy as np
import pytest

import cudf
from cudf.testing import assert_eq


@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"])
def with_nulls(request):
return request.param


@pytest.mark.parametrize("nrows", [30, 300, 300_000])
@pytest.mark.parametrize("nkeys", [1, 2, 4])
def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
key_names = [f"key{key}" for key in range(nkeys)]
key_values = [np.random.randint(100, size=nrows) for _ in key_names]
value = np.random.randint(-100, 100, size=nrows)
df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
if with_nulls:
for key in key_names:
df.loc[df[key] == 1, key] = None
with cudf.option_context("mode.pandas_compatible", True):
got = df.groupby(key_names, sort=False).agg({"value": "sum"})
expect = (
df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"})
)
assert_eq(expect, got, check_index_type=not with_nulls)

0 comments on commit cc1f3b5

Please sign in to comment.