Skip to content

Commit

Permalink
Introduce pandas_compatible option in cudf (#13241)
Browse files Browse the repository at this point in the history
This PR adds a `mode.pandas_compatible` option to `cudf`. When set to `True`, cudf tries to mimic pandas behavior as closely as possible wherever its API would otherwise be inconsistent with pandas.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #13241
  • Loading branch information
galipremsagar authored May 15, 2023
1 parent 1581773 commit b3f89c7
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 4 deletions.
5 changes: 4 additions & 1 deletion python/cudf/cudf/api/extensions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from pandas.api.extensions import no_default

from cudf.api.extensions.accessor import (
register_dataframe_accessor,
Expand All @@ -7,6 +9,7 @@
)

__all__ = [
"no_default",
"register_dataframe_accessor",
"register_index_accessor",
"register_series_accessor",
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import cudf.core.common
from cudf import _lib as libcudf
from cudf._typing import ColumnLike, Dtype, NotImplementedType
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_scalar_or_zero_d_array,
is_bool_dtype,
Expand Down Expand Up @@ -4066,7 +4067,7 @@ def groupby(
axis=0,
level=None,
as_index=True,
sort=False,
sort=no_default,
group_keys=False,
squeeze=False,
observed=True,
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
Dtype,
NotImplementedType,
)
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_bool_dtype,
Expand Down Expand Up @@ -3914,12 +3915,15 @@ def groupby(
axis=0,
level=None,
as_index=True,
sort=False,
sort=no_default,
group_keys=False,
squeeze=False,
observed=True,
dropna=True,
):
if sort is no_default:
sort = cudf.get_option("mode.pandas_compatible")

if axis not in (0, "index"):
raise NotImplementedError("axis parameter is not yet implemented")

Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
NotImplementedType,
ScalarLike,
)
from cudf.api.extensions import no_default
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_scalar_or_zero_d_array,
Expand Down Expand Up @@ -3294,7 +3295,7 @@ def groupby(
axis=0,
level=None,
as_index=True,
sort=False,
sort=no_default,
group_keys=False,
squeeze=False,
observed=True,
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,18 @@ def _integer_and_none_validator(val):
),
_integer_validator,
)

# Register the global "mode.pandas_compatible" switch. Per the help text
# below, it defaults to False (cudf-specific behavior) and the only valid
# values are True and False.
# NOTE(review): the argument order looks like (name, default, description,
# validator) -- confirm against _register_option's signature.
_register_option(
"mode.pandas_compatible",
False,
textwrap.dedent(
"""
If set to `False`, retains `cudf` specific behavior.
If set to `True`, enables pandas compatibility mode,
which will try to match pandas API behaviors in case of
any inconsistency.
\tValid values are True or False. Default is False.
"""
),
_make_contains_validator([False, True]),
)
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

import collections
import contextlib
import datetime
import itertools
import operator
Expand Down Expand Up @@ -38,6 +39,17 @@
_index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"}


# TODO: Make use of set_option context manager
# once https://github.com/rapidsai/cudf/issues/12736
# is resolved.
@contextlib.contextmanager
def with_pandas_compat(on):
    """Temporarily set the ``mode.pandas_compatible`` cudf option.

    Usable as a ``with`` block or, because it is built with
    ``contextlib.contextmanager``, as a function decorator.

    Parameters
    ----------
    on : bool
        Value to assign to ``mode.pandas_compatible`` for the duration
        of the managed block.

    Yields
    ------
    None
    """
    original_compat_setting = cudf.get_option("mode.pandas_compatible")
    cudf.set_option("mode.pandas_compatible", on)
    try:
        yield
    finally:
        # Restore even when the managed body raises (e.g. a failing
        # assertion); otherwise the global option would leak into every
        # test that runs afterwards.
        cudf.set_option("mode.pandas_compatible", original_compat_setting)


def assert_groupby_results_equal(
expect, got, sort=True, as_index=True, by=None, **kwargs
):
Expand Down Expand Up @@ -3044,6 +3056,23 @@ def test_groupby_by_index_names(index_names):
)


@with_pandas_compat(on=True)
@pytest.mark.parametrize(
    "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]
)
def test_group_by_pandas_compat(groups):
    """Check cudf ``groupby(...).max()`` against pandas in compat mode.

    Runs with ``mode.pandas_compatible`` enabled and ``sort`` left
    unspecified, for several single- and multi-column groupings.
    """
    columns = {
        "a": [1, 3, 2, 3, 3],
        "b": ["x", "a", "y", "z", "a"],
        "c": [10, 13, 11, 12, 12],
    }
    gdf = cudf.DataFrame(columns)

    # pandas result first, then cudf, matching the (expected, got) order.
    expected = gdf.to_pandas().groupby(groups).max()
    actual = gdf.groupby(groups).max()

    assert_eq(expected, actual)


class TestSample:
@pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"])
def index(self, request):
Expand Down

0 comments on commit b3f89c7

Please sign in to comment.