From b3f89c701247dafaa704ab075cc93c32847f8347 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 15 May 2023 18:11:18 -0500 Subject: [PATCH] Introduce `pandas_compatible` option in `cudf` (#13241) This PR adds `mode.pandas_compatible` option in `cudf`, which if set to true will try to mimic pandas behavior as much as possible if there is an API inconsistency with pandas. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13241 --- python/cudf/cudf/api/extensions/__init__.py | 5 +++- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/core/indexed_frame.py | 6 ++++- python/cudf/cudf/core/series.py | 3 ++- python/cudf/cudf/options.py | 15 +++++++++++ python/cudf/cudf/tests/test_groupby.py | 29 +++++++++++++++++++++ 6 files changed, 57 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index eeb5dcdb32a..6118b6bf620 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +from pandas.api.extensions import no_default from cudf.api.extensions.accessor import ( register_dataframe_accessor, @@ -7,6 +9,7 @@ ) __all__ = [ + "no_default", "register_dataframe_accessor", "register_index_accessor", "register_series_accessor", diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2aa370ac8e5..afd2a59037f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -42,6 +42,7 @@ import cudf.core.common from cudf import _lib as libcudf from cudf._typing import ColumnLike, Dtype, NotImplementedType +from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -4066,7 +4067,7 @@ def groupby( axis=0, level=None, as_index=True, - sort=False, + sort=no_default, group_keys=False, squeeze=False, observed=True, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3c4d8d84c34..7141958f62d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -38,6 +38,7 @@ Dtype, NotImplementedType, ) +from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, @@ -3914,12 +3915,15 @@ def groupby( axis=0, level=None, as_index=True, - sort=False, + sort=no_default, group_keys=False, squeeze=False, observed=True, dropna=True, ): + if sort is no_default: + sort = cudf.get_option("mode.pandas_compatible") + if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6d4caebb8ad..aa755e1c878 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -26,6 +26,7 @@ NotImplementedType, ScalarLike, ) +from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, @@ -3294,7 +3295,7 @@ def groupby( axis=0, level=None, as_index=True, - sort=False, + sort=no_default, group_keys=False, squeeze=False, observed=True, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index a375d8236d6..cbcd3fcb595 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -304,3 +304,18 @@ def _integer_and_none_validator(val): ), _integer_validator, ) + +_register_option( + "mode.pandas_compatible", + False, + textwrap.dedent( + """ + If set to `False`, retains `cudf` specific behavior. + If set to `True`, enables pandas compatibility mode, + which will try to match pandas API behaviors in case of + any inconsistency. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 9a72b85dd13..e5199146fef 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. import collections +import contextlib import datetime import itertools import operator @@ -38,6 +39,17 @@ _index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"} +# TODO: Make use of set_option context manager +# once https://github.com/rapidsai/cudf/issues/12736 +# is resolved. +@contextlib.contextmanager +def with_pandas_compat(on): + original_compat_setting = cudf.get_option("mode.pandas_compatible") + cudf.set_option("mode.pandas_compatible", on) + yield + cudf.set_option("mode.pandas_compatible", original_compat_setting) + + def assert_groupby_results_equal( expect, got, sort=True, as_index=True, by=None, **kwargs ): @@ -3044,6 +3056,23 @@ def test_groupby_by_index_names(index_names): ) +@with_pandas_compat(on=True) +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) +def test_group_by_pandas_compat(groups): + df = cudf.DataFrame( + { + "a": [1, 3, 2, 3, 3], + "b": ["x", "a", "y", "z", "a"], + "c": [10, 13, 11, 12, 12], + } + ) + pdf = df.to_pandas() + + assert_eq(pdf.groupby(groups).max(), df.groupby(groups).max()) + + class TestSample: @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) def index(self, request):