Introduce pandas_compatible option in cudf (#13241)

This PR adds a `mode.pandas_compatible` option to `cudf`. When it is set to `True`, cudf tries to mimic pandas behavior as closely as possible wherever its API behavior is inconsistent with pandas. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: #13241
rapidsai · May 15, 2023 · b3f89c7 · b3f89c7
1 parent 1581773
commit b3f89c7
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 4 deletions.
diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+
+from pandas.api.extensions import no_default
 
 from cudf.api.extensions.accessor import (
     register_dataframe_accessor,
@@ -7,6 +9,7 @@
 )
 
 __all__ = [
+    "no_default",
     "register_dataframe_accessor",
     "register_index_accessor",
     "register_series_accessor",

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -42,6 +42,7 @@
 import cudf.core.common
 from cudf import _lib as libcudf
 from cudf._typing import ColumnLike, Dtype, NotImplementedType
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
@@ -4066,7 +4067,7 @@ def groupby(
         axis=0,
         level=None,
         as_index=True,
-        sort=False,
+        sort=no_default,
         group_keys=False,
         squeeze=False,
         observed=True,

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -38,6 +38,7 @@
     Dtype,
     NotImplementedType,
 )
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     is_bool_dtype,
@@ -3914,12 +3915,15 @@ def groupby(
         axis=0,
         level=None,
         as_index=True,
-        sort=False,
+        sort=no_default,
         group_keys=False,
         squeeze=False,
         observed=True,
         dropna=True,
     ):
+        if sort is no_default:
+            sort = cudf.get_option("mode.pandas_compatible")
+
         if axis not in (0, "index"):
             raise NotImplementedError("axis parameter is not yet implemented")
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -26,6 +26,7 @@
     NotImplementedType,
     ScalarLike,
 )
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     _is_scalar_or_zero_d_array,
@@ -3294,7 +3295,7 @@ def groupby(
         axis=0,
         level=None,
         as_index=True,
-        sort=False,
+        sort=no_default,
         group_keys=False,
         squeeze=False,
         observed=True,

diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
@@ -304,3 +304,18 @@ def _integer_and_none_validator(val):
     ),
     _integer_validator,
 )
+
+# Register the global "mode.pandas_compatible" switch. It defaults to False
+# (cudf-specific behavior); per the help text below, the only valid values
+# are True and False.
+_register_option(
+    "mode.pandas_compatible",
+    False,
+    textwrap.dedent(
+        """
+        If set to `False`, retains `cudf` specific behavior.
+        If set to `True`, enables pandas compatibility mode,
+        which will try to match pandas API behaviors in case of
+        any inconsistency.
+        \tValid values are True or False. Default is False.
+    """
+    ),
+    _make_contains_validator([False, True]),
+)
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2018-2023, NVIDIA CORPORATION.
 
 import collections
+import contextlib
 import datetime
 import itertools
 import operator
@@ -38,6 +39,17 @@
 _index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"}
 
 
+# TODO: Make use of set_option context manager
+# once https://github.com/rapidsai/cudf/issues/12736
+# is resolved.
+@contextlib.contextmanager
+def with_pandas_compat(on):
+    """Temporarily set the ``mode.pandas_compatible`` option to ``on``.
+
+    Works both as a context manager and as a decorator. The value the
+    option had on entry is restored on exit, including when the wrapped
+    code raises, so a failing test cannot leak the setting into the tests
+    that run after it.
+
+    Parameters
+    ----------
+    on : bool
+        Value to assign to ``mode.pandas_compatible`` for the duration
+        of the managed block.
+    """
+    original_compat_setting = cudf.get_option("mode.pandas_compatible")
+    cudf.set_option("mode.pandas_compatible", on)
+    try:
+        yield
+    finally:
+        # Restore unconditionally: without ``finally`` an exception raised
+        # inside the block would skip the reset entirely.
+        cudf.set_option("mode.pandas_compatible", original_compat_setting)
+
+
 def assert_groupby_results_equal(
     expect, got, sort=True, as_index=True, by=None, **kwargs
 ):
@@ -3044,6 +3056,23 @@ def test_groupby_by_index_names(index_names):
     )
 
 
+@with_pandas_compat(on=True)
+@pytest.mark.parametrize(
+    "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]]
+)
+def test_group_by_pandas_compat(groups):
+    """Default groupby results should match pandas in compat mode."""
+    data = {
+        "a": [1, 3, 2, 3, 3],
+        "b": ["x", "a", "y", "z", "a"],
+        "c": [10, 13, 11, 12, 12],
+    }
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    # No explicit ``sort`` is passed, so the comparison exercises the
+    # default that is resolved from ``mode.pandas_compatible``.
+    expected = pdf.groupby(groups).max()
+    actual = gdf.groupby(groups).max()
+    assert_eq(expected, actual)
+
+
 class TestSample:
     @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"])
     def index(self, request):