REFACTOR-modin-project#6807: Rename experimental groupby and experimental numpy variables

Signed-off-by: Dmitry Chigarev <[email protected]>
dchigarev · Dec 7, 2023 · 01419eb · 01419eb
1 parent a405217
commit 01419eb
Show file tree

Hide file tree

Showing 14 changed files with 466 additions and 57 deletions.
diff --git a/.github/actions/run-core-tests/group_3/action.yml b/.github/actions/run-core-tests/group_3/action.yml
@@ -19,6 +19,6 @@ runs:
         shell: bash -l {0}
       - run: |
           echo "::group::Running experimental groupby tests (group 3)..."
-          MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
+          MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
           echo "::endgroup::"
         shell: bash -l {0}
diff --git a/docs/flow/modin/experimental/reshuffling_groupby.rst b/docs/flow/modin/experimental/reshuffling_groupby.rst
@@ -1,19 +1,19 @@
-Reshuffling GroupBy
-"""""""""""""""""""
+Range-partitioning GroupBy
+""""""""""""""""""""""""""
 
-The experimental GroupBy implementation utilizes Modin's reshuffling mechanism that gives an
+The range-partitioning GroupBy implementation utilizes Modin's reshuffling mechanism that gives an
 ability to build range partitioning over a Modin DataFrame.
 
-In order to enable/disable this new implementation you have to specify ``cfg.ExperimentalGroupbyImpl``
+In order to enable/disable this new implementation you have to specify ``cfg.RangePartitioningGroupbyImpl``
 :doc:`configuration variable: </flow/modin/config>`
 
 .. code-block:: ipython
 
-    In [4]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True)
+    In [4]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True)
 
     In [5]: # past this point, Modin will always use the new reshuffling groupby implementation
 
-    In [6]: cfg.ExperimentalGroupbyImpl.put(False)
+    In [6]: cfg.RangePartitioningGroupbyImpl.put(False)
 
     In [7]: # past this point, Modin won't use reshuffling groupby implementation anymore
 
@@ -32,7 +32,7 @@ The reshuffling implementation appears to be quite efficient when compared to ol
     In [6]: %timeit df.groupby("col0").nunique() # old full-axis implementation
     Out[6]: # 2.73 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 
-    In [7]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True)
+    In [7]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True)
 
     In [8]: %timeit df.groupby("col0").nunique() # new reshuffling implementation
     Out[8]: # 595 ms ± 61.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@@ -51,7 +51,7 @@ have too few unique values (and thus fewer units of parallelization):
     In [6]: %timeit df.groupby("col0").sum() # old TreeReduce implementation
     Out[6]: # 155 ms ± 5.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
-    In [7]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True)
+    In [7]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True)
 
     In [8]: %timeit df.groupby("col0").sum() # new reshuffling implementation
     Out[8]: # 230 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@@ -60,15 +60,15 @@ We're still looking for a heuristic that would be able to automatically switch t
 for each groupby case, but for now, we're offering to play with this switch on your own to see which
 implementation works best for your particular case.
 
-The new experimental groupby does not yet support all of the pandas API and falls back to older
+The new range-partitioning groupby does not yet support all of the pandas API and falls back to older
 implementation with the respective warning if it meets an unsupported case:
 
 .. code-block:: python
 
-    In [14]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True)
+    In [14]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True)
 
     In [15]: df.groupby(level=0).sum()
-    Out[15]: # UserWarning: Can't use experimental reshuffling groupby implementation because of: 
+    Out[15]: # UserWarning: Can't use reshuffling groupby implementation because of:
         ...  # Reshuffling groupby is only supported when grouping on a column(s) of the same frame.
         ...  # https://github.com/modin-project/modin/issues/5926
         ...  # Falling back to a TreeReduce implementation.
diff --git a/modin/config/envvars.py b/modin/config/envvars.py
@@ -18,12 +18,19 @@
 import sys
 import warnings
 from textwrap import dedent
-from typing import Any, Optional
+from typing import Any, Optional, cast
 
 from packaging import version
 from pandas.util._decorators import doc  # type: ignore[attr-defined]
 
-from .pubsub import _TYPE_PARAMS, ExactStr, Parameter, ValueSource
+from .pubsub import (
+    _TYPE_PARAMS,
+    _UNSET,
+    DeprecationDescriptor,
+    ExactStr,
+    Parameter,
+    ValueSource,
+)
 
 
 class EnvironmentVariable(Parameter, type=str, abstract=True):
@@ -67,6 +74,81 @@ def get_help(cls) -> str:
         return help
 
 
+class EnvWithSibilings(
+    EnvironmentVariable,
+    # we have to pass anything here in order to derive from 'EnvironmentVariable',
+    # this doesn't force child classes to have 'str' type, they actually can be any type
+    type=str,
+):
+    """Ensure values synchronization between sibling parameters."""
+
+    _update_sibling = True
+
+    @classmethod
+    def _sibling(cls) -> type["EnvWithSibilings"]:
+        """Return a sibling parameter."""
+        raise NotImplementedError()
+
+    @classmethod
+    def get(cls) -> Any:
+        """
+        Get parameter's value and ensure that it's equal to the sibling's value.
+
+        Returns
+        -------
+        Any
+        """
+        if cls._sibling()._value is _UNSET and cls._value is _UNSET:
+            old_v: Any
+            new_v: Any
+            try:
+                old_v = cls._sibling()._get_raw_from_config()
+            except KeyError:
+                old_v = _UNSET
+            try:
+                new_v = cls._get_raw_from_config()
+            except KeyError:
+                new_v = _UNSET
+            if old_v is not _UNSET and new_v is _UNSET:
+                if not _TYPE_PARAMS[cls.type].verify(old_v):
+                    raise ValueError(f"Unsupported raw value: {old_v}")
+                old_v = _TYPE_PARAMS[cls.type].decode(old_v)
+                cls._sibling()._value = old_v
+                cls._sibling()._value_source = ValueSource.GOT_FROM_CFG_SOURCE
+
+                cls._value = old_v
+                cls._value_source = ValueSource.GOT_FROM_CFG_SOURCE
+                return cls._value
+            res = super().get()
+            cls._sibling()._value = res
+            cls._sibling()._value_source = cls._value_source
+            return res
+        return super().get()
+
+    @classmethod
+    def put(cls, value: Any) -> None:
+        """
+        Set a new value to this parameter as well as to its sibling.
+
+        Parameters
+        ----------
+        value : Any
+        """
+        super().put(value)
+        # avoid getting into an infinite recursion
+        if cls._update_sibling:
+            cls._update_sibling = False
+            try:
+                with warnings.catch_warnings():
+                    # filter potential future warnings of the sibling
+                    warnings.filterwarnings("ignore", category=FutureWarning)
+                    cls._sibling().put(value)
+            except BaseException:
+                pass
+            finally:
+                cls._update_sibling = True
+
+
 class IsDebug(EnvironmentVariable, type=bool):
     """Force Modin engine to be "Python" unless specified by $MODIN_ENGINE."""
 
@@ -621,26 +703,82 @@ class GithubCI(EnvironmentVariable, type=bool):
     default = False
 
 
-class ExperimentalNumPyAPI(EnvironmentVariable, type=bool):
-    """Set to true to use Modin's experimental NumPy API."""
+class NumpyOnModin(EnvWithSibilings, type=bool):
+    """Set to true to use Modin's implementation of NumPy API."""
+
+    varname = "NUMPY_ON_MODIN"
+    default = False
+
+    @classmethod
+    def _sibling(cls) -> type[EnvWithSibilings]:
+        """Get a parameter sibling."""
+        return ExperimentalNumPyAPI
+
+
+class ExperimentalNumPyAPI(EnvWithSibilings, type=bool):
+    """
+    Set to true to use Modin's implementation of NumPy API.
+
+    This parameter is deprecated. Use ``NumpyOnModin`` instead.
+    """
 
     varname = "MODIN_EXPERIMENTAL_NUMPY_API"
     default = False
 
+    @classmethod
+    def _sibling(cls) -> type[EnvWithSibilings]:
+        """Get a parameter sibling."""
+        return NumpyOnModin
+
+
+# Let the parameter's handling logic know that this variable is deprecated and that
+# we should raise respective warnings
+ExperimentalNumPyAPI._deprecation_descriptor = DeprecationDescriptor(
+    ExperimentalNumPyAPI, NumpyOnModin
+)
 
-class ExperimentalGroupbyImpl(EnvironmentVariable, type=bool):
+
+class RangePartitioningGroupbyImpl(EnvWithSibilings, type=bool):
     """
-    Set to true to use Modin's experimental group by implementation.
+    Set to true to use Modin's range-partitioning group by implementation.
 
     Experimental groupby is implemented using a range-partitioning technique,
     note that it may not always work better than the original Modin's TreeReduce
     and FullAxis implementations. For more information visit the according section
     of Modin's documentation: TODO: add a link to the section once it's written.
     """
 
+    varname = "MODIN_RANGE_PARTITIONING_GROUPBY"
+    default = False
+
+    @classmethod
+    def _sibling(cls) -> type[EnvWithSibilings]:
+        """Get a parameter sibling."""
+        return ExperimentalGroupbyImpl
+
+
+class ExperimentalGroupbyImpl(EnvWithSibilings, type=bool):
+    """
+    Set to true to use Modin's range-partitioning group by implementation.
+
+    This parameter is deprecated. Use ``RangePartitioningGroupbyImpl`` instead.
+    """
+
     varname = "MODIN_EXPERIMENTAL_GROUPBY"
     default = False
 
+    @classmethod
+    def _sibling(cls) -> type[EnvWithSibilings]:
+        """Get a parameter sibling."""
+        return RangePartitioningGroupbyImpl
+
+
+# Let the parameter's handling logic know that this variable is deprecated and that
+# we should raise respective warnings
+ExperimentalGroupbyImpl._deprecation_descriptor = DeprecationDescriptor(
+    ExperimentalGroupbyImpl, RangePartitioningGroupbyImpl
+)
+
 
 class CIAWSSecretAccessKey(EnvironmentVariable, type=str):
     """Set to AWS_SECRET_ACCESS_KEY when running mock S3 tests for Modin in GitHub CI."""
@@ -704,12 +842,28 @@ def _check_vars() -> None:
     }
     found_names = {name for name in os.environ if name.startswith("MODIN_")}
     unknown = found_names - valid_names
+    deprecated = {
+        obj.varname: obj
+        for obj in globals().values()
+        if isinstance(obj, type)
+        and issubclass(obj, EnvironmentVariable)
+        and not obj.is_abstract
+        and obj._deprecation_descriptor is not None
+    }
+    found_deprecated = found_names & deprecated.keys()
     if unknown:
         warnings.warn(
             f"Found unknown environment variable{'s' if len(unknown) > 1 else ''},"
             + f" please check {'their' if len(unknown) > 1 else 'its'} spelling: "
             + ", ".join(sorted(unknown))
         )
+    for depr_var in found_deprecated:
+        warnings.warn(
+            cast(
+                DeprecationDescriptor, deprecated[depr_var]._deprecation_descriptor
+            ).deprecation_message(use_envvar_names=True),
+            FutureWarning,
+        )
 
 
 _check_vars()