From 01419eb58b47f9490571a27c503daccc53d53cf3 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 7 Dec 2023 19:06:47 +0100 Subject: [PATCH] REFACTOR-#6807: Rename experimental groupby and experimental numpy variables Signed-off-by: Dmitry Chigarev --- .../actions/run-core-tests/group_3/action.yml | 2 +- .../experimental/reshuffling_groupby.rst | 22 +-- modin/config/envvars.py | 166 ++++++++++++++++- modin/config/pubsub.py | 80 +++++++- modin/config/test/test_envvars.py | 174 +++++++++++++++++- modin/core/storage_formats/pandas/groupby.py | 6 +- .../storage_formats/pandas/query_compiler.py | 18 +- modin/numpy/arr.py | 2 +- modin/pandas/base.py | 4 +- modin/pandas/series.py | 8 +- modin/pandas/test/test_groupby.py | 18 +- .../storage_formats/pandas/test_internals.py | 17 +- modin/utils.py | 4 +- setup.cfg | 2 +- 14 files changed, 466 insertions(+), 57 deletions(-) diff --git a/.github/actions/run-core-tests/group_3/action.yml b/.github/actions/run-core-tests/group_3/action.yml index 578673326f9..38d69ced09b 100644 --- a/.github/actions/run-core-tests/group_3/action.yml +++ b/.github/actions/run-core-tests/group_3/action.yml @@ -19,6 +19,6 @@ runs: shell: bash -l {0} - run: | echo "::group::Running experimental groupby tests (group 3)..." 
- MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py + MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py echo "::endgroup::" shell: bash -l {0} diff --git a/docs/flow/modin/experimental/reshuffling_groupby.rst b/docs/flow/modin/experimental/reshuffling_groupby.rst index 3265197bc15..e6a297a2fc4 100644 --- a/docs/flow/modin/experimental/reshuffling_groupby.rst +++ b/docs/flow/modin/experimental/reshuffling_groupby.rst @@ -1,19 +1,19 @@ -Reshuffling GroupBy -""""""""""""""""""" +Range-partitioning GroupBy +"""""""""""""""""""""""""" -The experimental GroupBy implementation utilizes Modin's reshuffling mechanism that gives an +The range-partitioning GroupBy implementation utilizes Modin's reshuffling mechanism that gives an ability to build range partitioning over a Modin DataFrame. -In order to enable/disable this new implementation you have to specify ``cfg.ExperimentalGroupbyImpl`` +In order to enable/disable this new implementation you have to specify ``cfg.RangePartitioningGroupbyImpl`` :doc:`configuration variable: ` .. code-block:: ipython - In [4]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True) + In [4]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True) In [5]: # past this point, Modin will always use the new reshuffling groupby implementation - In [6]: cfg.ExperimentalGroupbyImpl.put(False) + In [6]: cfg.RangePartitioningGroupbyImpl.put(False) In [7]: # past this point, Modin won't use reshuffling groupby implementation anymore @@ -32,7 +32,7 @@ The reshuffling implementation appears to be quite efficient when compared to ol In [6]: %timeit df.groupby("col0").nunique() # old full-axis implementation Out[6]: # 2.73 s ± 28.8 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each) - In [7]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True) + In [7]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True) In [8]: %timeit df.groupby("col0").nunique() # new reshuffling implementation Out[8]: # 595 ms ± 61.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) @@ -51,7 +51,7 @@ have too few unique values (and thus fewer units of parallelization): In [6]: %timeit df.groupby("col0").sum() # old TreeReduce implementation Out[6]: # 155 ms ± 5.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [7]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True) + In [7]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True) In [8]: %timeit df.groupby("col0").sum() # new reshuffling implementation Out[8]: # 230 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) @@ -60,15 +60,15 @@ We're still looking for a heuristic that would be able to automatically switch t for each groupby case, but for now, we're offering to play with this switch on your own to see which implementation works best for your particular case. -The new experimental groupby does not yet support all of the pandas API and falls back to older +The new range-partitioning groupby does not yet support all of the pandas API and falls back to older implementation with the respective warning if it meets an unsupported case: .. code-block:: python - In [14]: import modin.config as cfg; cfg.ExperimentalGroupbyImpl.put(True) + In [14]: import modin.config as cfg; cfg.RangePartitioningGroupbyImpl.put(True) In [15]: df.groupby(level=0).sum() - Out[15]: # UserWarning: Can't use experimental reshuffling groupby implementation because of: + Out[15]: # UserWarning: Can't use reshuffling groupby implementation because of: ... # Reshuffling groupby is only supported when grouping on a column(s) of the same frame. ... # https://github.com/modin-project/modin/issues/5926 ... 
# Falling back to a TreeReduce implementation. diff --git a/modin/config/envvars.py b/modin/config/envvars.py index da35bbaff00..4158e91f708 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -18,12 +18,19 @@ import sys import warnings from textwrap import dedent -from typing import Any, Optional +from typing import Any, Optional, cast from packaging import version from pandas.util._decorators import doc # type: ignore[attr-defined] -from .pubsub import _TYPE_PARAMS, ExactStr, Parameter, ValueSource +from .pubsub import ( + _TYPE_PARAMS, + _UNSET, + DeprecationDescriptor, + ExactStr, + Parameter, + ValueSource, +) class EnvironmentVariable(Parameter, type=str, abstract=True): @@ -67,6 +74,81 @@ def get_help(cls) -> str: return help +class EnvWithSibilings( + EnvironmentVariable, + # we have to pass anything here in order to derive from 'EnvironmentVariable', + # this doesn't force child classes to have 'str' type, they actually can be any type + type=str, +): + """Ensure values synchronization between sibling parameters.""" + + _update_sibling = True + + @classmethod + def _sibling(cls) -> type["EnvWithSibilings"]: + """Return a sibling parameter.""" + raise NotImplementedError() + + @classmethod + def get(cls) -> Any: + """ + Get parameter's value and ensure that it's equal to the sibling's value. 
+ + Returns + ------- + Any + """ + if cls._sibling()._value is _UNSET and cls._value is _UNSET: + old_v: Any + new_v: Any + try: + old_v = cls._sibling()._get_raw_from_config() + except KeyError: + old_v = _UNSET + try: + new_v = cls._get_raw_from_config() + except KeyError: + new_v = _UNSET + if old_v is not _UNSET and new_v is _UNSET: + if not _TYPE_PARAMS[cls.type].verify(old_v): + raise ValueError(f"Unsupported raw value: {old_v}") + old_v = _TYPE_PARAMS[cls.type].decode(old_v) + cls._sibling()._value = old_v + cls._sibling()._value_source = ValueSource.GOT_FROM_CFG_SOURCE + + cls._value = old_v + cls._value_source = ValueSource.GOT_FROM_CFG_SOURCE + return cls._value + res = super().get() + cls._sibling()._value = res + cls._sibling()._value_source = cls._value_source + return res + return super().get() + + @classmethod + def put(cls, value: Any) -> None: + """ + Set a new value to this parameter as well as to its sibling. + + Parameters + ---------- + value : Any + """ + super().put(value) + # avoid getting into an infinite recursion + if cls._update_sibling: + cls._update_sibling = False + try: + with warnings.catch_warnings(): + # filter potential future warnings of the sibling + warnings.filterwarnings("ignore", category=FutureWarning) + cls._sibling().put(value) + except BaseException: + pass + finally: + cls._update_sibling = True + + class IsDebug(EnvironmentVariable, type=bool): """Force Modin engine to be "Python" unless specified by $MODIN_ENGINE.""" @@ -621,16 +703,44 @@ class GithubCI(EnvironmentVariable, type=bool): default = False -class ExperimentalNumPyAPI(EnvironmentVariable, type=bool): - """Set to true to use Modin's experimental NumPy API.""" +class NumpyOnModin(EnvWithSibilings, type=bool): + """Set to true to use Modin's implementation of NumPy API.""" + + varname = "NUMPY_ON_MODIN" + default = False + + @classmethod + def _sibling(cls) -> type[EnvWithSibilings]: + """Get a parameter sibling.""" + return ExperimentalNumPyAPI + + +class 
ExperimentalNumPyAPI(EnvWithSibilings, type=bool): + """ + Set to true to use Modin's implementation of NumPy API. + + This parameter is deprecated. Use ``NumpyOnModin`` instead. + """ varname = "MODIN_EXPERIMENTAL_NUMPY_API" default = False + @classmethod + def _sibling(cls) -> type[EnvWithSibilings]: + """Get a parameter sibling.""" + return NumpyOnModin + + +# Let the parameter's handling logic know that this variable is deprecated and that +# we should raise respective warnings +ExperimentalNumPyAPI._deprecation_descriptor = DeprecationDescriptor( + ExperimentalNumPyAPI, NumpyOnModin +) -class ExperimentalGroupbyImpl(EnvironmentVariable, type=bool): + +class RangePartitioningGroupbyImpl(EnvWithSibilings, type=bool): """ - Set to true to use Modin's experimental group by implementation. + Set to true to use Modin's range-partitioning group by implementation. Experimental groupby is implemented using a range-partitioning technique, note that it may not always work better than the original Modin's TreeReduce @@ -638,9 +748,37 @@ class ExperimentalGroupbyImpl(EnvironmentVariable, type=bool): of Modin's documentation: TODO: add a link to the section once it's written. """ + varname = "MODIN_RANGE_PARTITIONING_GROUPBY" + default = False + + @classmethod + def _sibling(cls) -> type[EnvWithSibilings]: + """Get a parameter sibling.""" + return ExperimentalGroupbyImpl + + +class ExperimentalGroupbyImpl(EnvWithSibilings, type=bool): + """ + Set to true to use Modin's range-partitioning group by implementation. + + This parameter is deprecated. Use ``RangePartitioningGroupbyImpl`` instead. 
+ """ + varname = "MODIN_EXPERIMENTAL_GROUPBY" default = False + @classmethod + def _sibling(cls) -> type[EnvWithSibilings]: + """Get a parameter sibling.""" + return RangePartitioningGroupbyImpl + + +# Let the parameter's handling logic know that this variable is deprecated and that +# we should raise respective warnings +ExperimentalGroupbyImpl._deprecation_descriptor = DeprecationDescriptor( + ExperimentalGroupbyImpl, RangePartitioningGroupbyImpl +) + class CIAWSSecretAccessKey(EnvironmentVariable, type=str): """Set to AWS_SECRET_ACCESS_KEY when running mock S3 tests for Modin in GitHub CI.""" @@ -704,12 +842,28 @@ def _check_vars() -> None: } found_names = {name for name in os.environ if name.startswith("MODIN_")} unknown = found_names - valid_names + deprecated = { + obj.varname: obj + for obj in globals().values() + if isinstance(obj, type) + and issubclass(obj, EnvironmentVariable) + and not obj.is_abstract + and obj._deprecation_descriptor is not None + } + found_deprecated = found_names & deprecated.keys() if unknown: warnings.warn( f"Found unknown environment variable{'s' if len(unknown) > 1 else ''}," + f" please check {'their' if len(unknown) > 1 else 'its'} spelling: " + ", ".join(sorted(unknown)) ) + for depr_var in found_deprecated: + warnings.warn( + cast( + DeprecationDescriptor, deprecated[depr_var]._deprecation_descriptor + ).deprecation_message(use_envvar_names=True), + FutureWarning, + ) _check_vars() diff --git a/modin/config/pubsub.py b/modin/config/pubsub.py index a0bff312ec6..8e148271841 100644 --- a/modin/config/pubsub.py +++ b/modin/config/pubsub.py @@ -13,9 +13,76 @@ """Module houses ``Parameter`` class - base class for all configs.""" +import warnings from collections import defaultdict from enum import IntEnum -from typing import Any, Callable, DefaultDict, NamedTuple, Optional, Tuple +from typing import ( + TYPE_CHECKING, + Any, + Callable, + DefaultDict, + NamedTuple, + Optional, + Tuple, + cast, +) + +if TYPE_CHECKING: + from 
.envvars import EnvironmentVariable + + +class DeprecationDescriptor: + """ + Describe deprecated parameter. + + Parameters + ---------- + parameter : type[Parameter] + Deprecated parameter. + new_parameter : type[Parameter], optional + If there's a replacement parameter for the deprecated one, specify it here. + when_removed : str, optional + If known, the exact release when the deprecated parameter is planned to be removed. + """ + + def __init__( + self, + parameter: type["Parameter"], + new_parameter: Optional[type["Parameter"]] = None, + when_removed: Optional[str] = None, + ): + self._parameter: type["Parameter"] = parameter + self._new_parameter: Optional[type["Parameter"]] = new_parameter + self._when_removed: str = "a future" if when_removed is None else when_removed + + def deprecation_message(self, use_envvar_names: bool = False) -> str: + """ + Generate a message to be used in a warning raised when using the deprecated parameter. + + Parameters + ---------- + use_envvar_names : bool, default: False + Whether to use environment variable names in the warning. If ``True``, both + ``self._parameter`` and ``self._new_parameter`` have to be a type of ``EnvironmentVariable``. + + Returns + ------- + str + """ + name = ( + cast("EnvironmentVariable", self._parameter).varname + if use_envvar_names + else self._parameter.__name__ + ) + msg = f"'{name}' is deprecated and will be removed in {self._when_removed} version." + if self._new_parameter is not None: + new_name = ( + cast("EnvironmentVariable", self._new_parameter).varname + if use_envvar_names + else self._new_parameter.__name__ + ) + msg += f" Use '{new_name}' instead." + return msg class TypeDescriptor(NamedTuple): @@ -134,6 +201,8 @@ class Parameter(object): _value_source : Optional[ValueSource] Source of the ``Parameter`` value, should be set by ``ValueSource``. + _deprecation_descriptor : Optional[DeprecationDescriptor] + Indicate whether this parameter is deprecated. 
""" choices: Optional[Tuple[str, ...]] = None @@ -144,6 +213,7 @@ class Parameter(object): _value: Any = _UNSET _subs: list = [] _once: DefaultDict[Any, list] = defaultdict(list) + _deprecation_descriptor: Optional[DeprecationDescriptor] = None @classmethod def _get_raw_from_config(cls) -> str: @@ -254,6 +324,10 @@ def get(cls) -> Any: Any Decoded and verified config value. """ + if cls._deprecation_descriptor is not None: + warnings.warn( + cls._deprecation_descriptor.deprecation_message(), FutureWarning + ) if cls._value is _UNSET: # get the value from env try: @@ -278,6 +352,10 @@ def put(cls, value: Any) -> None: value : Any Config value to set. """ + if cls._deprecation_descriptor is not None: + warnings.warn( + cls._deprecation_descriptor.deprecation_message(), FutureWarning + ) cls._check_callbacks(cls._put_nocallback(value)) cls._value_source = ValueSource.SET_BY_USER diff --git a/modin/config/test/test_envvars.py b/modin/config/test/test_envvars.py index a52b2ed5595..ed681d4dd50 100644 --- a/modin/config/test/test_envvars.py +++ b/modin/config/test/test_envvars.py @@ -12,12 +12,35 @@ # governing permissions and limitations under the License. import os +import warnings import pytest from packaging import version import modin.config as cfg -from modin.config.envvars import EnvironmentVariable, ExactStr, _check_vars +from modin.config.envvars import ( + _UNSET, + EnvironmentVariable, + ExactStr, + Parameter, + _check_vars, +) + + +def reset_vars(*vars: tuple[Parameter]): + """ + Reset value for the passed parameters. 
+ + Parameters + ---------- + *vars : tuple[Parameter] + """ + for var in vars: + var._value = _UNSET + try: + del os.environ[var.varname] + except KeyError: + pass @pytest.fixture @@ -105,3 +128,152 @@ def test_hdk_envvar(): assert params["enable_union"] == 4 assert params["enable_thrift_logs"] == 5 assert params["enable_lazy_dict_materialization"] == 6 + + +@pytest.mark.parametrize( + "deprecated_var, new_var", + [ + (cfg.ExperimentalGroupbyImpl, cfg.RangePartitioningGroupbyImpl), + (cfg.ExperimentalNumPyAPI, cfg.NumpyOnModin), + ], +) +def test_deprecated_bool_vars_warnings(deprecated_var, new_var): + """Test that deprecated parameters are raising `FutureWarnings` and their replacements don't.""" + old_depr_val = deprecated_var.get() + old_new_var = new_var.get() + + try: + reset_vars(deprecated_var, new_var) + with pytest.warns(FutureWarning): + deprecated_var.get() + + with pytest.warns(FutureWarning): + deprecated_var.put(False) + + os.environ[deprecated_var.varname] = "1" + with pytest.warns(FutureWarning): + _check_vars() + del os.environ[deprecated_var.varname] + + # check that the new var doesn't raise any warnings + reset_vars(deprecated_var, new_var) + with warnings.catch_warnings(): + warnings.simplefilter("error") + new_var.get() + new_var.put(False) + os.environ[new_var.varname] = "1" + _check_vars() + del os.environ[new_var.varname] + finally: + deprecated_var.put(old_depr_val) + new_var.put(old_new_var) + + +@pytest.mark.parametrize( + "deprecated_var, new_var", + [ + (cfg.ExperimentalGroupbyImpl, cfg.RangePartitioningGroupbyImpl), + (cfg.ExperimentalNumPyAPI, cfg.NumpyOnModin), + ], +) +@pytest.mark.parametrize("get_depr_first", [True, False]) +def test_deprecated_bool_vars_equals(deprecated_var, new_var, get_depr_first): + """ + Test that deprecated parameters always have values equal to the new replacement parameters. 
+ + Parameters + ---------- + deprecated_var : EnvironmentVariable + new_var : EnvironmentVariable + get_depr_first : bool + Defines an order in which the ``.get()`` method should be called when comparing values. + If ``True``: get deprecated value at first ``deprecated_var.get() == new_var.get() == value``. + If ``False``: get new value at first ``new_var.get() == deprecated_var.get() == value``. + The logic of the ``.get()`` method depends on which parameter was initialized first, + that's why it's worth testing both cases. + """ + old_depr_val = deprecated_var.get() + old_new_var = new_var.get() + + def get_values(): + return ( + (deprecated_var.get(), new_var.get()) + if get_depr_first + else (new_var.get(), deprecated_var.get()) + ) + + try: + # case1: initializing the value using 'deprecated_var' + reset_vars(deprecated_var, new_var) + deprecated_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + new_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + new_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + deprecated_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + # case2: initializing the value using 'new_var' + reset_vars(deprecated_var, new_var) + new_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + deprecated_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + deprecated_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + new_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + # case3: initializing the value using 'deprecated_var' 
with env variable + reset_vars(deprecated_var, new_var) + os.environ[deprecated_var.varname] = "True" + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + new_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + new_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + deprecated_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + # case4: initializing the value using 'new_var' with env variable + reset_vars(deprecated_var, new_var) + os.environ[new_var.varname] = "True" + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + deprecated_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + + deprecated_var.put(True) + val1, val2 = get_values() + assert val1 == val2 == True # noqa: E712 ('obj == True' comparison) + + new_var.put(False) + val1, val2 = get_values() + assert val1 == val2 == False # noqa: E712 ('obj == False' comparison) + finally: + deprecated_var.put(old_depr_val) + new_var.put(old_new_var) diff --git a/modin/core/storage_formats/pandas/groupby.py b/modin/core/storage_formats/pandas/groupby.py index b89fcfc44c7..be87ae14eeb 100644 --- a/modin/core/storage_formats/pandas/groupby.py +++ b/modin/core/storage_formats/pandas/groupby.py @@ -16,7 +16,7 @@ import numpy as np import pandas -from modin.config import ExperimentalGroupbyImpl +from modin.config import RangePartitioningGroupbyImpl from modin.core.dataframe.algebra import GroupByReduce from modin.error_message import ErrorMessage from modin.utils import hashable @@ -93,7 +93,7 @@ def build_qc_method(cls, agg_name, finalizer_fn=None): ) def method(query_compiler, *args, **kwargs): - if ExperimentalGroupbyImpl.get(): + if 
RangePartitioningGroupbyImpl.get(): try: if finalizer_fn is not None: raise NotImplementedError( @@ -104,7 +104,7 @@ def method(query_compiler, *args, **kwargs): ) except NotImplementedError as e: ErrorMessage.warn( - f"Can't use experimental reshuffling groupby implementation because of: {e}" + f"Can't use reshuffling groupby implementation because of: {e}" + "\nFalling back to a TreeReduce implementation." ) return map_reduce_method(query_compiler, *args, **kwargs) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 4f2f957bb58..8e9ce1a36f0 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -43,7 +43,7 @@ from pandas.core.indexing import check_bool_indexer from pandas.errors import DataError, MergeError -from modin.config import CpuCount, ExperimentalGroupbyImpl +from modin.config import CpuCount, RangePartitioningGroupbyImpl from modin.core.dataframe.algebra import ( Binary, Fold, @@ -3521,7 +3521,7 @@ def groupby_nth( return result def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False): - if ExperimentalGroupbyImpl.get(): + if RangePartitioningGroupbyImpl.get(): try: return self._groupby_shuffle( by=by, @@ -3534,7 +3534,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) except NotImplementedError as e: ErrorMessage.warn( - f"Can't use experimental reshuffling groupby implementation because of: {e}" + f"Can't use reshuffling groupby implementation because of: {e}" + "\nFalling back to a TreeReduce implementation." 
) @@ -3592,7 +3592,7 @@ def groupby_size( agg_kwargs, drop=False, ): - if ExperimentalGroupbyImpl.get(): + if RangePartitioningGroupbyImpl.get(): try: return self._groupby_shuffle( by=by, @@ -3605,7 +3605,7 @@ def groupby_size( ) except NotImplementedError as e: ErrorMessage.warn( - f"Can't use experimental reshuffling groupby implementation because of: {e}" + f"Can't use reshuffling groupby implementation because of: {e}" + "\nFalling back to a TreeReduce implementation." ) @@ -3962,9 +3962,9 @@ def groupby_agg( ) # 'group_wise' means 'groupby.apply()'. We're certain that range-partitioning groupby - # always works better for '.apply()', so we're using it regardless of the 'ExperimentalGroupbyImpl' + # always works better for '.apply()', so we're using it regardless of the 'RangePartitioningGroupbyImpl' # value - if how == "group_wise" or ExperimentalGroupbyImpl.get(): + if how == "group_wise" or RangePartitioningGroupbyImpl.get(): try: return self._groupby_shuffle( by=by, @@ -3980,11 +3980,11 @@ def groupby_agg( # if a user wants to use range-partitioning groupby explicitly, then we should print a visible # warning to them on a failure, otherwise we're only logging it message = ( - f"Can't use experimental reshuffling groupby implementation because of: {e}" + f"Can't use reshuffling groupby implementation because of: {e}" + "\nFalling back to a full-axis implementation." ) get_logger().info(message) - if ExperimentalGroupbyImpl.get(): + if RangePartitioningGroupbyImpl.get(): ErrorMessage.warn(message) if isinstance(agg_func, dict) and GroupbyReduceImpl.has_impl_for(agg_func): diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 6bf7b3186c5..d6dfd19e701 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -166,7 +166,7 @@ def __init__( ): self._siblings = [] ErrorMessage.single_warning( - "Using Modin's new NumPy API. To convert from a Modin object to a NumPy array, either turn off the ExperimentalNumPyAPI flag, or use `modin.utils.to_numpy`." 
+ "Using Modin's new NumPy API. To convert from a Modin object to a NumPy array, either turn off the NumpyOnModin flag, or use `modin.pandas.io.to_numpy`." ) if isinstance(object, array): _query_compiler = object._query_compiler.copy() diff --git a/modin/pandas/base.py b/modin/pandas/base.py index aedeed9555c..8df685c2758 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3371,9 +3371,9 @@ def to_numpy( """ Convert the `BasePandasDataset` to a NumPy array or a Modin wrapper for NumPy array. """ - from modin.config import ExperimentalNumPyAPI + from modin.config import NumpyOnModin - if ExperimentalNumPyAPI.get(): + if NumpyOnModin.get(): from ..numpy.arr import array return array(self, copy=copy) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 849bd0ff656..8867ff4b71e 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -495,9 +495,9 @@ def values(self): # noqa: RT01, D200 data = self.to_numpy() if isinstance(self.dtype, pd.CategoricalDtype): - from modin.config import ExperimentalNumPyAPI + from modin.config import NumpyOnModin - if ExperimentalNumPyAPI.get(): + if NumpyOnModin.get(): data = data._to_numpy() data = pd.Categorical(data, dtype=self.dtype) return data @@ -1913,9 +1913,9 @@ def to_numpy( """ Return the NumPy ndarray representing the values in this Series or Index. 
""" - from modin.config import ExperimentalNumPyAPI + from modin.config import NumpyOnModin - if not ExperimentalNumPyAPI.get(): + if not NumpyOnModin.get(): return ( super(Series, self) .to_numpy( diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 2b491372a92..40d1bcce7bb 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -22,7 +22,7 @@ import modin.pandas as pd from modin.config import NPartitions, StorageFormat -from modin.config.envvars import ExperimentalGroupbyImpl, IsRayCluster +from modin.config.envvars import IsRayCluster, RangePartitioningGroupbyImpl from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, @@ -282,7 +282,7 @@ def test_mixed_dtypes_groupby(as_index): eval_max(modin_groupby, pandas_groupby) eval_len(modin_groupby, pandas_groupby) eval_sum(modin_groupby, pandas_groupby) - if not ExperimentalGroupbyImpl.get(): + if not RangePartitioningGroupbyImpl.get(): # `.group` fails with experimental groupby # https://github.com/modin-project/modin/issues/6083 eval_ngroup(modin_groupby, pandas_groupby) @@ -1178,7 +1178,7 @@ def sort_index_if_experimental_groupby(*dfs): the experimental implementation changes the order of rows for that: https://github.com/modin-project/modin/issues/5924 """ - if ExperimentalGroupbyImpl.get(): + if RangePartitioningGroupbyImpl.get(): return tuple(df.sort_index() for df in dfs) return dfs @@ -1439,7 +1439,7 @@ def test(grp): def eval_groups(modin_groupby, pandas_groupby): for k, v in modin_groupby.groups.items(): assert v.equals(pandas_groupby.groups[k]) - if ExperimentalGroupbyImpl.get(): + if RangePartitioningGroupbyImpl.get(): # `.get_group()` doesn't work correctly with experimental groupby: # https://github.com/modin-project/modin/issues/6093 return @@ -1750,7 +1750,7 @@ def 
test_agg_func_None_rename(by_and_agg_dict, as_index): False, marks=pytest.mark.skipif( get_current_execution() == "BaseOnPython" - or ExperimentalGroupbyImpl.get(), + or RangePartitioningGroupbyImpl.get(), reason="See Pandas issue #39103", ), ), @@ -2883,8 +2883,8 @@ def perform(lib): @pytest.mark.parametrize( "modify_config", [ - {ExperimentalGroupbyImpl: True, IsRayCluster: True}, - {ExperimentalGroupbyImpl: True, IsRayCluster: False}, + {RangePartitioningGroupbyImpl: True, IsRayCluster: True}, + {RangePartitioningGroupbyImpl: True, IsRayCluster: False}, ], indirect=True, ) @@ -2947,7 +2947,7 @@ def func3(group): @pytest.mark.parametrize( - "modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True + "modify_config", [{RangePartitioningGroupbyImpl: True}], indirect=True ) def test_reshuffling_groupby_on_strings(modify_config): # reproducer from https://github.com/modin-project/modin/issues/6509 @@ -2964,7 +2964,7 @@ def test_reshuffling_groupby_on_strings(modify_config): @pytest.mark.parametrize( - "modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True + "modify_config", [{RangePartitioningGroupbyImpl: True}], indirect=True ) def test_groupby_apply_series_result(modify_config): # reproducer from the issue: diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index c0eb02ca971..c1e17fc2e7a 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -20,7 +20,12 @@ import pytest import modin.pandas as pd -from modin.config import Engine, ExperimentalGroupbyImpl, MinPartitionSize, NPartitions +from modin.config import ( + Engine, + MinPartitionSize, + NPartitions, + RangePartitioningGroupbyImpl, +) from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions from modin.core.dataframe.pandas.metadata import ( 
@@ -1130,7 +1135,7 @@ def test_setitem_unhashable_preserve_dtypes(): @pytest.mark.parametrize( - "modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True + "modify_config", [{RangePartitioningGroupbyImpl: True}], indirect=True ) def test_groupby_size_shuffling(modify_config): # verifies that 'groupby.size()' works with reshuffling implementation @@ -2404,7 +2409,7 @@ def test_groupby_index_dtype(self): assert res_dtypes._known_dtypes["a"] == np.dtype("int64") # case 2: ExperimentalImpl impl, Series as an output of groupby - ExperimentalGroupbyImpl.put(True) + RangePartitioningGroupbyImpl.put(True) try: df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").size().reset_index(name="new_name") @@ -2412,7 +2417,7 @@ def test_groupby_index_dtype(self): assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") finally: - ExperimentalGroupbyImpl.put(False) + RangePartitioningGroupbyImpl.put(False) # case 3: MapReduce impl, DataFrame as an output of groupby df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) @@ -2422,7 +2427,7 @@ def test_groupby_index_dtype(self): assert res_dtypes._known_dtypes["a"] == np.dtype("int64") # case 4: ExperimentalImpl impl, DataFrame as an output of groupby - ExperimentalGroupbyImpl.put(True) + RangePartitioningGroupbyImpl.put(True) try: df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) res = df.groupby("a").sum().reset_index() @@ -2430,7 +2435,7 @@ def test_groupby_index_dtype(self): assert "a" in res_dtypes._known_dtypes assert res_dtypes._known_dtypes["a"] == np.dtype("int64") finally: - ExperimentalGroupbyImpl.put(False) + RangePartitioningGroupbyImpl.put(False) # case 5: FullAxis impl, DataFrame as an output of groupby df = pd.DataFrame({"a": [1, 2, 2], "b": [3, 4, 5]}) diff --git a/modin/utils.py b/modin/utils.py index c42a0516fa3..dfe5e616607 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -48,7 +48,7 @@ ) from modin._version import get_versions -from 
modin.config import Engine, ExperimentalNumPyAPI, IsExperimental, StorageFormat +from modin.config import Engine, IsExperimental, NumpyOnModin, StorageFormat T = TypeVar("T") """Generic type parameter""" @@ -527,7 +527,7 @@ def to_numpy( if isinstance(modin_obj, SupportsPrivateToNumPy): return modin_obj._to_numpy() array = modin_obj.to_numpy() - if ExperimentalNumPyAPI.get(): + if NumpyOnModin.get(): array = array._to_numpy() return array diff --git a/setup.cfg b/setup.cfg index 8f479523bec..689f3ba32ad 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,7 @@ tag_prefix = parentdir_prefix = modin- [tool:pytest] -addopts = --cov-config=setup.cfg --cov=modin --cov-append --cov-report= +addopts = xfail_strict=true markers = xfail_executions