diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py index 59d4d4196aa..8e2e4de062d 100644 --- a/modin/core/dataframe/algebra/default2pandas/groupby.py +++ b/modin/core/dataframe/algebra/default2pandas/groupby.py @@ -13,6 +13,7 @@ """Module houses default GroupBy functions builder class.""" +import warnings from typing import Any import pandas @@ -59,7 +60,9 @@ def is_transformation_kernel(agg_func: Any) -> bool: @classmethod def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01 """Call .groupby() on passed `df`.""" - return df.groupby(*args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + return df.groupby(*args, **kwargs) @classmethod def validate_by(cls, by): @@ -563,7 +566,9 @@ def _call_groupby(cls, df, *args, **kwargs): # noqa: PR01 # In second case surrounding logic will supplement grouping columns, # so we need to drop them after grouping is over; our originally # selected column is always the first, so use it - return df.groupby(*args, **kwargs)[df.columns[0]] + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + return df.groupby(*args, **kwargs)[df.columns[0]] class GroupByDefault(DefaultMethod): diff --git a/modin/core/dataframe/pandas/interchange/dataframe_protocol/column.py b/modin/core/dataframe/pandas/interchange/dataframe_protocol/column.py index e68460ba2b9..23eea6bf872 100644 --- a/modin/core/dataframe/pandas/interchange/dataframe_protocol/column.py +++ b/modin/core/dataframe/pandas/interchange/dataframe_protocol/column.py @@ -127,7 +127,7 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: if self._dtype_cache is not None: return self._dtype_cache - dtype = self._col.dtypes[0] + dtype = self._col.dtypes.iloc[0] if isinstance(dtype, pandas.CategoricalDtype): pandas_series = self._col.to_pandas().squeeze(axis=1) diff --git a/modin/core/dataframe/pandas/partitioning/axis_partition.py b/modin/core/dataframe/pandas/partitioning/axis_partition.py index 70a20918d6b..30ed90c4321 100644 --- a/modin/core/dataframe/pandas/partitioning/axis_partition.py +++ b/modin/core/dataframe/pandas/partitioning/axis_partition.py @@ -13,6 +13,8 @@ """The module defines base interface for an axis partition of a Modin DataFrame.""" +import warnings + import numpy as np import pandas @@ -418,7 +420,9 @@ def deploy_axis_func( A list of pandas DataFrames. 
""" dataframe = pandas.concat(list(partitions), axis=axis, copy=False) - result = func(dataframe, *f_args, **f_kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = func(dataframe, *f_args, **f_kwargs) if num_splits == 1: # If we're not going to split the result, we don't need to specify @@ -497,7 +501,9 @@ def deploy_func_between_two_axis_partitions( for i in range(1, len(other_shape)) ] rt_frame = pandas.concat(combined_axis, axis=axis ^ 1, copy=False) - result = func(lt_frame, rt_frame, *f_args, **f_kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = func(lt_frame, rt_frame, *f_args, **f_kwargs) return split_result_of_axis_func_pandas(axis, num_splits, result) @classmethod diff --git a/modin/core/execution/dask/common/utils.py b/modin/core/execution/dask/common/utils.py index cf544ab343c..3eda2a50375 100644 --- a/modin/core/execution/dask/common/utils.py +++ b/modin/core/execution/dask/common/utils.py @@ -23,6 +23,7 @@ Memory, NPartitions, ) +from modin.core.execution.utils import set_env from modin.error_message import ErrorMessage @@ -32,6 +33,14 @@ def initialize_dask(): try: client = default_client() + + def _disable_warnings(): + import warnings + + warnings.simplefilter("ignore", category=FutureWarning) + + client.run(_disable_warnings) + except ValueError: from distributed import Client @@ -47,7 +56,11 @@ def initialize_dask(): num_cpus = CpuCount.get() memory_limit = Memory.get() worker_memory_limit = memory_limit // num_cpus if memory_limit else "auto" - client = Client(n_workers=num_cpus, memory_limit=worker_memory_limit) + + # when the client is initialized, environment variables are inherited + with set_env(PYTHONWARNINGS="ignore::FutureWarning"): + client = Client(n_workers=num_cpus, memory_limit=worker_memory_limit) + if GithubCI.get(): # set these keys to run tests that write to the mock s3 service. 
this seems # to be the way to pass environment variables to the workers: diff --git a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py index 059f3ae2286..307c8f186b7 100644 --- a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py +++ b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py @@ -13,6 +13,8 @@ """The module defines interface for a partition with pandas storage format and Python engine.""" +import warnings + from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition from modin.core.execution.python.common import PythonWrapper @@ -116,7 +118,9 @@ def call_queue_closure(data, call_queue): self._data = call_queue_closure(self._data, self.call_queue) self.call_queue = [] - return self.__constructor__(func(self._data.copy(), *args, **kwargs)) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + return self.__constructor__(func(self._data.copy(), *args, **kwargs)) def drain_call_queue(self): """Execute all operations stored in the call queue on the object wrapped by this partition.""" diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index 9473ec0fa97..3b168c6f044 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -36,6 +36,7 @@ StorageFormat, ValueSource, ) +from modin.core.execution.utils import set_env from modin.error_message import ErrorMessage from .engine_wrapper import RayWrapper @@ -82,7 +83,10 @@ def initialize_ray( # the `pandas` module has been fully imported inside of each process before # any execution begins: # https://github.com/modin-project/modin/pull/4603 - env_vars = {"__MODIN_AUTOIMPORT_PANDAS__": "1"} + env_vars = { + "__MODIN_AUTOIMPORT_PANDAS__": "1", + "PYTHONWARNINGS": "ignore::FutureWarning", + } if GithubCI.get(): # need these to write parquet to the moto service mocking s3. env_vars.update( @@ -143,9 +147,8 @@ def initialize_ray( # time and doesn't enforce us with any overhead that Ray's native `runtime_env` # is usually causing. 
You can visit this gh-issue for more info: # https://github.com/modin-project/modin/issues/5157#issuecomment-1500225150 - for key, value in env_vars.items(): - os.environ[key] = value - ray.init(**ray_init_kwargs) + with set_env(**env_vars): + ray.init(**ray_init_kwargs) if StorageFormat.get() == "Cudf": from modin.core.execution.ray.implementations.cudf_on_ray.partitioning import ( @@ -163,12 +166,7 @@ def initialize_ray( runtime_env_vars = ray.get_runtime_context().runtime_env.get("env_vars", {}) for varname, varvalue in env_vars.items(): if str(runtime_env_vars.get(varname, "")) != str(varvalue): - if is_cluster or ( - # Here we relax our requirements for a non-cluster case allowing for the `env_vars` - # to be set at least as a process environment variable - not is_cluster - and os.environ.get(varname, "") != str(varvalue) - ): + if is_cluster: ErrorMessage.single_warning( "When using a pre-initialized Ray cluster, please ensure that the runtime env " + f"sets environment variable {varname} to {varvalue}" diff --git a/modin/core/execution/unidist/common/utils.py b/modin/core/execution/unidist/common/utils.py index 7de30d560af..db9648382b5 100644 --- a/modin/core/execution/unidist/common/utils.py +++ b/modin/core/execution/unidist/common/utils.py @@ -45,7 +45,8 @@ def initialize_unidist(): unidist.init() """, ) - + # TODO: allow unidist to inherit env variables on initialization + # with set_env(PYTHONWARNINGS="ignore::FutureWarning"): unidist.init() num_cpus = sum(v["CPU"] for v in unidist.cluster_resources().values()) diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py index 05c0538be4f..000eb59c7f6 100644 --- a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py +++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py @@ -13,6 +13,8 @@ """Module houses class that wraps data (block partition) and its metadata.""" +import warnings + import unidist from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition @@ -351,12 +353,16 @@ def _apply_func(partition, func, *args, **kwargs): # pragma: no cover destructuring it causes a performance penalty. """ try: - result = func(partition, *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = func(partition, *args, **kwargs) # Sometimes Arrow forces us to make a copy of an object before we operate on it. We # don't want the error to propagate to the user, and we want to avoid copying unless # we absolutely have to. except ValueError: - result = func(partition.copy(), *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = func(partition.copy(), *args, **kwargs) return ( result, len(result) if hasattr(result, "__len__") else 0, @@ -393,12 +399,16 @@ def _apply_list_of_funcs(call_queue, partition): # pragma: no cover args = deserialize(f_args) kwargs = deserialize(f_kwargs) try: - partition = func(partition, *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + partition = func(partition, *args, **kwargs) # Sometimes Arrow forces us to make a copy of an object before we operate on it. We # don't want the error to propagate to the user, and we want to avoid copying unless # we absolutely have to. 
except ValueError: - partition = func(partition.copy(), *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + partition = func(partition.copy(), *args, **kwargs) return ( partition, diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py index c1abfe9e749..f662c7a8d81 100644 --- a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py +++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/virtual_partition.py @@ -13,6 +13,8 @@ """Module houses classes responsible for storing a virtual partition and applying a function to it.""" +import warnings + import pandas import unidist @@ -310,7 +312,9 @@ def _deploy_unidist_func( Unidist functions are not detected by codecov (thus pragma: no cover). """ f_args = deserialize(f_args) - result = deployer(axis, f_to_deploy, f_args, f_kwargs, *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = deployer(axis, f_to_deploy, f_args, f_kwargs, *args, **kwargs) if not extract_metadata: return result ip = unidist.get_ip() diff --git a/modin/core/execution/utils.py b/modin/core/execution/utils.py new file mode 100644 index 00000000000..7245da3c094 --- /dev/null +++ b/modin/core/execution/utils.py @@ -0,0 +1,31 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""General utils for execution module.""" + +import contextlib +import os + + +@contextlib.contextmanager +def set_env(**environ): + """ + Temporarily set the process environment variables. 
+ """ + old_environ = os.environ.copy() + os.environ.update(environ) + try: + yield + finally: + os.environ.clear() + os.environ.update(old_environ) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 17c45b0dac7..ec31f98bd25 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -18,6 +18,7 @@ """ import abc +import warnings from typing import Hashable, List, Optional import numpy as np @@ -164,7 +165,9 @@ def default_to_pandas(self, pandas_op, *args, **kwargs): args = try_cast_to_pandas(args) kwargs = try_cast_to_pandas(kwargs) - result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) if isinstance(result, (tuple, list)): return [self.__wrap_in_qc(obj) for obj in result] return self.__wrap_in_qc(result) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index d8d00d5bb9f..9feb32f4919 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2721,7 +2721,7 @@ def getitem_array(self, key): # here we check for a subset of bool indexers only to simplify the code; # there could (potentially) be more of those, but we assume the most frequent # ones are just of bool dtype - if len(key.dtypes) == 1 and is_bool_dtype(key.dtypes[0]): + if len(key.dtypes) == 1 and is_bool_dtype(key.dtypes.iloc[0]): self.__validate_bool_indexer(key.index) return self.__getitem_bool(key, broadcast=True, dtypes="copy") diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py index c888effb8e2..a8d44423b3d 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py @@ -2779,8 +2779,8 @@ def test_dict(self): at = mdf._query_compiler._modin_frame._partitions[0][0].get() assert len(at.column(0).chunks) == nchunks - mdt = mdf.dtypes[0] - pdt = pdf.dtypes[0] + mdt = mdf.dtypes.iloc[0] + pdt = pdf.dtypes.iloc[0] assert mdt == "category" assert isinstance(mdt, pandas.CategoricalDtype) assert str(mdt) == str(pdt) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 4047434fec4..5bf30b4fed9 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -503,23 +503,25 @@ def _default_to_pandas(self, op, *args, **kwargs): args = try_cast_to_pandas(args) kwargs = try_cast_to_pandas(kwargs) pandas_obj = self._to_pandas() - if callable(op): - result = op(pandas_obj, *args, **kwargs) - elif isinstance(op, str): - # The inner `getattr` is ensuring that we are treating this object (whether - # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr` - # will get the operation (`op`) from the pandas version of the class and run - # it on the object after we have converted it to pandas. 
- attr = getattr(self._pandas_class, op) - if isinstance(attr, property): - result = getattr(pandas_obj, op) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + if callable(op): + result = op(pandas_obj, *args, **kwargs) + elif isinstance(op, str): + # The inner `getattr` is ensuring that we are treating this object (whether + # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr` + # will get the operation (`op`) from the pandas version of the class and run + # it on the object after we have converted it to pandas. + attr = getattr(self._pandas_class, op) + if isinstance(attr, property): + result = getattr(pandas_obj, op) + else: + result = attr(pandas_obj, *args, **kwargs) else: - result = attr(pandas_obj, *args, **kwargs) - else: - ErrorMessage.catch_bugs_and_request_email( - failure_condition=True, - extra_log="{} is an unsupported operation".format(op), - ) + ErrorMessage.catch_bugs_and_request_email( + failure_condition=True, + extra_log="{} is an unsupported operation".format(op), + ) # SparseDataFrames cannot be serialized by arrow and cause problems for Modin. # For now we will use pandas. if isinstance(result, type(self)) and not isinstance( diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index cdd78c62e2c..b9c4f2109a7 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -226,7 +226,13 @@ def ffill(self, limit=None): + "which can be impacted by pandas bug https://github.com/pandas-dev/pandas/issues/43412 " + "on dataframes with duplicated indices" ) - return self.fillna(limit=limit, method="ffill") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*fillna with 'method' is deprecated.*", + category=FutureWarning, + ) + return self.fillna(limit=limit, method="ffill") def sem(self, ddof=1, numeric_only=False): return self._wrap_aggregation( @@ -369,7 +375,7 @@ def max(self, numeric_only=False, min_count=-1, engine=None, engine_kwargs=None) ) def idxmax(self, axis=lib.no_default, skipna=True, numeric_only=False): - if axis is lib.no_default: + if axis is not lib.no_default: self._deprecate_axis(axis, "idxmax") # default behaviour for aggregations; for the reference see # `_op_via_apply` func in pandas==2.0.2 @@ -382,7 +388,7 @@ def idxmax(self, axis=lib.no_default, skipna=True, numeric_only=False): ) def idxmin(self, axis=lib.no_default, skipna=True, numeric_only=False): - if axis is lib.no_default: + if axis is not lib.no_default: self._deprecate_axis(axis, "idxmin") # default behaviour for aggregations; for the reference see # `_op_via_apply` func in pandas==2.0.2 @@ -663,6 +669,11 @@ def apply(self, func, *args, **kwargs): def dtypes(self): if self._axis == 1: raise ValueError("Cannot call dtypes on groupby with axis=1") + warnings.warn( + f"{type(self).__name__}.dtypes is deprecated and will be removed in " + + "a future version. 
Check the dtypes on the base object instead", + FutureWarning, + ) return self._check_index( self._wrap_aggregation( type(self._query_compiler).groupby_dtypes, @@ -825,7 +836,13 @@ def bfill(self, limit=None): + "which can be impacted by pandas bug https://github.com/pandas-dev/pandas/issues/43412 " + "on dataframes with duplicated indices" ) - return self.fillna(limit=limit, method="bfill") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*fillna with 'method' is deprecated.*", + category=FutureWarning, + ) + return self.fillna(limit=limit, method="bfill") def prod(self, numeric_only=False, min_count=0): return self._wrap_aggregation( @@ -867,7 +884,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) and isinstance(func, BuiltinFunctionType) and func.__name__ in dir(self) ): - func = func.__name__ + func_name = func.__name__ + warnings.warn( + f"The provided callable {func} is currently using " + + f"{type(self).__name__}.{func_name}. In a future version of pandas, " + + "the provided callable will be used directly. To keep current " + + f"behavior pass the string {func_name} instead.", + category=FutureWarning, + ) + func = func_name do_relabel = None if isinstance(func, dict) or func is None: @@ -1237,9 +1262,17 @@ def fillna( limit=None, downcast=lib.no_default, ): - if axis is lib.no_default: + if axis is not lib.no_default: self._deprecate_axis(axis, "fillna") + if method is not None: + warnings.warn( + f"{type(self).__name__}.fillna with 'method' is deprecated and " + + "will raise in a future version. Use obj.ffill() or obj.bfill() " + + "instead.", + FutureWarning, + ) + # default behaviour for aggregations; for the reference see # `_op_via_apply` func in pandas==2.0.2 if axis is None or axis is lib.no_default: diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index f37178c5635..dd86c8966cc 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -845,7 +845,7 @@ def test_convert_dtypes_5653(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 - assert modin_df.dtypes[0] == "string" + assert modin_df.dtypes.iloc[0] == "string" @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 41a1d41e171..2406cf20497 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -17,6 +17,7 @@ import numpy as np import pandas +import pandas._libs.lib as lib import pytest import modin.pandas as pd @@ -58,7 +59,49 @@ # have too many such instances. # TODO(https://github.com/modin-project/modin/issues/3655): catch all instances # of defaulting to pandas. 
-pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)
+pytestmark = [
+    pytest.mark.filterwarnings(default_to_pandas_ignore_string),
+    # Promote FutureWarnings to errors to make sure all of them are considered
+    pytest.mark.filterwarnings("error::FutureWarning"),
+    # Ignore the known FutureWarnings below to keep the test output clean
+    pytest.mark.filterwarnings(
+        "ignore:DataFrame.groupby with axis=1 is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:DataFrameGroupBy.dtypes is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:DataFrameGroupBy.diff with axis=1 is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:DataFrameGroupBy.pct_change with axis=1 is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:The 'fill_method' and 'limit' keywords in (DataFrame|DataFrameGroupBy).pct_change are deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:DataFrameGroupBy.shift with axis=1 is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:(DataFrameGroupBy|SeriesGroupBy|DataFrame|Series).fillna with 'method' is deprecated:FutureWarning"
+    ),
+    # FIXME: these cases are inconsistent between modin and pandas
+    pytest.mark.filterwarnings(
+        "ignore:A grouping was used that is not in the columns of the DataFrame and so was excluded from the result:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:The default of observed=False is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:.*DataFrame.idxmax with all-NA values, or any-NA and skipna=False, is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:.*DataFrame.idxmin with all-NA values, or any-NA and skipna=False, is deprecated:FutureWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:.*In a future version of pandas, the provided callable will be used directly.*:FutureWarning"
+    ),
+]
 
 
 def modin_groupby_equals_pandas(modin_groupby, pandas_groupby):
@@ -414,17 +457,17 @@ def maybe_get_columns(df, by):
     eval_ndim(modin_groupby, pandas_groupby)
     if not check_df_columns_have_nans(modin_df, by):
         # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs
-        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum(axis=0))
-        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax(axis=0))
-        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin(axis=0))
-        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod(axis=0))
+        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum())
+        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax())
+        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin())
+        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod())
     eval_general(modin_groupby, pandas_groupby, lambda df: df.cumcount())
 
     eval_general(
         modin_groupby,
         pandas_groupby,
         lambda df: df.pct_change(
-            periods=2, fill_method="pad", limit=1, freq=None, axis=1
+            periods=2, fill_method="bfill", limit=1, freq=None, axis=1
         ),
         modin_df_almost_equals_pandas,
     )
@@ -1151,7 +1194,7 @@ def eval_ndim(modin_groupby, pandas_groupby):
     assert modin_groupby.ndim == pandas_groupby.ndim
 
 
-def eval_cumsum(modin_groupby, pandas_groupby, axis=0, numeric_only=False):
+def eval_cumsum(modin_groupby, pandas_groupby, axis=lib.no_default, numeric_only=False):
     df_equals(
         *sort_index_if_experimental_groupby(
             modin_groupby.cumsum(axis=axis, numeric_only=numeric_only),
@@ -1160,7 +1203,7 @@ def eval_cumsum(modin_groupby,
pandas_groupby, axis=0, numeric_only=False): ) -def eval_cummax(modin_groupby, pandas_groupby, axis=0, numeric_only=False): +def eval_cummax(modin_groupby, pandas_groupby, axis=lib.no_default, numeric_only=False): df_equals( *sort_index_if_experimental_groupby( modin_groupby.cummax(axis=axis, numeric_only=numeric_only), @@ -1169,7 +1212,7 @@ def eval_cummax(modin_groupby, pandas_groupby, axis=0, numeric_only=False): ) -def eval_cummin(modin_groupby, pandas_groupby, axis=0, numeric_only=False): +def eval_cummin(modin_groupby, pandas_groupby, axis=lib.no_default, numeric_only=False): df_equals( *sort_index_if_experimental_groupby( modin_groupby.cummin(axis=axis, numeric_only=numeric_only), @@ -1250,7 +1293,9 @@ def eval_median(modin_groupby, pandas_groupby, numeric_only=False): ) -def eval_cumprod(modin_groupby, pandas_groupby, axis=0, numeric_only=False): +def eval_cumprod( + modin_groupby, pandas_groupby, axis=lib.no_default, numeric_only=False +): df_equals( *sort_index_if_experimental_groupby( modin_groupby.cumprod(numeric_only=numeric_only), @@ -1587,8 +1632,8 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): df_equals(md_grp._default_to_pandas(lambda df: df.sum()), pd_grp.sum()) -@pytest.mark.parametrize("groupby_axis", [0, 1]) -@pytest.mark.parametrize("shift_axis", [0, 1]) +@pytest.mark.parametrize("groupby_axis", [lib.no_default, 1]) +@pytest.mark.parametrize("shift_axis", [lib.no_default, 1]) @pytest.mark.parametrize("groupby_sort", [True, False]) def test_shift_freq(groupby_axis, shift_axis, groupby_sort): pandas_df = pandas.DataFrame( @@ -1751,9 +1796,7 @@ def col3(x): [ "quantile", "mean", - pytest.param( - "sum", marks=pytest.mark.skip("See Modin issue #2255 for details") - ), + "sum", "median", "unique", "cumprod", @@ -2884,3 +2927,161 @@ def test_reshuffling_groupby_on_strings(modify_config): eval_general( modin_df.groupby("col1"), pandas_df.groupby("col1"), lambda grp: grp.mean() ) + + +### TEST GROUPBY WARNINGS ### + + +def test_groupby_axis_1_warning(): + data = { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + } + modin_df, pandas_df = create_test_dfs(data) + + with pytest.warns( + FutureWarning, match="DataFrame.groupby with axis=1 is deprecated" + ): + modin_df.groupby(by="col1", axis=1) + with pytest.warns( + FutureWarning, match="DataFrame.groupby with axis=1 is deprecated" + ): + pandas_df.groupby(by="col1", axis=1) + + +def test_groupby_dtypes_warning(): + data = { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + } + modin_df, pandas_df = create_test_dfs(data) + modin_groupby = modin_df.groupby(by="col1") + pandas_groupby = pandas_df.groupby(by="col1") + + with pytest.warns(FutureWarning, match="DataFrameGroupBy.dtypes is deprecated"): + modin_groupby.dtypes + with pytest.warns(FutureWarning, match="DataFrameGroupBy.dtypes is deprecated"): + pandas_groupby.dtypes + + +def test_groupby_diff_axis_1_warning(): + data = { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + } + modin_df, pandas_df = create_test_dfs(data) + modin_groupby = modin_df.groupby(by="col1") + pandas_groupby = pandas_df.groupby(by="col1") + + with pytest.warns( + FutureWarning, match="DataFrameGroupBy.diff with axis=1 is deprecated" + ): + modin_groupby.diff(axis=1) + with pytest.warns( + FutureWarning, match="DataFrameGroupBy.diff with axis=1 is deprecated" + ): + pandas_groupby.diff(axis=1) + + +def test_groupby_pct_change_axis_1_warning(): + data = { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + } + modin_df, pandas_df = create_test_dfs(data) + modin_groupby = 
modin_df.groupby(by="col1")
+    pandas_groupby = pandas_df.groupby(by="col1")
+
+    with pytest.warns(
+        FutureWarning, match="DataFrameGroupBy.pct_change with axis=1 is deprecated"
+    ):
+        modin_groupby.pct_change(axis=1)
+    with pytest.warns(
+        FutureWarning, match="DataFrameGroupBy.pct_change with axis=1 is deprecated"
+    ):
+        pandas_groupby.pct_change(axis=1)
+
+
+def test_groupby_pct_change_parameters_warning():
+    data = {
+        "col1": [0, 3, 2, 3],
+        "col2": [4, 1, 6, 7],
+    }
+    modin_df, pandas_df = create_test_dfs(data)
+    modin_groupby = modin_df.groupby(by="col1")
+    pandas_groupby = pandas_df.groupby(by="col1")
+
+    with pytest.warns(
+        FutureWarning,
+        match="The 'fill_method' and 'limit' keywords in (DataFrame|DataFrameGroupBy).pct_change are deprecated",
+    ):
+        modin_groupby.pct_change(fill_method="bfill", limit=1)
+    with pytest.warns(
+        FutureWarning,
+        match="The 'fill_method' and 'limit' keywords in (DataFrame|DataFrameGroupBy).pct_change are deprecated",
+    ):
+        pandas_groupby.pct_change(fill_method="bfill", limit=1)
+
+
+def test_groupby_shift_axis_1_warning():
+    data = {
+        "col1": [0, 3, 2, 3],
+        "col2": [4, 1, 6, 7],
+    }
+    modin_df, pandas_df = create_test_dfs(data)
+    modin_groupby = modin_df.groupby(by="col1")
+    pandas_groupby = pandas_df.groupby(by="col1")
+
+    with pytest.warns(
+        FutureWarning,
+        match="DataFrameGroupBy.shift with axis=1 is deprecated",
+    ):
+        pandas_groupby.shift(axis=1, fill_value=777)
+    with pytest.warns(
+        FutureWarning,
+        match="DataFrameGroupBy.shift with axis=1 is deprecated",
+    ):
+        modin_groupby.shift(axis=1, fill_value=777)
+
+
+def test_groupby_fillna_method_warning():
+    data = {
+        "col1": [0, 3, 2, 3],
+        "col2": [4, None, 6, None],
+    }
+    modin_df, pandas_df = create_test_dfs(data)
+    modin_groupby = modin_df.groupby(by="col1")
+    pandas_groupby = pandas_df.groupby(by="col1")
+
+    with pytest.warns(
+        FutureWarning,
+        match="DataFrameGroupBy.fillna with 'method' is deprecated",
+    ):
+        modin_groupby.fillna(method="ffill")
+    with pytest.warns(
+        FutureWarning,
+        match="DataFrameGroupBy.fillna with 'method' is deprecated",
+    ):
+        pandas_groupby.fillna(method="ffill")
+
+
+def test_groupby_agg_provided_callable_warning():
+    data = {
+        "col1": [0, 3, 2, 3],
+        "col2": [4, 1, 6, 7],
+    }
+    modin_df, pandas_df = create_test_dfs(data)
+    modin_groupby = modin_df.groupby(by="col1")
+    pandas_groupby = pandas_df.groupby(by="col1")
+
+    for func in (sum, max):
+        with pytest.warns(
+            FutureWarning,
+            match="In a future version of pandas, the provided callable will be used directly",
+        ):
+            modin_groupby.agg(func)
+        with pytest.warns(
+            FutureWarning,
+            match="In a future version of pandas, the provided callable will be used directly",
+        ):
+            pandas_groupby.agg(func)
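
Note on the new `set_env` helper: it relies on worker processes inheriting `os.environ` from the parent at spawn time, which is why the patch wraps `Client(...)` and `ray.init(...)` in it. A minimal usage sketch, assuming only the standard library (the assertion is illustrative, not part of the patch):

```python
import os

from modin.core.execution.utils import set_env

# The previous environment is restored when the block exits, even on error.
with set_env(PYTHONWARNINGS="ignore::FutureWarning"):
    # Worker processes spawned here (e.g. Dask or Ray workers) inherit the
    # variable and therefore start with FutureWarnings filtered out.
    assert os.environ["PYTHONWARNINGS"] == "ignore::FutureWarning"
```

The `PYTHONWARNINGS` value uses the interpreter's standard `action::category` filter syntax, equivalent to running Python with `-W ignore::FutureWarning`.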
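
For reference, a sketch of the warning-suppression pattern this patch applies around remote kernel calls, written with hypothetical names (`call_kernel`, `func`). The key point is that the filter is scoped to the `with` block, so FutureWarnings triggered by user code outside the block still reach the user:

```python
import warnings


def call_kernel(func, df, *args, **kwargs):
    # `catch_warnings` snapshots the global filter state and restores it on
    # exit, so the "ignore" entry below only affects this one call.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        # FutureWarnings emitted by pandas inside the kernel are swallowed
        # here instead of being printed once per partition on the workers.
        return func(df, *args, **kwargs)
```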