From 7bab684e3b553f96357fea07e91385be50f5c2fb Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Tue, 19 Sep 2023 13:34:23 +0800 Subject: [PATCH 1/2] fix --- .../xorbits/_mars/dataframe/groupby/fill.py | 12 +--- python/xorbits/_mars/dataframe/hash_utils.py | 5 +- .../xorbits/_mars/dataframe/missing/fillna.py | 57 ++++++------------- .../dataframe/missing/tests/test_missing.py | 2 - 4 files changed, 22 insertions(+), 54 deletions(-) diff --git a/python/xorbits/_mars/dataframe/groupby/fill.py b/python/xorbits/_mars/dataframe/groupby/fill.py index 7d71a89de..4baa08ea1 100644 --- a/python/xorbits/_mars/dataframe/groupby/fill.py +++ b/python/xorbits/_mars/dataframe/groupby/fill.py @@ -18,7 +18,7 @@ from ... import opcodes from ...core import OutputType -from ...serialization.serializables import AnyField, DictField, Int64Field, StringField +from ...serialization.serializables import AnyField, Int64Field, StringField from ..operands import DataFrameOperand, DataFrameOperandMixin from ..utils import build_empty_df, build_empty_series, parse_index @@ -29,7 +29,6 @@ class GroupByFillOperand(DataFrameOperand, DataFrameOperandMixin): value = AnyField("value", default=None) method = StringField("method", default=None) limit = Int64Field("limit", default=None) - downcast = DictField("downcast", default=None) def _calc_out_dtypes(self, in_groupby): mock_groupby = in_groupby.op.build_mock_groupby() @@ -40,7 +39,6 @@ def _calc_out_dtypes(self, in_groupby): value=self.value, method=self.method, limit=self.limit, - downcast=self.downcast, ) else: result_df = getattr(mock_groupby, func_name)(limit=self.limit) @@ -133,7 +131,6 @@ def execute(cls, ctx, op: "GroupByFillOperand"): value=op.value, method=op.method, limit=op.limit, - downcast=op.downcast, ) else: result = getattr(in_data, func_name)(limit=op.limit) @@ -184,7 +181,7 @@ def bfill(groupby, limit=None): return op(groupby) -def fillna(groupby, value=None, method=None, limit=None, downcast=None): +def fillna(groupby, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method @@ -197,11 +194,8 @@ def fillna(groupby, value=None, method=None, limit=None, downcast=None): limit: int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill - downcast: dict, default None - A dict of item->dtype of what to downcast if possible, - or the string ‘infer’ which will try to downcast to an appropriate equal type return: DataFrame or None """ - op = GroupByFillNa(value=value, method=method, limit=limit, downcast=downcast) + op = GroupByFillNa(value=value, method=method, limit=limit) return op(groupby) diff --git a/python/xorbits/_mars/dataframe/hash_utils.py b/python/xorbits/_mars/dataframe/hash_utils.py index 348c4b16e..0d4211e2d 100644 --- a/python/xorbits/_mars/dataframe/hash_utils.py +++ b/python/xorbits/_mars/dataframe/hash_utils.py @@ -9,10 +9,11 @@ from typing import TYPE_CHECKING, Hashable, Iterable, Iterator, cast import numpy as np +import pandas as pd from pandas._libs import lib from pandas._libs.hashing import hash_object_array from pandas._typing import ArrayLike, npt -from pandas.core.dtypes.common import is_categorical_dtype, is_list_like +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -272,7 +273,7 @@ def hash_array( # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). - if is_categorical_dtype(dtype): + if isinstance(dtype, pd.CategoricalDtype): vals = cast("Categorical", vals) return _hash_categorical(vals, encoding, hash_key) diff --git a/python/xorbits/_mars/dataframe/missing/fillna.py b/python/xorbits/_mars/dataframe/missing/fillna.py index a2a87615e..65bca253b 100644 --- a/python/xorbits/_mars/dataframe/missing/fillna.py +++ b/python/xorbits/_mars/dataframe/missing/fillna.py @@ -41,7 +41,6 @@ class FillNA(DataFrameOperand, DataFrameOperandMixin): _method = StringField("method") _axis = AnyField("axis") _limit = Int64Field("limit") - _downcast = AnyField("downcast") _output_limit = Int64Field("output_limit") @@ -51,7 +50,6 @@ def __init__( method=None, axis=None, limit=None, - downcast=None, output_types=None, output_limit=None, **kw @@ -61,7 +59,6 @@ def __init__( _method=method, _axis=axis, _limit=limit, - _downcast=downcast, _output_types=output_types, _output_limit=output_limit, **kw @@ -83,10 +80,6 @@ def axis(self): def limit(self): return self._limit - @property - def downcast(self): - return self._downcast - def _set_inputs(self, inputs): super()._set_inputs(inputs) if self._method is None and len(inputs) > 1: @@ -123,7 +116,9 @@ def _execute_map(cls, ctx, op): method = op.method filled = input_data.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1) del filled @@ -143,7 +138,9 @@ def _execute_combine(cls, ctx, op): if not summaries: ctx[op.outputs[0].key] = input_data.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) return @@ -158,7 +155,9 @@ def _execute_combine(cls, ctx, op): if is_pandas_2(): concat_df = concat_df.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) else: concat_df.fillna( @@ -166,7 +165,6 @@ def _execute_combine(cls, ctx, op): axis=axis, inplace=True, limit=limit, - downcast=op.downcast, ) ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1) @@ -187,12 +185,9 @@ def execute(cls, ctx, op): method=op.method, axis=op.axis, limit=op.limit, - downcast=op.downcast, ) else: - ctx[op.outputs[0].key] = input_data.fillna( - value=value, downcast=op.downcast - ) + ctx[op.outputs[0].key] = input_data.fillna(value=value) @classmethod def _tile_one_by_one(cls, op): @@ -478,9 +473,7 @@ def __call__(self, a, value_df=None): ) -def fillna( - df, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None -): +def fillna(df, value=None, method=None, axis=None, inplace=False, limit=None): """ Fill NA/NaN values using the specified method. @@ -509,10 +502,6 @@ def fillna( be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). Returns ------- @@ -583,10 +572,6 @@ def fillna( % type(value).__name__ ) - if downcast is not None: - raise NotImplementedError( - 'Currently argument "downcast" is not implemented yet' - ) if limit is not None: raise NotImplementedError('Currently argument "limit" is not implemented yet') @@ -600,7 +585,6 @@ def fillna( method=method, axis=axis, limit=limit, - downcast=downcast, output_types=get_output_types(df), ) out_df = op(df, value_df=value_df) @@ -610,7 +594,7 @@ def fillna( return out_df -def ffill(df, axis=None, inplace=False, limit=None, downcast=None): +def ffill(df, axis=None, inplace=False, limit=None): """ Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. @@ -619,12 +603,10 @@ def ffill(df, axis=None, inplace=False, limit=None, downcast=None): {klass} or None Object with missing values filled or None if ``inplace=True``. """ - return fillna( - df, method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast - ) + return fillna(df, method="ffill", axis=axis, inplace=inplace, limit=limit) -def bfill(df, axis=None, inplace=False, limit=None, downcast=None): +def bfill(df, axis=None, inplace=False, limit=None): """ Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. @@ -633,12 +615,10 @@ def bfill(df, axis=None, inplace=False, limit=None, downcast=None): {klass} or None Object with missing values filled or None if ``inplace=True``. """ - return fillna( - df, method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast - ) + return fillna(df, method="bfill", axis=axis, inplace=inplace, limit=limit) -def index_fillna(index, value=None, downcast=None): +def index_fillna(index, value=None): """ Fill NA/NaN values with the specified value. @@ -647,10 +627,6 @@ def index_fillna(index, value=None, downcast=None): value : scalar Scalar value to use to fill holes (e.g. 0). This value cannot be a list-likes. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). Returns ------- @@ -666,7 +642,6 @@ def index_fillna(index, value=None, downcast=None): op = FillNA( value=value, - downcast=downcast, output_types=get_output_types(index), ) return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py index 0f33f15e9..50a7a7578 100644 --- a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py @@ -56,8 +56,6 @@ def test_fill_na(): series.fillna(value=df) with pytest.raises(ValueError): series.fillna(value=df_raw) - with pytest.raises(NotImplementedError): - series.fillna(value=series_raw, downcast="infer") with pytest.raises(NotImplementedError): series.ffill(limit=1) From b990ca583276813926caa77e11c209397e10e1e1 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Thu, 21 Sep 2023 13:09:57 +0800 Subject: [PATCH 2/2] fix pandas 2.1.1 --- .../base/tests/test_base_execution.py | 19 ++++++++----------- .../_mars/dataframe/base/value_counts.py | 3 +++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py index 23c19e122..bb997d233 100644 --- a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py +++ b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py @@ -1729,12 +1729,10 @@ def test_value_counts_execution(setup): r = series.value_counts() pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts()) - # pandas issue: https://github.com/pandas-dev/pandas/issues/54857 - if pd.__version__ != "2.1.0": - r = series.value_counts(bins=5, normalize=True) - pd.testing.assert_series_equal( - r.execute().fetch(), s.value_counts(bins=5, normalize=True) - ) + r = series.value_counts(bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) # test multi chunks series = from_pandas_series(s, chunk_size=30) @@ -1746,11 +1744,10 @@ def test_value_counts_execution(setup): pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts(normalize=True)) # test bins and normalize - if pd.__version__ != "2.1.0": - r = series.value_counts(method="tree", bins=5, normalize=True) - pd.testing.assert_series_equal( - r.execute().fetch(), s.value_counts(bins=5, normalize=True) - ) + r = series.value_counts(method="tree", bins=5, normalize=True) + pd.testing.assert_series_equal( + r.execute().fetch(), s.value_counts(bins=5, normalize=True) + ) def test_astype(setup): diff --git a/python/xorbits/_mars/dataframe/base/value_counts.py b/python/xorbits/_mars/dataframe/base/value_counts.py index 457250014..ae80f3db3 100644 --- a/python/xorbits/_mars/dataframe/base/value_counts.py +++ b/python/xorbits/_mars/dataframe/base/value_counts.py @@ -193,6 +193,9 @@ def execute(cls, ctx, op: "DataFrameValueCounts"): # convert CategoricalDtype which generated in `cut` # to IntervalDtype result.index = result.index.astype("interval") + # index name changed since pandas 2.1.1 + if pd_release_version >= (2, 1, 1): + result.index.name = None if op.nrows: result = result.head(op.nrows) result.name = op.outputs[0].name