Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CHORE: Deprecate downcast in fillna #705

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions python/xorbits/_mars/dataframe/base/tests/test_base_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1729,12 +1729,10 @@ def test_value_counts_execution(setup):
r = series.value_counts()
pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts())

# pandas issue: https://github.com/pandas-dev/pandas/issues/54857
if pd.__version__ != "2.1.0":
r = series.value_counts(bins=5, normalize=True)
pd.testing.assert_series_equal(
r.execute().fetch(), s.value_counts(bins=5, normalize=True)
)
r = series.value_counts(bins=5, normalize=True)
pd.testing.assert_series_equal(
r.execute().fetch(), s.value_counts(bins=5, normalize=True)
)

# test multi chunks
series = from_pandas_series(s, chunk_size=30)
Expand All @@ -1746,11 +1744,10 @@ def test_value_counts_execution(setup):
pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts(normalize=True))

# test bins and normalize
if pd.__version__ != "2.1.0":
r = series.value_counts(method="tree", bins=5, normalize=True)
pd.testing.assert_series_equal(
r.execute().fetch(), s.value_counts(bins=5, normalize=True)
)
r = series.value_counts(method="tree", bins=5, normalize=True)
pd.testing.assert_series_equal(
r.execute().fetch(), s.value_counts(bins=5, normalize=True)
)


def test_astype(setup):
Expand Down
3 changes: 3 additions & 0 deletions python/xorbits/_mars/dataframe/base/value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def execute(cls, ctx, op: "DataFrameValueCounts"):
# convert CategoricalDtype which generated in `cut`
# to IntervalDtype
result.index = result.index.astype("interval")
# index name changed since pandas 2.1.1
if pd_release_version >= (2, 1, 1):
result.index.name = None
if op.nrows:
result = result.head(op.nrows)
result.name = op.outputs[0].name
Expand Down
12 changes: 3 additions & 9 deletions python/xorbits/_mars/dataframe/groupby/fill.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from ... import opcodes
from ...core import OutputType
from ...serialization.serializables import AnyField, DictField, Int64Field, StringField
from ...serialization.serializables import AnyField, Int64Field, StringField
from ..operands import DataFrameOperand, DataFrameOperandMixin
from ..utils import build_empty_df, build_empty_series, parse_index

Expand All @@ -29,7 +29,6 @@ class GroupByFillOperand(DataFrameOperand, DataFrameOperandMixin):
value = AnyField("value", default=None)
method = StringField("method", default=None)
limit = Int64Field("limit", default=None)
downcast = DictField("downcast", default=None)

def _calc_out_dtypes(self, in_groupby):
mock_groupby = in_groupby.op.build_mock_groupby()
Expand All @@ -40,7 +39,6 @@ def _calc_out_dtypes(self, in_groupby):
value=self.value,
method=self.method,
limit=self.limit,
downcast=self.downcast,
)
else:
result_df = getattr(mock_groupby, func_name)(limit=self.limit)
Expand Down Expand Up @@ -133,7 +131,6 @@ def execute(cls, ctx, op: "GroupByFillOperand"):
value=op.value,
method=op.method,
limit=op.limit,
downcast=op.downcast,
)
else:
result = getattr(in_data, func_name)(limit=op.limit)
Expand Down Expand Up @@ -184,7 +181,7 @@ def bfill(groupby, limit=None):
return op(groupby)


def fillna(groupby, value=None, method=None, limit=None, downcast=None):
def fillna(groupby, value=None, method=None, limit=None):
"""
Fill NA/NaN values using the specified method

Expand All @@ -197,11 +194,8 @@ def fillna(groupby, value=None, method=None, limit=None, downcast=None):
limit: int, default None
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill
downcast: dict, default None
A dict of item->dtype of what to downcast if possible,
or the string ‘infer’ which will try to downcast to an appropriate equal type

return: DataFrame or None
"""
op = GroupByFillNa(value=value, method=method, limit=limit, downcast=downcast)
op = GroupByFillNa(value=value, method=method, limit=limit)
return op(groupby)
5 changes: 3 additions & 2 deletions python/xorbits/_mars/dataframe/hash_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from typing import TYPE_CHECKING, Hashable, Iterable, Iterator, cast

import numpy as np
import pandas as pd
from pandas._libs import lib
from pandas._libs.hashing import hash_object_array
from pandas._typing import ArrayLike, npt
from pandas.core.dtypes.common import is_categorical_dtype, is_list_like
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCExtensionArray,
Expand Down Expand Up @@ -272,7 +273,7 @@ def hash_array(
# For categoricals, we hash the categories, then remap the codes to the
# hash values. (This check is above the complex check so that we don't ask
# numpy if categorical is a subdtype of complex, as it will choke).
if is_categorical_dtype(dtype):
if isinstance(dtype, pd.CategoricalDtype):
vals = cast("Categorical", vals)
return _hash_categorical(vals, encoding, hash_key)

Expand Down
57 changes: 16 additions & 41 deletions python/xorbits/_mars/dataframe/missing/fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ class FillNA(DataFrameOperand, DataFrameOperandMixin):
_method = StringField("method")
_axis = AnyField("axis")
_limit = Int64Field("limit")
_downcast = AnyField("downcast")

_output_limit = Int64Field("output_limit")

Expand All @@ -51,7 +50,6 @@ def __init__(
method=None,
axis=None,
limit=None,
downcast=None,
output_types=None,
output_limit=None,
**kw
Expand All @@ -61,7 +59,6 @@ def __init__(
_method=method,
_axis=axis,
_limit=limit,
_downcast=downcast,
_output_types=output_types,
_output_limit=output_limit,
**kw
Expand All @@ -83,10 +80,6 @@ def axis(self):
def limit(self):
return self._limit

@property
def downcast(self):
return self._downcast

def _set_inputs(self, inputs):
super()._set_inputs(inputs)
if self._method is None and len(inputs) > 1:
Expand Down Expand Up @@ -123,7 +116,9 @@ def _execute_map(cls, ctx, op):
method = op.method

filled = input_data.fillna(
method=method, axis=axis, limit=limit, downcast=op.downcast
method=method,
axis=axis,
limit=limit,
)
ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1)
del filled
Expand All @@ -143,7 +138,9 @@ def _execute_combine(cls, ctx, op):

if not summaries:
ctx[op.outputs[0].key] = input_data.fillna(
method=method, axis=axis, limit=limit, downcast=op.downcast
method=method,
axis=axis,
limit=limit,
)
return

Expand All @@ -158,15 +155,16 @@ def _execute_combine(cls, ctx, op):

if is_pandas_2():
concat_df = concat_df.fillna(
method=method, axis=axis, limit=limit, downcast=op.downcast
method=method,
axis=axis,
limit=limit,
)
else:
concat_df.fillna(
method=method,
axis=axis,
inplace=True,
limit=limit,
downcast=op.downcast,
)
ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1)

Expand All @@ -187,12 +185,9 @@ def execute(cls, ctx, op):
method=op.method,
axis=op.axis,
limit=op.limit,
downcast=op.downcast,
)
else:
ctx[op.outputs[0].key] = input_data.fillna(
value=value, downcast=op.downcast
)
ctx[op.outputs[0].key] = input_data.fillna(value=value)

@classmethod
def _tile_one_by_one(cls, op):
Expand Down Expand Up @@ -478,9 +473,7 @@ def __call__(self, a, value_df=None):
)


def fillna(
df, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None
):
def fillna(df, value=None, method=None, axis=None, inplace=False, limit=None):
"""
Fill NA/NaN values using the specified method.

Expand Down Expand Up @@ -509,10 +502,6 @@ def fillna(
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled. Must be greater than 0 if not None.
downcast : dict, default is None
A dict of item->dtype of what to downcast if possible,
or the string 'infer' which will try to downcast to an appropriate
equal type (e.g. float64 to int64 if possible).

Returns
-------
Expand Down Expand Up @@ -583,10 +572,6 @@ def fillna(
% type(value).__name__
)

if downcast is not None:
raise NotImplementedError(
'Currently argument "downcast" is not implemented yet'
)
if limit is not None:
raise NotImplementedError('Currently argument "limit" is not implemented yet')

Expand All @@ -600,7 +585,6 @@ def fillna(
method=method,
axis=axis,
limit=limit,
downcast=downcast,
output_types=get_output_types(df),
)
out_df = op(df, value_df=value_df)
Expand All @@ -610,7 +594,7 @@ def fillna(
return out_df


def ffill(df, axis=None, inplace=False, limit=None, downcast=None):
def ffill(df, axis=None, inplace=False, limit=None):
"""
Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.

Expand All @@ -619,12 +603,10 @@ def ffill(df, axis=None, inplace=False, limit=None, downcast=None):
{klass} or None
Object with missing values filled or None if ``inplace=True``.
"""
return fillna(
df, method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
)
return fillna(df, method="ffill", axis=axis, inplace=inplace, limit=limit)


def bfill(df, axis=None, inplace=False, limit=None, downcast=None):
def bfill(df, axis=None, inplace=False, limit=None):
"""
Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.

Expand All @@ -633,12 +615,10 @@ def bfill(df, axis=None, inplace=False, limit=None, downcast=None):
{klass} or None
Object with missing values filled or None if ``inplace=True``.
"""
return fillna(
df, method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
)
return fillna(df, method="bfill", axis=axis, inplace=inplace, limit=limit)


def index_fillna(index, value=None, downcast=None):
def index_fillna(index, value=None):
"""
Fill NA/NaN values with the specified value.

Expand All @@ -647,10 +627,6 @@ def index_fillna(index, value=None, downcast=None):
value : scalar
Scalar value to use to fill holes (e.g. 0).
This value cannot be list-like.
downcast : dict, default is None
A dict of item->dtype of what to downcast if possible,
or the string 'infer' which will try to downcast to an appropriate
equal type (e.g. float64 to int64 if possible).

Returns
-------
Expand All @@ -666,7 +642,6 @@ def index_fillna(index, value=None, downcast=None):

op = FillNA(
value=value,
downcast=downcast,
output_types=get_output_types(index),
)
return op(index)
2 changes: 0 additions & 2 deletions python/xorbits/_mars/dataframe/missing/tests/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ def test_fill_na():
series.fillna(value=df)
with pytest.raises(ValueError):
series.fillna(value=df_raw)
with pytest.raises(NotImplementedError):
series.fillna(value=series_raw, downcast="infer")
with pytest.raises(NotImplementedError):
series.ffill(limit=1)

Expand Down