Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add downcast as method to DataFrame and Series #51641

Closed
wants to merge 13 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
- Added :meth:`DataFrame.downcast` and :meth:`Series.downcast` (:issue:`51641`)
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
-

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5515,6 +5515,10 @@ def fillna(
downcast=downcast,
)

@doc(NDFrame.downcast, **_shared_doc_kwargs)
def downcast(self) -> DataFrame:
    # Thin override: all work is done by the parent-class ``downcast``.
    # ``@doc`` receives ``NDFrame.downcast`` and the shared kwargs, presumably
    # to reuse that docstring template here -- hence no docstring in the body.
    downcasted = super().downcast()
    return downcasted

def pop(self, item: Hashable) -> Series:
"""
Return item and drop from frame. Raise KeyError if not found.
Expand Down
43 changes: 43 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7012,6 +7012,49 @@ def fillna(
else:
return result.__finalize__(self, method="fillna")

def downcast(self: NDFrameT) -> NDFrameT:
"""Downcast the columns to an appropriate dtype.

Possibly casts floats to integers. The dtype is inferred.

Returns
-------
{klass}
{klass} with the same shape and converted columns.

Notes
-----
The downcasting logic protects against truncating floats.
If the values cannot be represented in the inferred dtype without loss, the column is left unchanged.

Examples
--------
>>> df = pd.DataFrame({"foo": [1.0, 2.0], "bar": [1.5, 2.5], "baz": [3.0, 4.0]})
>>> df
foo bar baz
0 1.0 1.5 3.0
1 2.0 2.5 4.0

>>> result = df.downcast()
>>> result
foo bar baz
0 1 1.5 3
1 2 2.5 4

>>> result.dtypes
foo int64
bar float64
baz int64
dtype: object
"""
if using_copy_on_write():
result = self.copy(deep=False)
else:
result = self.copy(deep=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have expected this to be handled within the Manager method. Am I wrong to be surprised?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn’t really matter. When we supported dict-like inputs this was better, but I can move it to the manager now.

new_data = result._mgr.downcast("infer")
result = self._constructor(new_data)
return result.__finalize__(self, method="downcast")

@overload
def ffill(
self: NDFrameT,
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,9 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def downcast(self: T, dtype) -> T:
    """
    Run the block-level ``downcast`` operation via ``apply_with_block``.

    Parameters
    ----------
    dtype
        Forwarded unchanged as the ``dtype`` keyword of the block-level
        ``downcast`` call.

    Returns
    -------
    T
        Whatever ``apply_with_block`` returns for the ``"downcast"`` operation.
    """
    block_kwargs = {"dtype": dtype}
    return self.apply_with_block("downcast", **block_kwargs)

def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
copy = True
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ def coerce_to_target_dtype(self, other) -> Block:

return self.astype(new_dtype, copy=False)

@final
def downcast(self, dtype: str | DtypeObj, using_cow: bool = False) -> list[Block]:
    """
    Try to downcast this block to ``dtype``.

    Thin wrapper that forwards to ``_maybe_downcast`` with this block as the
    only element of the block list.

    Parameters
    ----------
    dtype : str or DtypeObj
        Passed through as the ``downcast`` argument of ``_maybe_downcast``.
        The manager-level ``downcast`` methods forward the string ``"infer"``
        here, so a plain string must be accepted alongside dtype objects.
    using_cow : bool, default False
        Forwarded unchanged to ``_maybe_downcast``.

    Returns
    -------
    list[Block]
        The result of ``_maybe_downcast`` for ``[self]``.
    """
    return self._maybe_downcast([self], downcast=dtype, using_cow=using_cow)

@final
def _maybe_downcast(
self, blocks: list[Block], downcast=None, using_cow: bool = False
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,9 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
using_cow=using_copy_on_write(),
)

def downcast(self: T, dtype) -> T:
    """
    Apply the block-level ``downcast`` operation through ``apply``.

    Parameters
    ----------
    dtype
        Forwarded unchanged as the ``dtype`` keyword of the block-level
        ``downcast`` call.

    Returns
    -------
    T
        Whatever ``apply`` returns for the ``"downcast"`` operation.
    """
    # The current ``using_copy_on_write()`` value is handed to the block
    # method as ``using_cow``.
    cow_enabled = using_copy_on_write()
    return self.apply("downcast", dtype=dtype, using_cow=cow_enabled)

def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
if using_copy_on_write():
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5081,6 +5081,10 @@ def fillna(
downcast=downcast,
)

@doc(NDFrame.downcast, **_shared_doc_kwargs)
def downcast(self) -> Series:
    # Thin override: all work is done by the parent-class ``downcast``.
    # ``@doc`` receives ``NDFrame.downcast`` and the shared kwargs, presumably
    # to reuse that docstring template here -- hence no docstring in the body.
    downcasted = super().downcast()
    return downcasted

def pop(self, item: Hashable) -> Any:
"""
Return item and drops from series. Raise KeyError if not found.
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/copy_view/test_downcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import numpy as np

from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


class TestDowncast:
    def test_downcast(self, using_copy_on_write):
        """Check memory sharing between a frame and its ``downcast`` result."""
        df = DataFrame({"a": [1.0, 2.0], "b": 1.5})
        df_orig = df.copy()
        result = df.downcast()

        # Column "a" must be backed by a different array in either mode.
        a_shared = np.shares_memory(get_array(df, "a"), get_array(result, "a"))
        assert not a_shared

        # Column "b" must share memory exactly when copy-on-write is enabled.
        b_shared = np.shares_memory(get_array(df, "b"), get_array(result, "b"))
        assert b_shared if using_copy_on_write else not b_shared

        # Writing into the result must never leak back into the original frame.
        result.iloc[0, 1] = 100.5
        tm.assert_frame_equal(df, df_orig)
10 changes: 10 additions & 0 deletions pandas/tests/frame/methods/test_downcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from pandas import DataFrame
import pandas._testing as tm


class TestDowncast:
def test_downcast(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parametrize over frame_or_series?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this makes it more complicated

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alternative is to implement an analogous test in the Series tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just do

if frame_or_series is Series:
    obj = obj["A"]
    expected = expected["A"]

result = ...

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a Series test already; I should have commented. Are you OK with that?

df = DataFrame({"a": [1.0, 2.0], "b": 1.5, "c": 2.0, "d": "a"})
result = df.downcast()
expected = DataFrame({"a": [1, 2], "b": 1.5, "c": 2, "d": "a"})
tm.assert_frame_equal(result, expected)