DEPR: downcasting in NDFrame.where, mask, clip (pandas-dev#53656)

* DEPR: downcasting in NDFrame.where, mask, clip * GH ref * suppress warning in doctest * add caller * implement future.no_silent_downcasting * move whatsnew to 2.2
jbrockmendel · Sep 1, 2023 · 80a1a8b · 80a1a8b
1 parent bb08817
commit 80a1a8b
Show file tree

Hide file tree

Showing 8 changed files with 112 additions and 28 deletions.
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -145,6 +145,7 @@ Deprecations
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. (:issue:`54229`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`)
+- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`)
 - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
 - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
 - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -142,6 +142,7 @@ def pytest_collection_modifyitems(items, config) -> None:
         ("is_sparse", "is_sparse is deprecated"),
         ("NDFrame.replace", "The 'method' keyword"),
         ("NDFrame.replace", "Series.replace without 'value'"),
+        ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"),
         ("Series.idxmin", "The behavior of Series.idxmin"),
         ("Series.idxmax", "The behavior of Series.idxmax"),
         ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"),

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -902,3 +902,14 @@ def register_converter_cb(key) -> None:
         "(at which point this option will be deprecated).",
         validator=is_one_of_factory([True, False]),
     )
+
+    cf.register_option(
+        "no_silent_downcasting",
+        False,
+        "Whether to opt-in to the future behavior which will *not* silently "
+        "downcast results from Series and DataFrame `where`, `mask`, and `clip` "
+        "methods. "
+        "Silent downcasting will be removed in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False]),
+    )
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -15,7 +15,10 @@
 
 import numpy as np
 
-from pandas._config import using_copy_on_write
+from pandas._config import (
+    get_option,
+    using_copy_on_write,
+)
 
 from pandas._libs import (
     NaT,
@@ -495,7 +498,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
 
     @final
     def _maybe_downcast(
-        self, blocks: list[Block], downcast=None, using_cow: bool = False
+        self, blocks: list[Block], downcast, using_cow: bool, caller: str
     ) -> list[Block]:
         if downcast is False:
             return blocks
@@ -507,14 +510,43 @@ def _maybe_downcast(
             #  but ATM it breaks too much existing code.
             # split and convert the blocks
 
-            return extend_blocks(
+            nbs = extend_blocks(
                 [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
             )
 
-        if downcast is None:
+        elif downcast is None:
+            return blocks
+        elif caller == "where" and get_option("future.no_silent_downcasting") is True:
             return blocks
+        else:
+            nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks])
+
+        # When _maybe_downcast is called with caller="where", it is either
+        #  a) with downcast=False, which is a no-op (the desired future behavior)
+        #  b) with downcast="infer", which is _not_ passed by the user.
+        # In the latter case the future behavior is to stop doing inference,
+        #  so we issue a warning if and only if some inference occurred.
+        if caller == "where":
+            # GH#53656
+            if len(blocks) != len(nbs) or any(
+                left.dtype != right.dtype for left, right in zip(blocks, nbs)
+            ):
+                # In this case _maybe_downcast was _not_ a no-op, so the behavior
+                #  will change, so we issue a warning.
+                warnings.warn(
+                    "Downcasting behavior in Series and DataFrame methods 'where', "
+                    "'mask', and 'clip' is deprecated. In a future "
+                    "version this will not infer object dtypes or cast all-round "
+                    "floats to integers. Instead call "
+                    "result.infer_objects(copy=False) for object inference, "
+                    "or cast round floats explicitly. To opt-in to the future "
+                    "behavior, set "
+                    "`pd.set_option('future.no_silent_downcasting', True)`",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
 
-        return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks])
+        return nbs
 
     @final
     @maybe_split
@@ -1308,7 +1340,7 @@ def where(
                 block = self.coerce_to_target_dtype(other)
                 blocks = block.where(orig_other, cond, using_cow=using_cow)
                 return self._maybe_downcast(
-                    blocks, downcast=_downcast, using_cow=using_cow
+                    blocks, downcast=_downcast, using_cow=using_cow, caller="where"
                 )
 
             else:
@@ -1404,7 +1436,9 @@ def fillna(
             else:
                 # GH#45423 consistent downcasting on no-ops.
                 nb = self.copy(deep=not using_cow)
-                nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow)
+                nbs = nb._maybe_downcast(
+                    [nb], downcast=downcast, using_cow=using_cow, caller="fillna"
+                )
                 return nbs
 
         if limit is not None:
@@ -1422,7 +1456,9 @@ def fillna(
         #  different behavior in _maybe_downcast.
         return extend_blocks(
             [
-                blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow)
+                blk._maybe_downcast(
+                    [blk], downcast=downcast, using_cow=using_cow, caller="fillna"
+                )
                 for blk in nbs
             ]
         )
@@ -1463,7 +1499,7 @@ def pad_or_backfill(
         data = extract_array(new_values, extract_numpy=True)
 
         nb = self.make_block_same_class(data, refs=refs)
-        return nb._maybe_downcast([nb], downcast, using_cow)
+        return nb._maybe_downcast([nb], downcast, using_cow, caller="pad_or_backfill")
 
     @final
     def interpolate(
@@ -1516,7 +1552,7 @@ def interpolate(
         data = extract_array(new_values, extract_numpy=True)
 
         nb = self.make_block_same_class(data, refs=refs)
-        return nb._maybe_downcast([nb], downcast, using_cow)
+        return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate")
 
     @final
     def diff(self, n: int) -> list[Block]:
@@ -1805,7 +1841,7 @@ def where(
                     blk = self.coerce_to_target_dtype(orig_other)
                     nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
                     return self._maybe_downcast(
-                        nbs, downcast=_downcast, using_cow=using_cow
+                        nbs, downcast=_downcast, using_cow=using_cow, caller="where"
                     )
 
                 elif isinstance(self, NDArrayBackedExtensionBlock):
@@ -1814,7 +1850,7 @@ def where(
                     blk = self.coerce_to_target_dtype(orig_other)
                     nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
                     return self._maybe_downcast(
-                        nbs, downcast=_downcast, using_cow=using_cow
+                        nbs, downcast=_downcast, using_cow=using_cow, caller="where"
                     )
 
                 else:
@@ -2013,7 +2049,7 @@ def fillna(
                 )
 
         nb = self.make_block_same_class(new_values, refs=refs)
-        return nb._maybe_downcast([nb], downcast, using_cow=using_cow)
+        return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna")
 
     @cache_readonly
     def shape(self) -> Shape:

diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
@@ -175,7 +175,7 @@ def test_where_set(self, where_frame, float_string_frame, mixed_int_frame):
 
         def _check_set(df, cond, check_dtypes=True):
             dfi = df.copy()
-            econd = cond.reindex_like(df).fillna(True)
+            econd = cond.reindex_like(df).fillna(True).infer_objects(copy=False)
             expected = dfi.mask(~econd)
 
             return_value = dfi.where(cond, np.nan, inplace=True)
@@ -356,7 +356,9 @@ def test_where_bug_transposition(self):
         expected = a.copy()
         expected[~do_not_replace] = b
 
-        result = a.where(do_not_replace, b)
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = a.where(do_not_replace, b)
         tm.assert_frame_equal(result, expected)
 
         a = DataFrame({0: [4, 6], 1: [1, 0]})
@@ -366,7 +368,8 @@ def test_where_bug_transposition(self):
         expected = a.copy()
         expected[~do_not_replace] = b
 
-        result = a.where(do_not_replace, b)
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = a.where(do_not_replace, b)
         tm.assert_frame_equal(result, expected)
 
     def test_where_datetime(self):
@@ -718,7 +721,9 @@ def test_where_ea_other(self):
         ser2 = Series(arr[:2], index=["A", "B"])
         expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
         expected["B"] = expected["B"].astype(object)
-        result = df.where(mask, ser2, axis=1)
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = df.where(mask, ser2, axis=1)
         tm.assert_frame_equal(result, expected)
 
     def test_where_interval_noop(self):
@@ -735,7 +740,10 @@ def test_where_interval_fullop_downcast(self, frame_or_series):
         # GH#45768
         obj = frame_or_series([pd.Interval(0, 0)] * 2)
         other = frame_or_series([1.0, 2.0])
-        res = obj.where(~obj.notna(), other)
+
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            res = obj.where(~obj.notna(), other)
 
         # since all entries are being changed, we will downcast result
         #  from object to ints (not floats)
@@ -780,7 +788,9 @@ def test_where_datetimelike_noop(self, dtype):
 
         # opposite case where we are replacing *all* values -> we downcast
         #  from object dtype # GH#45768
-        res5 = df.where(mask2, 4)
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            res5 = df.where(mask2, 4)
         expected = DataFrame(4, index=df.index, columns=df.columns)
         tm.assert_frame_equal(res5, expected)
 
@@ -984,10 +994,18 @@ def test_where_downcast_to_td64():
 
     td = pd.Timedelta(days=1)
 
-    res = ser.where(mask, td)
+    msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        res = ser.where(mask, td)
     expected = Series([td, td, td], dtype="m8[ns]")
     tm.assert_series_equal(res, expected)
 
+    with pd.option_context("future.no_silent_downcasting", True):
+        with tm.assert_produces_warning(None, match=msg):
+            res2 = ser.where(mask, td)
+    expected2 = expected.astype(object)
+    tm.assert_series_equal(res2, expected2)
+
 
 def _check_where_equivalences(df, mask, other, expected):
     # similar to tests.series.indexing.test_setitem.SetitemCastingEquivalences

diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py
@@ -151,7 +151,11 @@ def test_clip_with_na_args(self, float_frame):
         # GH#19992 and adjusted in GH#40420
         df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
 
-        result = df.clip(lower=[4, 5, np.nan], axis=0)
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        # TODO: avoid this warning here?  seems like we should never be upcasting
+        #  in the first place?
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = df.clip(lower=[4, 5, np.nan], axis=0)
         expected = DataFrame(
             {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
         )
@@ -167,7 +171,8 @@ def test_clip_with_na_args(self, float_frame):
         data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
         df = DataFrame(data)
         t = Series([2, -4, np.nan, 6, 3])
-        result = df.clip(lower=t, axis=0)
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = df.clip(lower=t, axis=0)
         expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
         tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py
@@ -393,16 +393,21 @@ def test_where_datetimelike_coerce(dtype):
     expected = Series([10, 10])
     mask = np.array([False, False])
 
-    rs = ser.where(mask, [10, 10])
+    msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        rs = ser.where(mask, [10, 10])
     tm.assert_series_equal(rs, expected)
 
-    rs = ser.where(mask, 10)
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        rs = ser.where(mask, 10)
     tm.assert_series_equal(rs, expected)
 
-    rs = ser.where(mask, 10.0)
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        rs = ser.where(mask, 10.0)
     tm.assert_series_equal(rs, expected)
 
-    rs = ser.where(mask, [10.0, 10.0])
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        rs = ser.where(mask, [10.0, 10.0])
     tm.assert_series_equal(rs, expected)
 
     rs = ser.where(mask, [10.0, np.nan])

diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py
@@ -69,8 +69,15 @@ def test_clip_with_na_args(self):
         tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3]))
 
         # GH#19992
-        tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3]))
-        tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1]))
+        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
+        # TODO: avoid this warning here?  seems like we should never be upcasting
+        #  in the first place?
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            res = s.clip(lower=[0, 4, np.nan])
+        tm.assert_series_equal(res, Series([1, 4, 3]))
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            res = s.clip(upper=[1, np.nan, 1])
+        tm.assert_series_equal(res, Series([1, 2, 1]))
 
         # GH#40420
         s = Series([1, 2, 3])