BUG: IntegerArray/FloatingArray constructors mismatched NAs (#44514)

pandas-dev · Dec 1, 2021 · 6b84ee7 · 6b84ee7
1 parent ede6234
commit 6b84ee7
Show file tree

Hide file tree

Showing 10 changed files with 212 additions and 21 deletions.
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -680,6 +680,7 @@ Indexing
 - Bug in :meth:`Series.reset_index` not ignoring ``name`` argument when ``drop`` and ``inplace`` are set to ``True`` (:issue:`44575`)
 - Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
 - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`).
+- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`)
 - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`)
 - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`)
 - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`)
@@ -799,6 +800,7 @@ ExtensionArray
 - NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`)
 - NumPy ufuncs ``np.minimum.reduce`` and ``np.maximum.reduce`` now work correctly instead of raising ``NotImplementedError`` on :class:`Series` with ``IntegerDtype`` or ``FloatDtype`` (:issue:`43923`)
 - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`)
+- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`)
 - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`)
 -
 

diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
@@ -248,6 +248,36 @@ cdef bint checknull_with_nat_and_na(object obj):
     return checknull_with_nat(obj) or obj is C_NA
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def is_numeric_na(values: ndarray) -> ndarray:
+    """
+    Check for NA values consistent with IntegerArray/FloatingArray.
+
+    Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.
+
+    Returns
+    -------
+    ndarray[bool]
+    """
+    cdef:
+        ndarray[uint8_t] result
+        Py_ssize_t i, N
+        object val
+
+    N = len(values)
+    result = np.zeros(N, dtype=np.uint8)
+
+    for i in range(N):
+        val = values[i]
+        if checknull(val):
+            if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
+                result[i] = True
+            else:
+                raise TypeError(f"'values' contains non-numeric NA {val}")
+    return result.view(bool)
+
+
 # -----------------------------------------------------------------------------
 # Implementation of NA singleton
 

diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
@@ -4,7 +4,10 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas._typing import (
     ArrayLike,
     AstypeArg,
@@ -27,7 +30,6 @@
     ExtensionDtype,
     register_extension_dtype,
 )
-from pandas.core.dtypes.missing import isna
 
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.numeric import (
@@ -129,8 +131,7 @@ def coerce_to_array(
     if is_object_dtype(values):
         inferred_type = lib.infer_dtype(values, skipna=True)
         if inferred_type == "empty":
-            values = np.empty(len(values))
-            values.fill(np.nan)
+            pass
         elif inferred_type not in [
             "floating",
             "integer",
@@ -146,13 +147,15 @@ def coerce_to_array(
     elif not (is_integer_dtype(values) or is_float_dtype(values)):
         raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
 
+    if values.ndim != 1:
+        raise TypeError("values must be a 1D list-like")
+
     if mask is None:
-        mask = isna(values)
+        mask = libmissing.is_numeric_na(values)
+
     else:
         assert len(mask) == len(values)
 
-    if not values.ndim == 1:
-        raise TypeError("values must be a 1D list-like")
     if not mask.ndim == 1:
         raise TypeError("mask must be a 1D list-like")
 

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -7,6 +7,7 @@
 from pandas._libs import (
     iNaT,
     lib,
+    missing as libmissing,
 )
 from pandas._typing import (
     ArrayLike,
@@ -32,7 +33,6 @@
     is_string_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.missing import isna
 
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.masked import BaseMaskedDtype
@@ -183,8 +183,7 @@ def coerce_to_array(
     if is_object_dtype(values) or is_string_dtype(values):
         inferred_type = lib.infer_dtype(values, skipna=True)
         if inferred_type == "empty":
-            values = np.empty(len(values))
-            values.fill(np.nan)
+            pass
         elif inferred_type not in [
             "floating",
             "integer",
@@ -202,13 +201,14 @@ def coerce_to_array(
     elif not (is_integer_dtype(values) or is_float_dtype(values)):
         raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
 
+    if values.ndim != 1:
+        raise TypeError("values must be a 1D list-like")
+
     if mask is None:
-        mask = isna(values)
+        mask = libmissing.is_numeric_na(values)
     else:
         assert len(mask) == len(values)
 
-    if values.ndim != 1:
-        raise TypeError("values must be a 1D list-like")
     if mask.ndim != 1:
         raise TypeError("mask must be a 1D list-like")
 

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1511,6 +1511,17 @@ def setitem(self, indexer, value):
             # we are always 1-D
             indexer = indexer[0]
 
+        # TODO(EA2D): not needed with 2D EAS
+        if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2:
+            assert value.shape[1] == 1
+            # error: No overload variant of "__getitem__" of "ExtensionArray"
+            # matches argument type "Tuple[slice, int]"
+            value = value[:, 0]  # type: ignore[call-overload]
+        elif isinstance(value, ABCDataFrame):
+            # TODO: should we avoid getting here with DataFrame?
+            assert value.shape[1] == 1
+            value = value._ixs(0, axis=1)._values
+
         check_setitem_lengths(indexer, value, self.values)
         self.values[indexer] = value
         return self

diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py
@@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float():
         np.array(["foo"]),
         [[1, 2], [3, 4]],
         [np.nan, {"a": 1}],
+        # GH#44514 all-NA case used to get quietly swapped out before checking ndim
+        np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
     ],
 )
 def test_to_array_error(values):
     # error in converting existing arrays to FloatingArray
-    msg = (
-        r"(:?.* cannot be converted to a FloatingDtype)"
-        r"|(:?values must be a 1D list-like)"
-        r"|(:?Cannot pass scalar)"
+    msg = "|".join(
+        [
+            "cannot be converted to a FloatingDtype",
+            "values must be a 1D list-like",
+            "Cannot pass scalar",
+        ]
     )
     with pytest.raises((TypeError, ValueError), match=msg):
         pd.array(values, dtype="Float64")

diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
@@ -1,6 +1,13 @@
 import numpy as np
 import pytest
 
+from pandas.core.dtypes.dtypes import (
+    DatetimeTZDtype,
+    IntervalDtype,
+    PandasDtype,
+    PeriodDtype,
+)
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.tests.extension.base.base import BaseExtensionTests
@@ -357,6 +364,36 @@ def test_setitem_series(self, data, full_indexer):
         )
         self.assert_series_equal(result, expected)
 
+    def test_setitem_frame_2d_values(self, data, request):
+        # GH#44514
+        df = pd.DataFrame({"A": data})
+
+        # Avoiding using_array_manager fixture
+        #  https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
+        using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
+        if using_array_manager:
+            if not isinstance(
+                data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
+            ):
+                # These dtypes have non-broken implementations of _can_hold_element
+                mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
+                request.node.add_marker(mark)
+
+        df = pd.DataFrame({"A": data})
+        orig = df.copy()
+
+        df.iloc[:] = df
+        self.assert_frame_equal(df, orig)
+
+        df.iloc[:-1] = df.iloc[:-1]
+        self.assert_frame_equal(df, orig)
+
+        df.iloc[:] = df.values
+        self.assert_frame_equal(df, orig)
+
+        df.iloc[:-1] = df.values[:-1]
+        self.assert_frame_equal(df, orig)
+
     def test_delitem_series(self, data):
         # GH#40763
         ser = pd.Series(data, name="data")

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -1211,6 +1211,75 @@ def test_setitem_array_as_cell_value(self):
         expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
         tm.assert_frame_equal(df, expected)
 
+    # with AM goes through split-path, loses dtype
+    @td.skip_array_manager_not_yet_implemented
+    def test_iloc_setitem_nullable_2d_values(self):
+        df = DataFrame({"A": [1, 2, 3]}, dtype="Int64")
+        orig = df.copy()
+
+        df.loc[:] = df.values[:, ::-1]
+        tm.assert_frame_equal(df, orig)
+
+        df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1])
+        tm.assert_frame_equal(df, orig)
+
+        df.iloc[:] = df.iloc[:, :]
+        tm.assert_frame_equal(df, orig)
+
+    @pytest.mark.parametrize(
+        "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")]
+    )
+    def test_setting_mismatched_na_into_nullable_fails(
+        self, null, any_numeric_ea_dtype
+    ):
+        # GH#44514 don't cast mismatched nulls to pd.NA
+        df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype)
+        ser = df["A"]
+        arr = ser._values
+
+        msg = "|".join(
+            [
+                r"int\(\) argument must be a string, a bytes-like object or a "
+                "(real )?number, not 'NaTType'",
+                r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
+                r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
+                "object cannot be converted to a FloatingDtype",
+                "'values' contains non-numeric NA",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            arr[0] = null
+
+        with pytest.raises(TypeError, match=msg):
+            arr[:2] = [null, null]
+
+        with pytest.raises(TypeError, match=msg):
+            ser[0] = null
+
+        with pytest.raises(TypeError, match=msg):
+            ser[:2] = [null, null]
+
+        with pytest.raises(TypeError, match=msg):
+            ser.iloc[0] = null
+
+        with pytest.raises(TypeError, match=msg):
+            ser.iloc[:2] = [null, null]
+
+        with pytest.raises(TypeError, match=msg):
+            df.iloc[0, 0] = null
+
+        with pytest.raises(TypeError, match=msg):
+            df.iloc[:2, 0] = [null, null]
+
+        # Multi-Block
+        df2 = df.copy()
+        df2["B"] = ser.copy()
+        with pytest.raises(TypeError, match=msg):
+            df2.iloc[0, 0] = null
+
+        with pytest.raises(TypeError, match=msg):
+            df2.iloc[:2, 0] = [null, null]
+
 
 class TestDataFrameIndexingUInt64:
     def test_setitem(self, uint64_frame):

diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py
@@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur
         # Ensure that clipping method can handle NA values with out failing
         # GH#40581
 
-        s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
-        s_clipped_upper = s.clip(upper=2.0)
-        s_clipped_lower = s.clip(lower=2.0)
+        if nulls_fixture is pd.NaT:
+            # constructor will raise, see
+            #  test_constructor_mismatched_null_nullable_dtype
+            return
+
+        ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
+        s_clipped_upper = ser.clip(upper=2.0)
+        s_clipped_lower = ser.clip(lower=2.0)
 
         expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype)
         expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -13,7 +13,10 @@
     iNaT,
     lib,
 )
-from pandas.compat.numpy import np_version_under1p19
+from pandas.compat.numpy import (
+    np_version_under1p19,
+    np_version_under1p20,
+)
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import (
@@ -1811,6 +1814,33 @@ def test_constructor_bool_dtype_missing_values(self):
         expected = Series(True, index=[0], dtype="bool")
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.filterwarnings(
+        "ignore:elementwise comparison failed:DeprecationWarning"
+    )
+    @pytest.mark.xfail(
+        np_version_under1p20, reason="np.array([td64nat, float, float]) raises"
+    )
+    @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array])
+    def test_constructor_mismatched_null_nullable_dtype(
+        self, func, any_numeric_ea_dtype
+    ):
+        # GH#44514
+        msg = "|".join(
+            [
+                "cannot safely cast non-equivalent object",
+                r"int\(\) argument must be a string, a bytes-like object "
+                "or a (real )?number",
+                r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) "
+                "according to the rule 'safe'",
+                "object cannot be converted to a FloatingDtype",
+                "'values' contains non-numeric NA",
+            ]
+        )
+
+        for null in tm.NP_NAT_OBJECTS + [NaT]:
+            with pytest.raises(TypeError, match=msg):
+                func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):