Skip to content

Commit

Permalink
BUG: IntegerArray/FloatingArray constructors mismatched NAs (#44514)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Dec 1, 2021
1 parent ede6234 commit 6b84ee7
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 21 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,7 @@ Indexing
- Bug in :meth:`Series.reset_index` not ignoring ``name`` argument when ``drop`` and ``inplace`` are set to ``True`` (:issue:`44575`)
- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
- Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`).
- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`)
- Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`)
- Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`)
- Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`)
Expand Down Expand Up @@ -799,6 +800,7 @@ ExtensionArray
- NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`)
- NumPy ufuncs ``np.minimum.reduce`` and ``np.maximum.reduce`` now work correctly instead of raising ``NotImplementedError`` on :class:`Series` with ``IntegerDtype`` or ``FloatDtype`` (:issue:`43923`)
- Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`)
- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`)
- Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`)
-

Expand Down
30 changes: 30 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,36 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
"""
Check for NA values consistent with IntegerArray/FloatingArray.

Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.

Returns
-------
ndarray[bool]
"""
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val

N = len(values)
result = np.zeros(N, dtype=np.uint8)

for i in range(N):
val = values[i]
if checknull(val):
if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
result[i] = True
else:
raise TypeError(f"'values' contains non-numeric NA {val}")
return result.view(bool)


# -----------------------------------------------------------------------------
# Implementation of NA singleton

Expand Down
17 changes: 10 additions & 7 deletions pandas/core/arrays/floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import (
ArrayLike,
AstypeArg,
Expand All @@ -27,7 +30,6 @@
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.numeric import (
Expand Down Expand Up @@ -129,8 +131,7 @@ def coerce_to_array(
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
pass
elif inferred_type not in [
"floating",
"integer",
Expand All @@ -146,13 +147,15 @@ def coerce_to_array(
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = isna(values)
mask = libmissing.is_numeric_na(values)

else:
assert len(mask) == len(values)

if not values.ndim == 1:
raise TypeError("values must be a 1D list-like")
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")

Expand Down
12 changes: 6 additions & 6 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pandas._libs import (
iNaT,
lib,
missing as libmissing,
)
from pandas._typing import (
ArrayLike,
Expand All @@ -32,7 +33,6 @@
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.masked import BaseMaskedDtype
Expand Down Expand Up @@ -183,8 +183,7 @@ def coerce_to_array(
if is_object_dtype(values) or is_string_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
pass
elif inferred_type not in [
"floating",
"integer",
Expand All @@ -202,13 +201,14 @@ def coerce_to_array(
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = isna(values)
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")

Expand Down
11 changes: 11 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1511,6 +1511,17 @@ def setitem(self, indexer, value):
# we are always 1-D
indexer = indexer[0]

# TODO(EA2D): not needed with 2D EAS
if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2:
assert value.shape[1] == 1
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[slice, int]"
value = value[:, 0] # type: ignore[call-overload]
elif isinstance(value, ABCDataFrame):
# TODO: should we avoid getting here with DataFrame?
assert value.shape[1] == 1
value = value._ixs(0, axis=1)._values

check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/arrays/floating/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float():
np.array(["foo"]),
[[1, 2], [3, 4]],
[np.nan, {"a": 1}],
# GH#44514 all-NA case used to get quietly swapped out before checking ndim
np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
],
)
def test_to_array_error(values):
# error in converting existing arrays to FloatingArray
msg = (
r"(:?.* cannot be converted to a FloatingDtype)"
r"|(:?values must be a 1D list-like)"
r"|(:?Cannot pass scalar)"
msg = "|".join(
[
"cannot be converted to a FloatingDtype",
"values must be a 1D list-like",
"Cannot pass scalar",
]
)
with pytest.raises((TypeError, ValueError), match=msg):
pd.array(values, dtype="Float64")
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/extension/base/setitem.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
IntervalDtype,
PandasDtype,
PeriodDtype,
)

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base.base import BaseExtensionTests
Expand Down Expand Up @@ -357,6 +364,36 @@ def test_setitem_series(self, data, full_indexer):
)
self.assert_series_equal(result, expected)

def test_setitem_frame_2d_values(self, data, request):
# GH#44514
df = pd.DataFrame({"A": data})

# Avoiding using_array_manager fixture
# https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
if using_array_manager:
if not isinstance(
data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
):
# These dtypes have non-broken implementations of _can_hold_element
mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
request.node.add_marker(mark)

df = pd.DataFrame({"A": data})
orig = df.copy()

df.iloc[:] = df
self.assert_frame_equal(df, orig)

df.iloc[:-1] = df.iloc[:-1]
self.assert_frame_equal(df, orig)

df.iloc[:] = df.values
self.assert_frame_equal(df, orig)

df.iloc[:-1] = df.values[:-1]
self.assert_frame_equal(df, orig)

def test_delitem_series(self, data):
# GH#40763
ser = pd.Series(data, name="data")
Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,75 @@ def test_setitem_array_as_cell_value(self):
expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
tm.assert_frame_equal(df, expected)

# with AM goes through split-path, loses dtype
@td.skip_array_manager_not_yet_implemented
def test_iloc_setitem_nullable_2d_values(self):
df = DataFrame({"A": [1, 2, 3]}, dtype="Int64")
orig = df.copy()

df.loc[:] = df.values[:, ::-1]
tm.assert_frame_equal(df, orig)

df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1])
tm.assert_frame_equal(df, orig)

df.iloc[:] = df.iloc[:, :]
tm.assert_frame_equal(df, orig)

@pytest.mark.parametrize(
"null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")]
)
def test_setting_mismatched_na_into_nullable_fails(
self, null, any_numeric_ea_dtype
):
# GH#44514 don't cast mismatched nulls to pd.NA
df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype)
ser = df["A"]
arr = ser._values

msg = "|".join(
[
r"int\(\) argument must be a string, a bytes-like object or a "
"(real )?number, not 'NaTType'",
r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
"object cannot be converted to a FloatingDtype",
"'values' contains non-numeric NA",
]
)
with pytest.raises(TypeError, match=msg):
arr[0] = null

with pytest.raises(TypeError, match=msg):
arr[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
ser[0] = null

with pytest.raises(TypeError, match=msg):
ser[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
ser.iloc[0] = null

with pytest.raises(TypeError, match=msg):
ser.iloc[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
df.iloc[0, 0] = null

with pytest.raises(TypeError, match=msg):
df.iloc[:2, 0] = [null, null]

# Multi-Block
df2 = df.copy()
df2["B"] = ser.copy()
with pytest.raises(TypeError, match=msg):
df2.iloc[0, 0] = null

with pytest.raises(TypeError, match=msg):
df2.iloc[:2, 0] = [null, null]


class TestDataFrameIndexingUInt64:
def test_setitem(self, uint64_frame):
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/series/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur
# Ensure that clipping method can handle NA values with out failing
# GH#40581

s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
s_clipped_upper = s.clip(upper=2.0)
s_clipped_lower = s.clip(lower=2.0)
if nulls_fixture is pd.NaT:
# constructor will raise, see
# test_constructor_mismatched_null_nullable_dtype
return

ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
s_clipped_upper = ser.clip(upper=2.0)
s_clipped_lower = ser.clip(lower=2.0)

expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype)
expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype)
Expand Down
32 changes: 31 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
iNaT,
lib,
)
from pandas.compat.numpy import np_version_under1p19
from pandas.compat.numpy import (
np_version_under1p19,
np_version_under1p20,
)
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1811,6 +1814,33 @@ def test_constructor_bool_dtype_missing_values(self):
expected = Series(True, index=[0], dtype="bool")
tm.assert_series_equal(result, expected)

@pytest.mark.filterwarnings(
"ignore:elementwise comparison failed:DeprecationWarning"
)
@pytest.mark.xfail(
np_version_under1p20, reason="np.array([td64nat, float, float]) raises"
)
@pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array])
def test_constructor_mismatched_null_nullable_dtype(
self, func, any_numeric_ea_dtype
):
# GH#44514
msg = "|".join(
[
"cannot safely cast non-equivalent object",
r"int\(\) argument must be a string, a bytes-like object "
"or a (real )?number",
r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) "
"according to the rule 'safe'",
"object cannot be converted to a FloatingDtype",
"'values' contains non-numeric NA",
]
)

for null in tm.NP_NAT_OBJECTS + [NaT]:
with pytest.raises(TypeError, match=msg):
func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype)


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down

0 comments on commit 6b84ee7

Please sign in to comment.