Convert ea to appropriate numpy dtype

phofl · Sep 3, 2023 · 03ff78e · 03ff78e
1 parent c3c718c
commit 03ff78e
Show file tree

Hide file tree

Showing 14 changed files with 95 additions and 41 deletions.
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1476,7 +1476,7 @@ def _maybe_upcast(
         import pyarrow as pa
         if isinstance(arr, IntegerArray) and arr.isna().all():
             # use null instead of int64 in pyarrow
-            arr = arr.to_numpy()
+            arr = arr.to_numpy(na_value=None)
         arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
 
     return arr

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1809,6 +1809,8 @@ def map_array(
         return arr.copy()
 
     # we must convert to python types
+    if isinstance(arr.dtype, BaseMaskedDtype):
+        arr = arr.to_numpy()
     values = arr.astype(object, copy=False)
     if na_action is None:
         return lib.map_infer(values, mapper, convert=convert)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -38,11 +38,15 @@
     IS64,
     is_platform_windows,
 )
-from pandas.errors import AbstractMethodError
+from pandas.errors import (
+    AbstractMethodError,
+    LossySetitemError,
+)
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import np_can_hold_element
 from pandas.core.dtypes.common import (
     is_bool,
     is_integer_dtype,
@@ -472,20 +476,35 @@ def to_numpy(
         >>> a.to_numpy(dtype="bool", na_value=False)
         array([ True, False, False])
         """
-        if na_value is lib.no_default:
-            na_value = libmissing.NA
+        hasna = self._hasna
+
         if dtype is None:
-            if self._hasna:
-                itemsize = self.dtype.itemsize
-                if itemsize < 4:
-                    itemsize = 4
-                dtype = np.dtype(f"f{itemsize}")
-                na_value = np.nan
+            dtype_given = False
+            if hasna:
+                if self.dtype.kind == "b":
+                    dtype = object
+                else:
+                    if self.dtype.kind in "iu":
+                        dtype = np.dtype(np.float64)
+                    else:
+                        dtype = self.dtype.numpy_dtype
+                    if na_value is lib.no_default:
+                        na_value = np.nan
             else:
                 dtype = self.dtype.numpy_dtype
         else:
             dtype = np.dtype(dtype)
-        if self._hasna:
+            dtype_given = True
+        if na_value is lib.no_default:
+            na_value = libmissing.NA
+
+        if not dtype_given and hasna:
+            try:
+                np_can_hold_element(dtype, na_value)
+            except LossySetitemError:
+                dtype = object
+
+        if hasna:
             if (
                 dtype != object
                 and not is_string_dtype(dtype)
@@ -512,7 +531,7 @@ def tolist(self):
         if self.ndim > 1:
             return [x.tolist() for x in self]
         dtype = None if self._hasna else self._data.dtype
-        return self.to_numpy(dtype=dtype).tolist()
+        return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()
 
     @overload
     def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:

diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py
@@ -8,10 +8,17 @@
 
 import numpy as np
 
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import maybe_box_native
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    BaseMaskedDtype,
+    ExtensionDtype,
+)
 
 from pandas.core import common as com
 
@@ -100,6 +107,10 @@ def to_dict(
         for i, col_dtype in enumerate(df.dtypes.values)
         if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
     ]
+    box_na_values = [
+        lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
+        for i, col_dtype in enumerate(df.dtypes.values)
+    ]
     are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
 
     if orient == "dict":
@@ -110,7 +121,11 @@ def to_dict(
         return into_c(
             (
                 k,
-                list(map(maybe_box_native, v.to_numpy().tolist()))
+                list(
+                    map(
+                        maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist()
+                    )
+                )
                 if i in object_dtype_indices_as_set
                 else v.to_numpy().tolist(),
             )

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -70,6 +70,7 @@
 )
 
 from pandas.core.arrays import (
+    BaseMaskedArray,
     Categorical,
     DatetimeArray,
     TimedeltaArray,
@@ -1660,6 +1661,8 @@ def _format_strings(self) -> list[str]:
         if isinstance(values, Categorical):
             # Categorical is special for now, so that we can preserve tzinfo
             array = values._internal_get_values()
+        elif isinstance(values, BaseMaskedArray):
+            array = values.to_numpy(na_value=NA)
         else:
             array = np.asarray(values)
 

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -63,6 +63,7 @@
 from pandas.core import algorithms
 from pandas.core.arrays import (
     ArrowExtensionArray,
+    BaseMaskedArray,
     BooleanArray,
     Categorical,
     ExtensionArray,
@@ -762,8 +763,15 @@ def _infer_types(
             pa = import_optional_dependency("pyarrow")
             if isinstance(result, np.ndarray):
                 result = ArrowExtensionArray(pa.array(result, from_pandas=True))
+            elif isinstance(result, BaseMaskedArray):
+                if result._mask.all():
+                    result = result.to_numpy(na_value=None)
+                    result = ArrowExtensionArray(pa.array(result))
+                else:
+                    result = ArrowExtensionArray(
+                        pa.array(result._data, mask=result._mask)
+                    )
             else:
-                # ExtensionArray
                 result = ArrowExtensionArray(
                     pa.array(result.to_numpy(), from_pandas=True)
                 )

diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py
@@ -223,7 +223,7 @@ def test_coerce_to_numpy_array():
     # also with no missing values -> bool dtype
     arr = pd.array([True, False, True], dtype="boolean")
     result = np.array(arr)
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     # force bool dtype
@@ -263,7 +263,7 @@ def test_to_numpy(box):
     # default: bool dtype without missing values, object dtype with missing values
     arr = con([True, False, True], dtype="boolean")
     result = arr.to_numpy()
-    expected = np.array([True, False, True], dtype="object")
+    expected = np.array([True, False, True], dtype="bool")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([True, False, None], dtype="boolean")

diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py
@@ -13,12 +13,12 @@ def test_to_numpy(box):
     # default (with or without missing values) -> float64 dtype
     arr = con([0.1, 0.2, 0.3], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, 0.3], dtype="object")
+    expected = np.array([0.1, 0.2, 0.3], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
     arr = con([0.1, 0.2, None], dtype="Float64")
     result = arr.to_numpy()
-    expected = np.array([0.1, 0.2, pd.NA], dtype="object")
+    expected = np.array([0.1, 0.2, np.nan], dtype="float64")
     tm.assert_numpy_array_equal(result, expected)
 
 

diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
@@ -142,7 +142,7 @@ def test_astype(all_data):
     # coerce to object
     s = pd.Series(mixed)
     result = s.astype("object")
-    expected = pd.Series(np.asarray(mixed))
+    expected = pd.Series(np.asarray(mixed, dtype=object))
     tm.assert_series_equal(result, expected)
 
 

diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -286,7 +286,7 @@ def test_array_multiindex_raises():
             pd.core.arrays.period_array(["2000", "2001"], freq="D"),
             np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
         ),
-        (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
+        (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
         (
             IntervalArray.from_breaks([0, 1, 2]),
             np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
@@ -335,10 +335,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
     with tm.assert_produces_warning(None):
         thing = box(arr)
 
-    if arr.dtype.name == "int64" and box is pd.array:
-        mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
-        request.node.add_marker(mark)
-
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, expected)
 

diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
@@ -132,21 +132,25 @@ def test_series_array_ea_dtypes(using_copy_on_write):
         assert arr.flags.writeable is True
 
     arr = np.asarray(ser)
-    assert not np.shares_memory(arr, get_array(ser))
-    assert arr.flags.writeable is True
+    assert np.shares_memory(arr, get_array(ser))
+    if using_copy_on_write:
+        assert arr.flags.writeable is False
+    else:
+        assert arr.flags.writeable is True
 
 
 def test_dataframe_array_ea_dtypes(using_copy_on_write):
     df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
     arr = np.asarray(df, dtype="int64")
-    # TODO: This should be able to share memory, but we are roundtripping
-    # through object
-    assert not np.shares_memory(arr, get_array(df, "a"))
-    assert arr.flags.writeable is True
+    assert np.shares_memory(arr, get_array(df, "a"))
+    if using_copy_on_write:
+        assert arr.flags.writeable is False
+    else:
+        assert arr.flags.writeable is True
 
     arr = np.asarray(df)
+    assert np.shares_memory(arr, get_array(df, "a"))
     if using_copy_on_write:
-        # TODO(CoW): This should be True
         assert arr.flags.writeable is False
     else:
         assert arr.flags.writeable is True

diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
@@ -160,6 +160,16 @@ def data_for_grouping(dtype):
 
 
 class TestMaskedArrays(base.ExtensionTests):
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action):
+        result = data_missing.map(lambda x: x, na_action=na_action)
+        if data_missing.dtype == Float32Dtype():
+            # map roundtrips through objects, which converts to float64
+            expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
+        else:
+            expected = data_missing.to_numpy()
+        tm.assert_numpy_array_equal(result, expected)
+
     def _get_expected_exception(self, op_name, obj, other):
         try:
             dtype = tm.get_dtype(obj)

diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
@@ -715,15 +715,12 @@ def test_where_ea_other(self):
 
         # TODO: ideally we would get Int64 instead of float64
         result = df.where(mask, ser, axis=0)
-        expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
+        expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]})
         tm.assert_frame_equal(result, expected)
 
         ser2 = Series(arr[:2], index=["A", "B"])
-        expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
-        expected["B"] = expected["B"].astype(object)
-        msg = "Downcasting behavior in Series and DataFrame methods 'where'"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = df.where(mask, ser2, axis=1)
+        expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]})
+        result = df.where(mask, ser2, axis=1)
         tm.assert_frame_equal(result, expected)
 
     def test_where_interval_noop(self):

diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
@@ -451,9 +451,9 @@ def test_masked_ea_with_formatter(self):
             }
         )
         result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
-        expected = """      a     b
-0  0.12  1.00
-1  1.12  2.00"""
+        expected = """     a    b
+0 0.12 1.00
+1 1.12 2.00"""
         assert result == expected
 
     def test_repr_ea_columns(self, any_string_dtype):