Skip to content

Commit

Permalink
Convert ea to appropriate numpy dtype
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl committed Sep 3, 2023
1 parent c3c718c commit 03ff78e
Show file tree
Hide file tree
Showing 14 changed files with 95 additions and 41 deletions.
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1476,7 +1476,7 @@ def _maybe_upcast(
import pyarrow as pa
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy()
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))

return arr
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1809,6 +1809,8 @@ def map_array(
return arr.copy()

# we must convert to python types
if isinstance(arr.dtype, BaseMaskedDtype):
arr = arr.to_numpy()
values = arr.astype(object, copy=False)
if na_action is None:
return lib.map_infer(values, mapper, convert=convert)
Expand Down
41 changes: 30 additions & 11 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,15 @@
IS64,
is_platform_windows,
)
from pandas.errors import AbstractMethodError
from pandas.errors import (
AbstractMethodError,
LossySetitemError,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import (
is_bool,
is_integer_dtype,
Expand Down Expand Up @@ -472,20 +476,35 @@ def to_numpy(
>>> a.to_numpy(dtype="bool", na_value=False)
array([ True, False, False])
"""
if na_value is lib.no_default:
na_value = libmissing.NA
hasna = self._hasna

if dtype is None:
if self._hasna:
itemsize = self.dtype.itemsize
if itemsize < 4:
itemsize = 4
dtype = np.dtype(f"f{itemsize}")
na_value = np.nan
dtype_given = False
if hasna:
if self.dtype.kind == "b":
dtype = object
else:
if self.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = self.dtype.numpy_dtype
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = self.dtype.numpy_dtype
else:
dtype = np.dtype(dtype)
if self._hasna:
dtype_given = True
if na_value is lib.no_default:
na_value = libmissing.NA

if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value)
except LossySetitemError:
dtype = object

if hasna:
if (
dtype != object
and not is_string_dtype(dtype)
Expand All @@ -512,7 +531,7 @@ def tolist(self):
if self.ndim > 1:
return [x.tolist() for x in self]
dtype = None if self._hasna else self._data.dtype
return self.to_numpy(dtype=dtype).tolist()
return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/methods/to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,17 @@

import numpy as np

from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_box_native
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
ExtensionDtype,
)

from pandas.core import common as com

Expand Down Expand Up @@ -100,6 +107,10 @@ def to_dict(
for i, col_dtype in enumerate(df.dtypes.values)
if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
]
box_na_values = [
lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
for i, col_dtype in enumerate(df.dtypes.values)
]
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

if orient == "dict":
Expand All @@ -110,7 +121,11 @@ def to_dict(
return into_c(
(
k,
list(map(maybe_box_native, v.to_numpy().tolist()))
list(
map(
maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist()
)
)
if i in object_dtype_indices_as_set
else v.to_numpy().tolist(),
)
Expand Down
3 changes: 3 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
)

from pandas.core.arrays import (
BaseMaskedArray,
Categorical,
DatetimeArray,
TimedeltaArray,
Expand Down Expand Up @@ -1660,6 +1661,8 @@ def _format_strings(self) -> list[str]:
if isinstance(values, Categorical):
# Categorical is special for now, so that we can preserve tzinfo
array = values._internal_get_values()
elif isinstance(values, BaseMaskedArray):
array = values.to_numpy(na_value=NA)
else:
array = np.asarray(values)

Expand Down
10 changes: 9 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from pandas.core import algorithms
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
BooleanArray,
Categorical,
ExtensionArray,
Expand Down Expand Up @@ -762,8 +763,15 @@ def _infer_types(
pa = import_optional_dependency("pyarrow")
if isinstance(result, np.ndarray):
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
elif isinstance(result, BaseMaskedArray):
if result._mask.all():
result = result.to_numpy(na_value=None)
result = ArrowExtensionArray(pa.array(result))
else:
result = ArrowExtensionArray(
pa.array(result._data, mask=result._mask)
)
else:
# ExtensionArray
result = ArrowExtensionArray(
pa.array(result.to_numpy(), from_pandas=True)
)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/boolean/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_coerce_to_numpy_array():
# with no missing values -> bool dtype
arr = pd.array([True, False, True], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

# force bool dtype
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_to_numpy(box):
# default: no missing values -> bool dtype; with missing values -> object dtype
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/floating/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ def test_to_numpy(box):
# default (with or without missing values) -> float64 dtype; missing values become NaN
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="object")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, pd.NA], dtype="object")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_astype(all_data):
# coerce to object
s = pd.Series(mixed)
result = s.astype("object")
expected = pd.Series(np.asarray(mixed))
expected = pd.Series(np.asarray(mixed, dtype=object))
tm.assert_series_equal(result, expected)


Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def test_array_multiindex_raises():
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
(
IntervalArray.from_breaks([0, 1, 2]),
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
Expand Down Expand Up @@ -335,10 +335,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
with tm.assert_produces_warning(None):
thing = box(arr)

if arr.dtype.name == "int64" and box is pd.array:
mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
request.node.add_marker(mark)

result = thing.to_numpy()
tm.assert_numpy_array_equal(result, expected)

Expand Down
18 changes: 11 additions & 7 deletions pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,25 @@ def test_series_array_ea_dtypes(using_copy_on_write):
assert arr.flags.writeable is True

arr = np.asarray(ser)
assert not np.shares_memory(arr, get_array(ser))
assert arr.flags.writeable is True
assert np.shares_memory(arr, get_array(ser))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True


def test_dataframe_array_ea_dtypes(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
arr = np.asarray(df, dtype="int64")
# TODO: This should be able to share memory, but we are roundtripping
# through object
assert not np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is True
assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True

arr = np.asarray(df)
assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
# TODO(CoW): This should be True
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ def data_for_grouping(dtype):


class TestMaskedArrays(base.ExtensionTests):
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
    """Mapping the identity function must reproduce ``to_numpy()``.

    ``data_missing`` is mapped with ``lambda x: x`` for each ``na_action``
    and the outcome is compared against the array's own NumPy conversion.
    """
    mapped = data_missing.map(lambda x: x, na_action=na_action)

    is_float32 = data_missing.dtype == Float32Dtype()
    # map roundtrips through objects, which converts to float64
    expected = (
        data_missing.to_numpy(dtype="float64", na_value=np.nan)
        if is_float32
        else data_missing.to_numpy()
    )

    tm.assert_numpy_array_equal(mapped, expected)

def _get_expected_exception(self, op_name, obj, other):
try:
dtype = tm.get_dtype(obj)
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,15 +715,12 @@ def test_where_ea_other(self):

# TODO: ideally we would get Int64 instead of float64
result = df.where(mask, ser, axis=0)
expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]})
tm.assert_frame_equal(result, expected)

ser2 = Series(arr[:2], index=["A", "B"])
expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
expected["B"] = expected["B"].astype(object)
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.where(mask, ser2, axis=1)
expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]})
result = df.where(mask, ser2, axis=1)
tm.assert_frame_equal(result, expected)

def test_where_interval_noop(self):
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/test_repr_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,9 +451,9 @@ def test_masked_ea_with_formatter(self):
}
)
result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
expected = """ a b
0 0.12 1.00
1 1.12 2.00"""
expected = """ a b
0 0.12 1.00
1 1.12 2.00"""
assert result == expected

def test_repr_ea_columns(self, any_string_dtype):
Expand Down

0 comments on commit 03ff78e

Please sign in to comment.