From 7ffc0ad1e803513f7a24e351c198fc0308c1cc9f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 9 Feb 2023 12:33:43 -0500 Subject: [PATCH] PERF: ArrowExtensionArray.to_numpy(dtype=object) (#51227) * PERF: ArrowExtensionArray.to_numpy(dtype=object) * gh refs --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 10 +++++----- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3aed031bec00b..3c9c249e0f6ea 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1073,7 +1073,7 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`) - Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`, :issue:`51227`) - Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ad10d82c0ca3c..5a72f59cad890 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -842,12 +842,12 @@ def to_numpy( na_value = self.dtype.na_value pa_type = self._data.type - if ( - is_object_dtype(dtype) - or pa.types.is_timestamp(pa_type) - or pa.types.is_duration(pa_type) - ): + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = np.array(list(self), dtype=dtype) + elif is_object_dtype(dtype) and self._hasna: + result = np.empty(len(self), dtype=object) + mask = ~self.isna() + result[mask] = np.asarray(self[mask]._data) else: result = np.asarray(self._data, dtype=dtype) if copy or self._hasna: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 51edae326417a..0d3f3e9e9e48c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1525,6 +1525,16 @@ def test_to_numpy_with_defaults(data): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_int_with_na(): + # GH51227: ensure to_numpy does not convert int to float + data = [1, None] + arr = pd.array(data, dtype="int64[pyarrow]") + result = arr.to_numpy() + expected = np.array([1, pd.NA], dtype=object) + assert isinstance(result[0], int) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy()