diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b2d30f3540e77..cfadb3e9f45c5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -30,11 +30,13 @@ ) -def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: +def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: """ Helper function for `arr.astype(common_dtype)` but handling all special cases. """ + if is_dtype_equal(arr.dtype, dtype): + return arr if ( is_categorical_dtype(arr.dtype) and isinstance(dtype, np.dtype) @@ -121,7 +123,7 @@ def is_nonempty(x) -> bool: # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) - to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] + to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5581305a9baea..aedb9488b7454 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -18,12 +18,14 @@ ) from pandas._typing import ( ArrayLike, + DtypeObj, Hashable, ) from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( astype_array_safe, + ensure_dtype_can_hold_na, infer_dtype_from_scalar, soft_convert_objects, ) @@ -49,6 +51,7 @@ from pandas.core.dtypes.missing import ( array_equals, isna, + na_value_for_dtype, ) import pandas.core.algorithms as algos @@ -952,10 +955,18 @@ def reindex_indexer( # ignored keywords consolidate: bool = True, only_slice: bool = False, + # ArrayManager specific keywords + use_na_proxy: bool = False, ) -> T: axis = self._normalize_axis(axis) return self._reindex_indexer( - new_axis, indexer, axis, fill_value, allow_dups, copy + new_axis, + indexer, + axis, + fill_value, + allow_dups, + copy, + use_na_proxy, ) def _reindex_indexer( @@ -966,6 
class NullArrayProxy:
    """
    Proxy object for an all-NA array.

    Only stores the length of the array, and not the dtype. The dtype
    will only be known when actually concatenating (after determining the
    common dtype, for which this proxy is ignored).
    Using this object avoids that the internals/concat.py needs to determine
    the proper dtype and array type.

    Parameters
    ----------
    n : int
        Length of the all-NA array this proxy stands in for.
    """

    # The proxy always represents a 1D array.
    ndim = 1

    def __init__(self, n: int):
        self.n = n

    def __repr__(self) -> str:
        return f"{type(self).__name__}(n={self.n})"

    @property
    def shape(self):
        """Shape of the array this proxy represents: a 1-tuple ``(n,)``."""
        return (self.n,)

    def to_array(self, dtype: DtypeObj) -> ArrayLike:
        """
        Create the actual all-NA array that this proxy stands in for.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            The dtype for the resulting array. For numpy dtypes it is first
            passed through ``ensure_dtype_can_hold_na``, so the dtype of the
            result can differ from the one requested.

        Returns
        -------
        np.ndarray or ExtensionArray
            Array of length ``self.n`` filled with missing values.
        """
        if isinstance(dtype, ExtensionDtype):
            # Build an empty array of the extension type and take position -1
            # ``n`` times with allow_fill=True, so the array type itself
            # supplies its missing value.
            empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
            indexer = np.full(self.n, -1, dtype=np.intp)
            return empty.take(indexer, allow_fill=True)

        # when introducing missing values, int becomes float, bool becomes object
        dtype = ensure_dtype_can_hold_na(dtype)
        fill_value = na_value_for_dtype(dtype)
        arr = np.full(self.n, fill_value, dtype=dtype)
        return ensure_wrapped_if_datetimelike(arr)
def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword), assumes
    ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
    the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays
        1D arrays and/or NullArrayProxy objects. At least one entry must be
        an actual array, because the proxies carry no dtype.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If ``to_concat`` contains no actual array (it is empty or consists
        only of NullArrayProxy objects), so no result dtype can be determined.
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
    if not to_concat_no_proxy:
        # Fail with an explicit message instead of the unrelated error that
        # the common-dtype lookup would raise for an empty list of dtypes.
        raise ValueError(
            "concat_arrays requires at least one array that is not a NullArrayProxy"
        )

    dtypes = {arr.dtype for arr in to_concat_no_proxy}
    if len(dtypes) == 1:
        target_dtype = to_concat_no_proxy[0].dtype
    else:
        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

    if target_dtype.kind in ("m", "M"):
        # for datetimelike use DatetimeArray/TimedeltaArray concatenation
        # don't use arr.astype(target_dtype, copy=False), because that doesn't
        # work for DatetimeArray/TimedeltaArray (returns ndarray)
        to_concat = [
            arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr
            for arr in to_concat
        ]
        return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0)

    # Materialize the proxies with the final dtype and cast the real arrays
    # to that same dtype.
    to_concat = [
        arr.to_array(target_dtype)
        if isinstance(arr, NullArrayProxy)
        else cast_to_common_type(arr, target_dtype)
        for arr in to_concat
    ]

    if isinstance(to_concat[0], ExtensionArray):
        cls = type(to_concat[0])
        return cls._concat_same_type(to_concat)

    result = np.concatenate(to_concat)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {arr.dtype.kind for arr in to_concat_no_proxy}
        if len(kinds) != 1 and "b" in kinds:
            result = result.astype(object)
    return result
@td.skip_array_manager_not_yet_implemented def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -84,7 +78,6 @@ def test_append_series_dict(self): expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -103,7 +96,6 @@ def test_append_list_of_series_dicts(self): expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data @@ -148,8 +140,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented - def test_append_dtypes(self): + def test_append_dtypes(self, using_array_manager): # GH 5754 # row appends of different dtypes (so need to do by-item) @@ -173,6 +164,10 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) + if using_array_manager: + # TODO(ArrayManager) decide on exact casting rules in concat + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -181,6 +176,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) @@ -189,6 +187,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], 
dtype="M8[ns]")} ) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -208,7 +209,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): expected = Series(Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) - @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "data, dtype", [ diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py index 777923be02398..e69de29bb2d1d 100644 --- a/pandas/tests/reshape/concat/__init__.py +++ b/pandas/tests/reshape/concat/__init__.py @@ -1,4 +0,0 @@ -import pandas.util._test_decorators as td - -# TODO(ArrayManager) concat axis=0 -pytestmark = td.skip_array_manager_not_yet_implemented diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 7b9f8d1c2879e..1c533ec9fb1e3 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -338,6 +340,10 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended["A"].dtype == "f8" assert appended["B"].dtype == "O" + # TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving + # float dtype) -> delay reindexing until concat_array_managers which properly + # takes care of all-null dtype inference + @td.skip_array_manager_not_yet_implemented def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 55ae754761a07..46029b8a695ea 100644 --- 
a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -41,6 +43,8 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + # TODO(ArrayManager) using block internals to verify, needs rewrite + @td.skip_array_manager_invalid_test def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4fa2865a9e320..9699a0dec4891 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -676,7 +676,7 @@ def _constructor(self): assert isinstance(result, NotADataFrame) - def test_join_append_timedeltas(self): + def test_join_append_timedeltas(self, using_array_manager): # timedelta64 issues with join/merge # GH 5695 @@ -690,6 +690,9 @@ def test_join_append_timedeltas(self): "t": [timedelta(0, 22500), timedelta(0, 22500)], } ) + if using_array_manager: + # TODO(ArrayManager) decide on exact casting rules in concat + expected = expected.astype(object) tm.assert_frame_equal(result, expected) td = np.timedelta64(300000000) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 5cc65feee869b..44299d51a878f 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_categorical_dtype from pandas import ( @@ -440,8 +438,7 @@ def test_crosstab_normalize_arrays(self): ) tm.assert_frame_equal(test_case, norm_sum) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 - def 
test_crosstab_with_empties(self): + def test_crosstab_with_empties(self, using_array_manager): # Check handling of empties df = DataFrame( { @@ -466,6 +463,9 @@ def test_crosstab_with_empties(self): index=Index([1, 2], name="a", dtype="int64"), columns=Index([3, 4], name="b"), ) + if using_array_manager: + # INFO(ArrayManager) column without NaNs can preserve int dtype + nans[3] = nans[3].astype("int64") calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) tm.assert_frame_equal(nans, calculated) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8d8a83c233444..20aa0c9e2ee9a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -1199,8 +1197,7 @@ def test_pivot_table_with_margins_set_margin_name(self, margin_name): margins_name=margin_name, ) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 - def test_pivot_timegrouper(self): + def test_pivot_timegrouper(self, using_array_manager): df = DataFrame( { "Branch": "A A A A A A A B".split(), @@ -1254,6 +1251,9 @@ def test_pivot_timegrouper(self): ) expected.index.name = "Date" expected.columns.name = "Buyer" + if using_array_manager: + # INFO(ArrayManager) column without NaNs can preserve int dtype + expected["Carl"] = expected["Carl"].astype("int64") result = pivot_table( df, diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py index ab41a94d1ff25..7801262554a5e 100644 --- a/pandas/tests/reshape/test_pivot_multilevel.py +++ b/pandas/tests/reshape/test_pivot_multilevel.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Index, @@ -198,8 +196,7 @@ def test_pivot_list_like_columns( 
tm.assert_frame_equal(result, expected) -@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat axis=0 -def test_pivot_multiindexed_rows_and_cols(): +def test_pivot_multiindexed_rows_and_cols(using_array_manager): # GH 36360 df = pd.DataFrame( @@ -221,11 +218,14 @@ def test_pivot_multiindexed_rows_and_cols(): ) expected = pd.DataFrame( - data=[[5.0, np.nan], [10.0, 7.0]], + data=[[5, np.nan], [10, 7.0]], columns=MultiIndex.from_tuples( [(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"] ), index=Int64Index([0, 1], dtype="int64", name="idx_L0"), ) + if not using_array_manager: + # BlockManager does not preserve the dtypes + expected = expected.astype("float64") tm.assert_frame_equal(res, expected)