[ArrayManager] REF: Implement concat with reindexing (#39612)

pandas-dev · Apr 12, 2021 · f0c4093 · f0c4093
1 parent e97c766
commit f0c4093
Show file tree

Hide file tree

Showing 12 changed files with 184 additions and 41 deletions.
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -30,11 +30,13 @@
 )
 
 
-def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
+def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
     """
     Helper function for `arr.astype(common_dtype)` but handling all special
     cases.
     """
+    if is_dtype_equal(arr.dtype, dtype):
+        return arr
     if (
         is_categorical_dtype(arr.dtype)
         and isinstance(dtype, np.dtype)
@@ -121,7 +123,7 @@ def is_nonempty(x) -> bool:
         # for axis=0
         if not single_dtype:
             target_dtype = find_common_type([x.dtype for x in to_concat])
-            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
+            to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]
 
         if isinstance(to_concat[0], ExtensionArray):
             cls = type(to_concat[0])

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -18,12 +18,14 @@
 )
 from pandas._typing import (
     ArrayLike,
+    DtypeObj,
     Hashable,
 )
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
     astype_array_safe,
+    ensure_dtype_can_hold_na,
     infer_dtype_from_scalar,
     soft_convert_objects,
 )
@@ -49,6 +51,7 @@
 from pandas.core.dtypes.missing import (
     array_equals,
     isna,
+    na_value_for_dtype,
 )
 
 import pandas.core.algorithms as algos
@@ -952,10 +955,18 @@ def reindex_indexer(
         # ignored keywords
         consolidate: bool = True,
         only_slice: bool = False,
+        # ArrayManager specific keywords
+        use_na_proxy: bool = False,
     ) -> T:
         axis = self._normalize_axis(axis)
         return self._reindex_indexer(
-            new_axis, indexer, axis, fill_value, allow_dups, copy
+            new_axis,
+            indexer,
+            axis,
+            fill_value,
+            allow_dups,
+            copy,
+            use_na_proxy,
         )
 
     def _reindex_indexer(
@@ -966,6 +977,7 @@ def _reindex_indexer(
         fill_value=None,
         allow_dups: bool = False,
         copy: bool = True,
+        use_na_proxy: bool = False,
     ) -> T:
         """
         Parameters
@@ -1000,7 +1012,9 @@ def _reindex_indexer(
             new_arrays = []
             for i in indexer:
                 if i == -1:
-                    arr = self._make_na_array(fill_value=fill_value)
+                    arr = self._make_na_array(
+                        fill_value=fill_value, use_na_proxy=use_na_proxy
+                    )
                 else:
                     arr = self.arrays[i]
                 new_arrays.append(arr)
@@ -1051,7 +1065,11 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
             new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
         )
 
-    def _make_na_array(self, fill_value=None):
+    def _make_na_array(self, fill_value=None, use_na_proxy=False):
+        if use_na_proxy:
+            assert fill_value is None
+            return NullArrayProxy(self.shape_proper[0])
+
         if fill_value is None:
             fill_value = np.nan
 
@@ -1271,3 +1289,50 @@ def set_values(self, values: ArrayLike):
         valid for the current SingleArrayManager (length, dtype, etc).
         """
         self.arrays[0] = values
+
+
+class NullArrayProxy:
+    """
+    Proxy object for an all-NA array.
+
+    Only stores the length of the array, and not the dtype. The dtype
+    will only be known when actually concatenating (after determining the
+    common dtype, for which this proxy is ignored).
+    Using this object avoids the need for internals/concat.py to determine
+    the proper dtype and array type.
+    """
+
+    ndim = 1
+
+    def __init__(self, n: int):
+        self.n = n
+
+    @property
+    def shape(self):
+        return (self.n,)
+
+    def to_array(self, dtype: DtypeObj) -> ArrayLike:
+        """
+        Helper function to create the actual all-NA array from the NullArrayProxy
+        object.
+
+        Parameters
+        ----------
+        dtype : DtypeObj
+            The dtype for the resulting array.
+
+        Returns
+        -------
+        np.ndarray or ExtensionArray
+        """
+        if isinstance(dtype, ExtensionDtype):
+            empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
+            indexer = -np.ones(self.n, dtype=np.intp)
+            return empty.take(indexer, allow_fill=True)
+        else:
+            # when introducing missing values, int becomes float, bool becomes object
+            dtype = ensure_dtype_can_hold_na(dtype)
+            fill_value = na_value_for_dtype(dtype)
+            arr = np.empty(self.n, dtype=dtype)
+            arr.fill(fill_value)
+            return ensure_wrapped_if_datetimelike(arr)
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
@@ -28,7 +28,10 @@
     is_extension_array_dtype,
     is_sparse,
 )
-from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.concat import (
+    cast_to_common_type,
+    concat_compat,
+)
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.missing import (
     is_valid_na_for_dtype,
@@ -42,7 +45,10 @@
     ExtensionArray,
 )
 from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.internals.array_manager import ArrayManager
+from pandas.core.internals.array_manager import (
+    ArrayManager,
+    NullArrayProxy,
+)
 from pandas.core.internals.blocks import (
     ensure_block_shape,
     new_block,
@@ -74,14 +80,16 @@ def _concatenate_array_managers(
     mgrs = []
     for mgr, indexers in mgrs_indexers:
         for ax, indexer in indexers.items():
-            mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True)
+            mgr = mgr.reindex_indexer(
+                axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
+            )
         mgrs.append(mgr)
 
     if concat_axis == 1:
         # concatting along the rows -> concat the reindexed arrays
         # TODO(ArrayManager) doesn't yet preserve the correct dtype
         arrays = [
-            concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))])
+            concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
             for j in range(len(mgrs[0].arrays))
         ]
         return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
@@ -92,6 +100,68 @@ def _concatenate_array_managers(
         return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
 
 
+def concat_arrays(to_concat: list) -> ArrayLike:
+    """
+    Alternative for concat_compat but specialized for use in the ArrayManager.
+
+    Differences: only deals with 1D arrays (no axis keyword), assumes the
+    arrays were already passed through ensure_wrapped_if_datetimelike, and
+    does not skip empty arrays when determining the dtype.
+    In addition ensures that all NullArrayProxies get replaced with actual
+    arrays.
+
+    Parameters
+    ----------
+    to_concat : list of arrays
+
+    Returns
+    -------
+    np.ndarray or ExtensionArray
+    """
+    # ignore the all-NA proxies to determine the resulting dtype
+    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
+
+    single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1
+
+    if not single_dtype:
+        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
+    else:
+        target_dtype = to_concat_no_proxy[0].dtype
+
+    if target_dtype.kind in ["m", "M"]:
+        # for datetimelike use DatetimeArray/TimedeltaArray concatenation
+        # don't use arr.astype(target_dtype, copy=False), because that doesn't
+        # work for DatetimeArray/TimedeltaArray (returns ndarray)
+        to_concat = [
+            arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr
+            for arr in to_concat
+        ]
+        return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0)
+
+    to_concat = [
+        arr.to_array(target_dtype)
+        if isinstance(arr, NullArrayProxy)
+        else cast_to_common_type(arr, target_dtype)
+        for arr in to_concat
+    ]
+
+    if isinstance(to_concat[0], ExtensionArray):
+        cls = type(to_concat[0])
+        return cls._concat_same_type(to_concat)
+
+    result = np.concatenate(to_concat)
+
+    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
+    # see https://github.com/pandas-dev/pandas/issues/39817
+    if len(result) == 0:
+        # all empties -> check for bool to not coerce to float
+        kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
+        if len(kinds) != 1:
+            if "b" in kinds:
+                result = result.astype(object)
+    return result
+
+
 def concatenate_managers(
     mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
 ) -> Manager:

diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas.api.extensions import ExtensionArray
 from pandas.core.internals import ExtensionBlock
@@ -111,7 +109,6 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
         result = pd.concat([df1, df2], axis=1, copy=False)
         self.assert_frame_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) concat reindex
     def test_concat_with_reindex(self, data):
         # GH-33027
         a = pd.DataFrame({"a": data[:5]})

diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -13,9 +11,6 @@
 )
 import pandas._testing as tm
 
-# TODO td.skip_array_manager_not_yet_implemented
-# appending with reindexing not yet working
-
 
 class TestDataFrameAppend:
     def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series):
@@ -43,7 +38,6 @@ def test_append_empty_list(self):
         tm.assert_frame_equal(result, expected)
         assert result is not df  # .append() should return a new object
 
-    @td.skip_array_manager_not_yet_implemented
     def test_append_series_dict(self):
         df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
 
@@ -84,7 +78,6 @@ def test_append_series_dict(self):
         expected = df.append(df[-1:], ignore_index=True)
         tm.assert_frame_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented
     def test_append_list_of_series_dicts(self):
         df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
 
@@ -103,7 +96,6 @@ def test_append_list_of_series_dicts(self):
         expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
         tm.assert_frame_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented
     def test_append_missing_cols(self):
         # GH22252
         # exercise the conditional branch in append method where the data
@@ -148,8 +140,7 @@ def test_append_empty_dataframe(self):
         expected = df1.copy()
         tm.assert_frame_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented
-    def test_append_dtypes(self):
+    def test_append_dtypes(self, using_array_manager):
 
         # GH 5754
         # row appends of different dtypes (so need to do by-item)
@@ -173,6 +164,10 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
+        if using_array_manager:
+            # TODO(ArrayManager) decide on exact casting rules in concat
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -181,6 +176,9 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
+        if using_array_manager:
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": np.nan}, index=range(1))
@@ -189,6 +187,9 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
         )
+        if using_array_manager:
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -208,7 +209,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp):
         expected = Series(Timestamp(timestamp, tz=tz), name=0)
         tm.assert_series_equal(result, expected)
 
-    @td.skip_array_manager_not_yet_implemented
     @pytest.mark.parametrize(
         "data, dtype",
         [

diff --git a/pandas/tests/reshape/concat/__init__.py b/pandas/tests/reshape/concat/__init__.py
@@ -1,4 +0,0 @@
-import pandas.util._test_decorators as td
-
-# TODO(ArrayManager) concat axis=0
-pytestmark = td.skip_array_manager_not_yet_implemented

diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -338,6 +340,10 @@ def test_append_missing_column_proper_upcast(self, sort):
         assert appended["A"].dtype == "f8"
         assert appended["B"].dtype == "O"
 
+    # TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving
+    # float dtype) -> delay reindexing until concat_array_managers which properly
+    # takes care of all-null dtype inference
+    @td.skip_array_manager_not_yet_implemented
     def test_append_empty_frame_to_series_with_dateutil_tz(self):
         # GH 23682
         date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())