[ArrayManager] REF: Implement concat with reindexing #39612
@@ -5,14 +5,19 @@
import numpy as np

from pandas._libs import NaT, lib
from pandas._typing import ArrayLike, DtypeObj

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_ns_dtype,
    is_dtype_equal,
    is_extension_array_dtype,
    is_integer_dtype,
    is_sparse,
    is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries
@@ -21,11 +26,78 @@
from pandas.core.construction import array, ensure_wrapped_if_datetimelike
class NullArrayProxy:
    """
    Proxy object for an all-NA array.

    Only stores the length of the array, and not the dtype. The dtype
    will only be known when actually concatenating (after determining the
    common dtype, for which this proxy is ignored).
    Using this object avoids internals/concat.py having to determine
    the proper dtype and array type.
    """

    ndim = 1

    def __init__(self, n: int):
        self.n = n

    @property
    def shape(self):
        return (self.n,)

Reply to a suggested rewording of the docstring: This is strictly speaking not fully correct, as … Was there something unclear in the original sentence that I can try to improve otherwise?
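To make the deferral concrete, here is a standalone sketch (plain Python/NumPy, mirroring the class above): the proxy records only the length, and the same proxy can later materialize under different dtypes once the common dtype for the concatenation is known.

```python
import numpy as np

# Minimal standalone copy of the proxy above: it records only the length,
# so the eventual dtype can be chosen later by the concat machinery.
class NullArrayProxy:
    ndim = 1

    def __init__(self, n: int):
        self.n = n

    @property
    def shape(self):
        return (self.n,)

proxy = NullArrayProxy(3)

# The same proxy materializes differently depending on the common dtype
# found for the other arrays in the concat:
as_float = np.full(proxy.shape, np.nan)               # float64 result
as_object = np.full(proxy.shape, None, dtype=object)  # object result
```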
def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default):
    """
    Helper function to create the actual all-NA array from the NullArrayProxy object.

    Parameters
    ----------
    arr : NullArrayProxy
    dtype : the dtype for the resulting array
    fill_value : scalar NA-like value
        By default uses the ExtensionDtype's na_value or np.nan. For numpy
        arrays, this can be overridden to be something else (eg None).

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if is_extension_array_dtype(dtype):
        return dtype.construct_array_type()._from_sequence(
            [dtype.na_value] * arr.n, dtype=dtype
        )

Review comment: We should maybe consider adding a method to the ExtensionArray interface to create an empty/all-NA array of the given dtype, to avoid this rather inefficient "construct from list of NA scalars".

Reply: +1 on this.

Reply: IIRC this can break for dtypes that can't hold NA; the more robust version is … (in internals concat we use this in one place, and when I tried to change it to what you have here something broke, don't remember what off the top of my head).

Reply: Do we have an example dtype that cannot hold NAs?

Reply: Something like Sparse[int].

Reply: Indeed: … Will use your suggested code for now.

Reply: I opened #39776 for the idea of adding a method to the EA interface for creating an empty/all-NA array from a given dtype.
    elif is_datetime64_ns_dtype(dtype):
        from pandas.core.arrays import DatetimeArray

        return DatetimeArray._from_sequence([NaT] * arr.n, dtype=dtype)
    elif is_timedelta64_ns_dtype(dtype):
        from pandas.core.arrays import TimedeltaArray

        return TimedeltaArray._from_sequence([NaT] * arr.n, dtype=dtype)
    else:
        if is_integer_dtype(dtype):
            dtype = "float64"
            fill_value = np.nan
        elif is_bool_dtype(dtype):
            dtype = object

        if fill_value is lib.no_default:
            fill_value = np.nan

        arr = np.empty(arr.n, dtype=dtype)
        arr.fill(fill_value)
        return arr
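The dtype adjustments in the numpy branch can be illustrated with a self-contained sketch (a simplified stand-in for `_array_from_proxy`, numpy dtypes only; the function name here is illustrative): integer dtypes cannot hold NaN, so they upcast to float64, and bool upcasts to object.

```python
import numpy as np

def make_na_array(n, dtype, fill_value=np.nan):
    # Simplified numpy-only version of the helper above: integer dtypes
    # cannot hold NaN so they upcast to float64, and bool upcasts to
    # object to avoid coercing True/False to 1.0/0.0.
    dtype = np.dtype(dtype)
    if dtype.kind in "iu":   # signed/unsigned integers
        dtype = np.dtype("float64")
    elif dtype.kind == "b":  # booleans
        dtype = np.dtype(object)
    arr = np.empty(n, dtype=dtype)
    arr.fill(fill_value)
    return arr
```

For example, asking for an all-NA `int64` array yields a `float64` array of NaN, while a `bool` request yields an object array.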
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if isinstance(arr, NullArrayProxy):
        return _array_from_proxy(arr, dtype)

    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
@@ -132,6 +204,75 @@ def is_nonempty(x) -> bool:
    return np.concatenate(to_concat, axis=axis)
def concat_arrays(to_concat):
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword) and does not skip
    empty arrays to determine the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
    single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1
    any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat_no_proxy)

    if any_ea:
        if not single_dtype:
            target_dtype = find_common_type([x.dtype for x in to_concat_no_proxy])
            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
        else:
            target_dtype = to_concat_no_proxy[0].dtype
            to_concat = [
                _array_from_proxy(arr, target_dtype)
                if isinstance(arr, NullArrayProxy)
                else arr
                for arr in to_concat
            ]

        if isinstance(to_concat[0], ExtensionArray):
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            return np.concatenate(to_concat)

    elif any(kind in ["m", "M"] for kind in kinds):
        return _concat_datetime(to_concat)

    if not single_dtype:
        target_dtype = np.find_common_type(
            [arr.dtype for arr in to_concat_no_proxy], []
        )
    else:
        target_dtype = to_concat_no_proxy[0].dtype
    to_concat = [
        _array_from_proxy(arr, target_dtype) if isinstance(arr, NullArrayProxy) else arr
        for arr in to_concat
    ]

    result = np.concatenate(to_concat)

    # TODO(ArrayManager) this is currently inconsistent between Series and DataFrame
    # so we should decide whether to keep the below special case or remove it
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        if len(kinds) != 1:
            if "b" in kinds:
                result = result.astype(object)
    return result

Review comment: types if you can.

Review comment: if we have different impls for array vs block manager then let's push this code down into those areas (so it's localized). OK for a followup.

Reply: the problem is the logic is too scattered and very hard to grok where things are happening.

Review comment: this looks really similar to the non-AM code. Can't it be shared?

Reply: This … but that resulted in quite some special cases inside …

Reply: See my original comment about it at #39612 (comment). Yes, it's really similar. But it's also slightly different in many places. Now, when I tried this originally, I only checked for the null proxy object when needed (giving lots of if/elses, making it hard to read). But of course I could also simply always check for it. And for the non-ArrayManager cases this proxy object will never be present, so apart from doing a bunch of unnecessary checks, it shouldn't matter. If you are OK with complicating the base …

Review comment: is this behavior consistent between AM and BM?

Reply: Hmm, not sure anymore where I saw this, as for both BlockManager/ArrayManager this gives object dtype with both Series and DataFrame (at least with the latest version of this PR). So will remove the comment.

Reply: Ah, found my notes about this: this is not directly related to Block vs ArrayManager, but something I noticed here that is inconsistent between Series vs DataFrame and empty vs non-empty. For Series, with non-empty, we actually coerce to numeric, while if we have empty bool + float, we get object dtype: … While DataFrame uses object dtype in both cases (will open a separate issue about it). But the reason this came up specifically for ArrayManager is that we now follow the same rules for Series/DataFrame, and thus also get numeric for non-empty in the DataFrame case.

Reply: Opened #39817 about this.

Review comment: see my comments above, the non-internals code should ideally not have these hacks here. This makes it really hard to follow anything. Can you do something about this?

Reply: Shall I simply cut and paste this function to …? (Note that my question is slightly ironic, because it doesn't change anything fundamentally, just the semantics of what is considered "internal". But I don't really understand what the problem here is.)
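The dtype-resolution flow of `concat_arrays` can be sketched end-to-end for the plain-numpy case (a simplified, self-contained version with a hypothetical name; `np.result_type` stands in for the dtype-finding helpers, and proxies are materialized as NaN, so this sketch covers numeric dtypes only):

```python
import numpy as np

class NullArrayProxy:
    # minimal stub of the proxy, for illustration only
    def __init__(self, n):
        self.n = n

def concat_arrays_sketch(to_concat):
    # 1. ignore the proxies when determining the target dtype
    no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
    target = np.result_type(*(a.dtype for a in no_proxy))
    # 2. materialize each proxy as an all-NA array of the target dtype
    materialized = [
        np.full(x.n, np.nan, dtype=target) if isinstance(x, NullArrayProxy)
        else x.astype(target, copy=False)
        for x in to_concat
    ]
    # 3. plain concatenate now that all dtypes agree
    return np.concatenate(materialized)

result = concat_arrays_sketch(
    [np.array([1, 2]), NullArrayProxy(2), np.array([3.5])]
)
```

Here the int and float inputs promote to float64, and the length-2 proxy becomes two NaNs in the middle of the result.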
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
@@ -322,20 +463,35 @@ def _concat_datetime(to_concat, axis=0):
    a single array, preserving the combined dtypes
    """
    to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

-   single_dtype = len({x.dtype for x in to_concat}) == 1
+   single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1

    # multiple types, need to coerce to object
    if not single_dtype:
        # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
        # in Timestamp/Timedelta
        to_concat = [
            _array_from_proxy(arr, dtype=object, fill_value=None)
            if isinstance(arr, NullArrayProxy)
            else arr
            for arr in to_concat
        ]

        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)

    if axis == 1:
        # TODO(EA2D): kludge not necessary with 2D EAs
        to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat]
    else:
        to_concat = [
            _array_from_proxy(arr, dtype=to_concat_no_proxy[0].dtype)
            if isinstance(arr, NullArrayProxy)
            else arr
            for arr in to_concat
        ]

-   result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
+   result = type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=axis)

    if result.ndim == 2 and is_extension_array_dtype(result.dtype):
        # TODO(EA2D): kludge not necessary with 2D EAs
@@ -18,6 +18,7 @@
    is_extension_array_dtype,
    is_numeric_dtype,
)
+from pandas.core.dtypes.concat import NullArrayProxy
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -725,10 +726,20 @@ def reindex_indexer(
        # ignored keywords
        consolidate: bool = True,
        only_slice: bool = False,
+       # ArrayManager specific keywords
+       do_integrity_check=True,
+       use_na_proxy=False,
    ) -> T:
        axis = self._normalize_axis(axis)
        return self._reindex_indexer(
-           new_axis, indexer, axis, fill_value, allow_dups, copy
+           new_axis,
+           indexer,
+           axis,
+           fill_value,
+           allow_dups,
+           copy,
+           do_integrity_check,
+           use_na_proxy,
        )

    def _reindex_indexer(
@@ -739,6 +750,8 @@ def _reindex_indexer(
        fill_value=None,
        allow_dups: bool = False,
        copy: bool = True,
+       do_integrity_check=True,
+       use_na_proxy=False,
    ) -> T:
        """
        Parameters

Review comment: annotate.
@@ -773,7 +786,9 @@ def _reindex_indexer(
            new_arrays = []
            for i in indexer:
                if i == -1:
-                   arr = self._make_na_array(fill_value=fill_value)
+                   arr = self._make_na_array(
+                       fill_value=fill_value, use_na_proxy=use_na_proxy
+                   )
                else:
                    arr = self.arrays[i]
                new_arrays.append(arr)
@@ -793,7 +808,7 @@ def _reindex_indexer(
        new_axes = list(self._axes)
        new_axes[axis] = new_axis

-       return type(self)(new_arrays, new_axes)
+       return type(self)(new_arrays, new_axes, do_integrity_check=do_integrity_check)

    def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True):
        """
@@ -820,10 +835,11 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True
            new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
        )

-   def _make_na_array(self, fill_value=None):
+   def _make_na_array(self, fill_value=None, use_na_proxy=False):
        if use_na_proxy:
            return NullArrayProxy(self.shape_proper[0])

        if fill_value is None:
            fill_value = np.nan

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        values = np.empty(self.shape_proper[0], dtype=dtype)
        values.fill(fill_value)

Review comment: do we need ArrayLike to include NullArrayProxy?

Review comment: is the rest of this method ndarray-only?

Reply: Yes
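The `-1` convention in the `_reindex_indexer` column loop can be shown with a small standalone sketch (hypothetical helper name; `None` stands in for the proxy when `use_na_proxy` is set):

```python
import numpy as np

def reindex_columns(arrays, indexer, n_rows, use_na_proxy=False):
    # Sketch of the column loop above: entries of -1 mark columns missing
    # from this manager; they become either a lazy placeholder (here None,
    # standing in for NullArrayProxy) or an eagerly filled all-NaN array.
    out = []
    for i in indexer:
        if i == -1:
            out.append(None if use_na_proxy else np.full(n_rows, np.nan))
        else:
            out.append(arrays[i])
    return out

cols = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
eager = reindex_columns(cols, [0, -1, 1], n_rows=2)
lazy = reindex_columns(cols, [0, -1, 1], n_rows=2, use_na_proxy=True)
```

With `use_na_proxy=True`, the fill column stays a cheap placeholder until concatenation decides the common dtype; without it, a NaN array is materialized immediately.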
@@ -23,7 +23,7 @@
    is_sparse,
    is_timedelta64_dtype,
)
-from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.concat import concat_arrays, concat_compat
from pandas.core.dtypes.missing import isna_all

import pandas.core.algorithms as algos
@@ -37,6 +37,45 @@
from pandas.core.arrays.sparse.dtype import SparseDtype
def concatenate_array_managers(
    mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:
    """
    Concatenate array managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    ArrayManager
    """
    # reindex all arrays
    mgrs = []
    for mgr, indexers in mgrs_indexers:
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[ax], indexer, axis=ax, do_integrity_check=False, use_na_proxy=True
            )
        mgrs.append(mgr)

    # concatting along the rows -> concat the reindexed arrays
    if concat_axis == 1:
        arrays = [
            concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
            for j in range(len(mgrs[0].arrays))
        ]
        return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
    # concatting along the columns -> combine reindexed arrays in a single manager
    elif concat_axis == 0:
        arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
        return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)

Review comment: same comment as above.

Review comment: can you now remove concat_compat?

Review comment: and reading your comment below, why is this not in the array manager if it's only used there?

Reply: Because this is the code for concatting managers, which for BlockManager also resides in internals/concat.py?
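The two branches of `concatenate_array_managers` reduce to a simple pattern over lists of 1D column arrays, sketched here with plain numpy (illustrative variable names):

```python
import itertools

import numpy as np

# Two managers, each stored as a list of 1D column arrays.
mgr_a = [np.array([1.0, 2.0]), np.array([10.0, 20.0])]
mgr_b = [np.array([3.0]), np.array([30.0])]

# concat_axis == 1 (stacking rows): column j of the result is the
# concatenation of column j from every manager.
row_stacked = [
    np.concatenate([mgr[j] for mgr in (mgr_a, mgr_b)])
    for j in range(len(mgr_a))
]

# concat_axis == 0 (adding columns): the column lists are simply chained,
# no array-level concatenation is needed at all.
col_stacked = list(itertools.chain.from_iterable([mgr_a, mgr_b]))
```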
def concatenate_block_managers(
    mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:

@@ -55,19 +94,7 @@ def concatenate_block_managers(
    BlockManager
    """
    if isinstance(mgrs_indexers[0][0], ArrayManager):
-
-       if concat_axis == 1:
-           # TODO for now only fastpath without indexers
-           mgrs = [t[0] for t in mgrs_indexers]
-           arrays = [
-               concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0)
-               for j in range(len(mgrs[0].arrays))
-           ]
-           return ArrayManager(arrays, [axes[1], axes[0]])
-       elif concat_axis == 0:
-           mgrs = [t[0] for t in mgrs_indexers]
-           arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
-           return ArrayManager(arrays, [axes[1], axes[0]])
+       return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
Review comment: another option, if we don't want a whole new thing, would be to use np.empty(shape, dtype="V") (which has obj.nbytes = 0).

Reply: That's indeed an option as well. It wouldn't really simplify the code, as we would still need all of what is now in to_array and all the checking for this object/dtype in concat_arrays, but it would indeed avoid the custom class. (Personally, I find the custom class a bit more explicit than using a dtype we otherwise don't use, so I would choose that, but I am fine either way.)