Skip to content

Commit

Permalink
[ArrayManager] REF: Implement concat with reindexing (#39612)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Apr 12, 2021
1 parent e97c766 commit f0c4093
Show file tree
Hide file tree
Showing 12 changed files with 184 additions and 41 deletions.
6 changes: 4 additions & 2 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@
)


def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
"""
Helper function for `arr.astype(common_dtype)` but handling all special
cases.
"""
if is_dtype_equal(arr.dtype, dtype):
return arr
if (
is_categorical_dtype(arr.dtype)
and isinstance(dtype, np.dtype)
Expand Down Expand Up @@ -121,7 +123,7 @@ def is_nonempty(x) -> bool:
# for axis=0
if not single_dtype:
target_dtype = find_common_type([x.dtype for x in to_concat])
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]

if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
Expand Down
71 changes: 68 additions & 3 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
)
from pandas._typing import (
ArrayLike,
DtypeObj,
Hashable,
)
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
astype_array_safe,
ensure_dtype_can_hold_na,
infer_dtype_from_scalar,
soft_convert_objects,
)
Expand All @@ -49,6 +51,7 @@
from pandas.core.dtypes.missing import (
array_equals,
isna,
na_value_for_dtype,
)

import pandas.core.algorithms as algos
Expand Down Expand Up @@ -952,10 +955,18 @@ def reindex_indexer(
# ignored keywords
consolidate: bool = True,
only_slice: bool = False,
# ArrayManager specific keywords
use_na_proxy: bool = False,
) -> T:
axis = self._normalize_axis(axis)
return self._reindex_indexer(
new_axis, indexer, axis, fill_value, allow_dups, copy
new_axis,
indexer,
axis,
fill_value,
allow_dups,
copy,
use_na_proxy,
)

def _reindex_indexer(
Expand All @@ -966,6 +977,7 @@ def _reindex_indexer(
fill_value=None,
allow_dups: bool = False,
copy: bool = True,
use_na_proxy: bool = False,
) -> T:
"""
Parameters
Expand Down Expand Up @@ -1000,7 +1012,9 @@ def _reindex_indexer(
new_arrays = []
for i in indexer:
if i == -1:
arr = self._make_na_array(fill_value=fill_value)
arr = self._make_na_array(
fill_value=fill_value, use_na_proxy=use_na_proxy
)
else:
arr = self.arrays[i]
new_arrays.append(arr)
Expand Down Expand Up @@ -1051,7 +1065,11 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
)

def _make_na_array(self, fill_value=None):
def _make_na_array(self, fill_value=None, use_na_proxy=False):
if use_na_proxy:
assert fill_value is None
return NullArrayProxy(self.shape_proper[0])

if fill_value is None:
fill_value = np.nan

Expand Down Expand Up @@ -1271,3 +1289,50 @@ def set_values(self, values: ArrayLike):
valid for the current SingleArrayManager (length, dtype, etc).
"""
self.arrays[0] = values


class NullArrayProxy:
"""
Proxy object for an all-NA array.
Only stores the length of the array, and not the dtype. The dtype
will only be known when actually concatenating (after determining the
common dtype, for which this proxy is ignored).
Using this object avoids that the internals/concat.py needs to determine
the proper dtype and array type.
"""

ndim = 1

def __init__(self, n: int):
self.n = n

@property
def shape(self):
return (self.n,)

def to_array(self, dtype: DtypeObj) -> ArrayLike:
"""
Helper function to create the actual all-NA array from the NullArrayProxy
object.
Parameters
----------
arr : NullArrayProxy
dtype : the dtype for the resulting array
Returns
-------
np.ndarray or ExtensionArray
"""
if isinstance(dtype, ExtensionDtype):
empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
indexer = -np.ones(self.n, dtype=np.intp)
return empty.take(indexer, allow_fill=True)
else:
# when introducing missing values, int becomes float, bool becomes object
dtype = ensure_dtype_can_hold_na(dtype)
fill_value = na_value_for_dtype(dtype)
arr = np.empty(self.n, dtype=dtype)
arr.fill(fill_value)
return ensure_wrapped_if_datetimelike(arr)
78 changes: 74 additions & 4 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
is_extension_array_dtype,
is_sparse,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.concat import (
cast_to_common_type,
concat_compat,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
Expand All @@ -42,7 +45,10 @@
ExtensionArray,
)
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.array_manager import (
ArrayManager,
NullArrayProxy,
)
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block,
Expand Down Expand Up @@ -74,14 +80,16 @@ def _concatenate_array_managers(
mgrs = []
for mgr, indexers in mgrs_indexers:
for ax, indexer in indexers.items():
mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True)
mgr = mgr.reindex_indexer(
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
)
mgrs.append(mgr)

if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))])
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
Expand All @@ -92,6 +100,68 @@ def _concatenate_array_managers(
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)


def concat_arrays(to_concat: list) -> ArrayLike:
"""
Alternative for concat_compat but specialized for use in the ArrayManager.
Differences: only deals with 1D arrays (no axis keyword), assumes
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
the dtype.
In addition ensures that all NullArrayProxies get replaced with actual
arrays.
Parameters
----------
to_concat : list of arrays
Returns
-------
np.ndarray or ExtensionArray
"""
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1

if not single_dtype:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
else:
target_dtype = to_concat_no_proxy[0].dtype

if target_dtype.kind in ["m", "M"]:
# for datetimelike use DatetimeArray/TimedeltaArray concatenation
# don't use arr.astype(target_dtype, copy=False), because that doesn't
# work for DatetimeArray/TimedeltaArray (returns ndarray)
to_concat = [
arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else arr
for arr in to_concat
]
return type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=0)

to_concat = [
arr.to_array(target_dtype)
if isinstance(arr, NullArrayProxy)
else cast_to_common_type(arr, target_dtype)
for arr in to_concat
]

if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)

result = np.concatenate(to_concat)

# TODO decide on exact behaviour (we shouldn't do this only for empty result)
# see https://github.com/pandas-dev/pandas/issues/39817
if len(result) == 0:
# all empties -> check for bool to not coerce to float
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
if len(kinds) != 1:
if "b" in kinds:
result = result.astype(object)
return result


def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
) -> Manager:
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas.api.extensions import ExtensionArray
from pandas.core.internals import ExtensionBlock
Expand Down Expand Up @@ -111,7 +109,6 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
result = pd.concat([df1, df2], axis=1, copy=False)
self.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat reindex
def test_concat_with_reindex(self, data):
# GH-33027
a = pd.DataFrame({"a": data[:5]})
Expand Down
22 changes: 11 additions & 11 deletions pandas/tests/frame/methods/test_append.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -13,9 +11,6 @@
)
import pandas._testing as tm

# TODO td.skip_array_manager_not_yet_implemented
# appending with reindexing not yet working


class TestDataFrameAppend:
def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series):
Expand Down Expand Up @@ -43,7 +38,6 @@ def test_append_empty_list(self):
tm.assert_frame_equal(result, expected)
assert result is not df # .append() should return a new object

@td.skip_array_manager_not_yet_implemented
def test_append_series_dict(self):
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])

Expand Down Expand Up @@ -84,7 +78,6 @@ def test_append_series_dict(self):
expected = df.append(df[-1:], ignore_index=True)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented
def test_append_list_of_series_dicts(self):
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])

Expand All @@ -103,7 +96,6 @@ def test_append_list_of_series_dicts(self):
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented
def test_append_missing_cols(self):
# GH22252
# exercise the conditional branch in append method where the data
Expand Down Expand Up @@ -148,8 +140,7 @@ def test_append_empty_dataframe(self):
expected = df1.copy()
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented
def test_append_dtypes(self):
def test_append_dtypes(self, using_array_manager):

# GH 5754
# row appends of different dtypes (so need to do by-item)
Expand All @@ -173,6 +164,10 @@ def test_append_dtypes(self):
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
if using_array_manager:
# TODO(ArrayManager) decide on exact casting rules in concat
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
Expand All @@ -181,6 +176,9 @@ def test_append_dtypes(self):
expected = DataFrame(
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
)
if using_array_manager:
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": np.nan}, index=range(1))
Expand All @@ -189,6 +187,9 @@ def test_append_dtypes(self):
expected = DataFrame(
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
)
if using_array_manager:
# With ArrayManager, all-NaN float is not ignored
expected = expected.astype(object)
tm.assert_frame_equal(result, expected)

df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
Expand All @@ -208,7 +209,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp):
expected = Series(Timestamp(timestamp, tz=tz), name=0)
tm.assert_series_equal(result, expected)

@td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize(
"data, dtype",
[
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/reshape/concat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
import pandas.util._test_decorators as td

# TODO(ArrayManager) concat axis=0
pytestmark = td.skip_array_manager_not_yet_implemented
6 changes: 6 additions & 0 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -338,6 +340,10 @@ def test_append_missing_column_proper_upcast(self, sort):
assert appended["A"].dtype == "f8"
assert appended["B"].dtype == "O"

# TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving
# float dtype) -> delay reindexing until concat_array_managers which properly
# takes care of all-null dtype inference
@td.skip_array_manager_not_yet_implemented
def test_append_empty_frame_to_series_with_dateutil_tz(self):
# GH 23682
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
Expand Down
Loading

0 comments on commit f0c4093

Please sign in to comment.