[ArrayManager] DataFrame constructors #39991

Merged
19 commits
61983d8
[ArrayManager] DataFrame constructors
jorisvandenbossche Feb 23, 2021
1d0315f
clean-up signatures
jorisvandenbossche Feb 23, 2021
ffc8314
'fix' for PandasArrays
jorisvandenbossche Feb 23, 2021
46e73c8
tests
jorisvandenbossche Feb 23, 2021
3e108df
ensure datetime-like array
jorisvandenbossche Feb 23, 2021
854bb17
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 24, 2021
8726d42
small clean-up - additional comments
jorisvandenbossche Feb 24, 2021
6e17183
use string join for msg
jorisvandenbossche Feb 24, 2021
8096665
add github issue link to comment
jorisvandenbossche Feb 24, 2021
aef4cc8
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
9c0a3d6
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 25, 2021
1eb5cb7
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
0992e67
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Feb 26, 2021
936b290
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 1, 2021
54d36ab
move wrapping inside ArrayManager constructor
jorisvandenbossche Mar 1, 2021
c56ffa8
remove skip
jorisvandenbossche Mar 1, 2021
164387c
trigger ci
jorisvandenbossche Mar 1, 2021
143b572
add skip for rename copy
jorisvandenbossche Mar 1, 2021
6166927
Merge remote-tracking branch 'upstream/master' into am-constructors
jorisvandenbossche Mar 2, 2021
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -153,6 +153,8 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/test_constructors.py --array-manager
pytest pandas/tests/frame/constructors/ --array-manager
pytest pandas/tests/frame/test_reductions.py --array-manager
pytest pandas/tests/reductions/ --array-manager
pytest pandas/tests/generic/test_generic.py --array-manager
60 changes: 42 additions & 18 deletions pandas/core/frame.py
@@ -563,39 +563,55 @@ def __init__(
if isinstance(data, DataFrame):
data = data._mgr

if isinstance(data, (BlockManager, ArrayManager)):
if index is None and columns is None and dtype is None and copy is False:
# GH#33357 fastpath
NDFrame.__init__(self, data)
return
# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if (
index is None
and columns is None
and dtype is None
and copy is False
and isinstance(data, (BlockManager, ArrayManager))
):
# GH#33357 fastpath
NDFrame.__init__(self, data)
return

manager = get_option("mode.data_manager")

if isinstance(data, (BlockManager, ArrayManager)):
mgr = self._init_mgr(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
)

elif isinstance(data, dict):
mgr = dict_to_mgr(data, index, columns, dtype=dtype)
mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = rec_array_to_mgr(data, index, columns, dtype, copy)
mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager)

# a masked array
else:
data = sanitize_masked_array(data)
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
mgr = ndarray_to_mgr(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
# i.e. numpy structured array
mgr = rec_array_to_mgr(data, index, columns, dtype, copy)
mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager)
elif getattr(data, "name", None) is not None:
# i.e. Series/Index with non-None name
mgr = dict_to_mgr({data.name: data}, index, columns, dtype=dtype)
mgr = dict_to_mgr(
{data.name: data}, index, columns, dtype=dtype, typ=manager
)
else:
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
mgr = ndarray_to_mgr(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)

# For data is list-like, or Iterable (will consume into list)
elif is_list_like(data):
@@ -610,11 +626,15 @@ def __init__(
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
mgr = arrays_to_mgr(
arrays, columns, index, columns, dtype=dtype, typ=manager
)
else:
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
mgr = ndarray_to_mgr(
data, index, columns, dtype=dtype, copy=copy, typ=manager
)
else:
mgr = dict_to_mgr({}, index, columns, dtype=dtype)
mgr = dict_to_mgr({}, index, columns, dtype=dtype, typ=manager)
# For data is scalar
else:
if index is None or columns is None:
@@ -631,18 +651,19 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
mgr = arrays_to_mgr(
values, columns, index, columns, dtype=None, typ=manager
)
else:
values = construct_2d_arraylike_from_scalar(
data, len(index), len(columns), dtype, copy
)

mgr = ndarray_to_mgr(
values, index, columns, dtype=values.dtype, copy=False
values, index, columns, dtype=values.dtype, copy=False, typ=manager
)

# ensure correct Manager type according to settings
manager = get_option("mode.data_manager")
mgr = mgr_to_mgr(mgr, typ=manager)

NDFrame.__init__(self, mgr)
@@ -1970,7 +1991,8 @@ def from_records(
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)

mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager)

return cls(mgr)

@@ -2177,13 +2199,15 @@ def _from_arrays(
if dtype is not None:
dtype = pandas_dtype(dtype)

manager = get_option("mode.data_manager")
mgr = arrays_to_mgr(
arrays,
columns,
index,
columns,
dtype=dtype,
verify_integrity=verify_integrity,
typ=manager,
)
return cls(mgr)

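For context: the constructor now reads the "mode.data_manager" option up front and passes it as typ= to each *_to_mgr helper, instead of building a BlockManager and converting at the end. A minimal sketch of how the option selects the backing manager in the development version this PR targets (only the public option API and the private _mgr attribute are used; treat it as illustrative):

import numpy as np
import pandas as pd

# Default: DataFrames are backed by a BlockManager (consolidated 2D blocks).
pd.set_option("mode.data_manager", "block")
df_block = pd.DataFrame({"a": np.arange(3), "b": np.arange(3.0)})
print(type(df_block._mgr).__name__)  # BlockManager

# With "array", the same constructor call builds an ArrayManager,
# which stores one 1D array per column.
pd.set_option("mode.data_manager", "array")
df_array = pd.DataFrame({"a": np.arange(3), "b": np.arange(3.0)})
print(type(df_array._mgr).__name__)  # ArrayManager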
3 changes: 3 additions & 0 deletions pandas/core/generic.py
@@ -139,6 +139,7 @@
ArrayManager,
BlockManager,
)
from pandas.core.internals.construction import mgr_to_mgr
from pandas.core.missing import find_valid_index
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
@@ -5755,6 +5756,8 @@ def _to_dict_of_blocks(self, copy: bool_t = True):
Internal ONLY - only works for BlockManager
"""
mgr = self._mgr
# convert to BlockManager if needed -> this way support ArrayManager as well
mgr = mgr_to_mgr(mgr, "block")
mgr = cast(BlockManager, mgr)
return {
k: self._constructor(v).__finalize__(self)
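The mgr_to_mgr helper converts between manager types so BlockManager-only internals such as _to_dict_of_blocks keep working under the ArrayManager. A hedged sketch of the conversion (mgr_to_mgr and _mgr are internal; signatures taken from this diff, not public API):

import pandas as pd
from pandas.core.internals.construction import mgr_to_mgr

# Assuming a DataFrame built while mode.data_manager is set to "array":
pd.set_option("mode.data_manager", "array")
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

block_mgr = mgr_to_mgr(df._mgr, "block")    # rebuild 2D blocks from the per-column arrays
array_mgr = mgr_to_mgr(block_mgr, "array")  # split blocks back into per-column arrays (copies)
same_mgr = mgr_to_mgr(block_mgr, "block")   # already the requested type -> returned unchanged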
8 changes: 7 additions & 1 deletion pandas/core/internals/array_manager.py
@@ -840,7 +840,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False

value = extract_array(value, extract_numpy=True)
if value.ndim == 2:
value = value[0, :]
if value.shape[0] == 1:
value = value[0, :]
else:
raise ValueError(
f"Expected a 1D array, got an array with shape {value.shape}"
)

# TODO self.arrays can be empty
# assert len(value) == len(self.arrays[0])

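The new guard in ArrayManager.insert only unwraps a 2D value when it is a single row; a wider array can no longer be silently truncated to its first row. A standalone sketch of the same check (a hypothetical helper mirroring the diff, not the actual pandas call path):

import numpy as np

def _ensure_1d_column(value: np.ndarray) -> np.ndarray:
    # Mirror of the guard added in ArrayManager.insert (illustrative only).
    if value.ndim == 2:
        if value.shape[0] == 1:
            # a single-row 2D value can safely become a 1D column
            return value[0, :]
        raise ValueError(
            f"Expected a 1D array, got an array with shape {value.shape}"
        )
    return value

print(_ensure_1d_column(np.array([[1, 2, 3]])))  # -> [1 2 3]
# _ensure_1d_column(np.ones((2, 3))) would raise the ValueError above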
37 changes: 24 additions & 13 deletions pandas/core/internals/construction.py
@@ -69,7 +69,9 @@
get_objs_combined_axis,
union_indexes,
)
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.managers import (
BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks,
)
@@ -88,6 +90,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
typ: Optional[str] = None,
):
"""
Segregate Series based on type and coerce into matrices.
@@ -114,7 +117,12 @@
# from BlockManager perspective
axes = [columns, index]

return create_block_manager_from_arrays(arrays, arr_names, axes)
if typ == "block":
return create_block_manager_from_arrays(arrays, arr_names, axes)
elif typ == "array":
return ArrayManager(arrays, [index, columns])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")


def rec_array_to_mgr(
@@ -123,6 +131,7 @@ def rec_array_to_mgr(
columns,
dtype: Optional[DtypeObj],
copy: bool,
typ: str,
):
"""
Extract from a masked rec array and create the manager.
@@ -150,7 +159,7 @@
if columns is None:
columns = arr_columns

mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)

if copy:
mgr = mgr.copy()
@@ -180,19 +189,14 @@ def mgr_to_mgr(mgr, typ: str):
Convert to specific type of Manager. Does not copy if the type is already
correct. Does not guarantee a copy otherwise.
"""
from pandas.core.internals import (
ArrayManager,
BlockManager,
)

new_mgr: Manager

if typ == "block":
if isinstance(mgr, BlockManager):
new_mgr = mgr
else:
new_mgr = arrays_to_mgr(
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block"
)
elif typ == "array":
if isinstance(mgr, ArrayManager):
@@ -201,15 +205,17 @@
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'")
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
return new_mgr


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
def ndarray_to_mgr(
values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
):
# used in DataFrame.__init__
# input must be a ndarray, list, Series, index

@@ -239,7 +245,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool
if columns is None:
columns = Index(range(len(values)))

return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ)

# by definition an array here
# the dtypes will be coerced to a single dtype
@@ -303,7 +309,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool
return create_block_manager_from_blocks(block_values, [columns, index])


def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
@@ -349,7 +355,7 @@ def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)


def nested_data_to_arrays(
@@ -443,6 +449,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]):
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
# TODO extract_array should be preferred, but that gives failures for
# `extension/test_numpy.py` (extract_array will convert numpy arrays
# to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021
# val = extract_array(val, extract_numpy=True)
val = val._values
else:
if isinstance(val, dict):
if oindex is None:
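With these changes, all the *_to_mgr helpers take an explicit typ and arrays_to_mgr dispatches to either create_block_manager_from_arrays or the ArrayManager constructor. A hedged sketch of that dispatch as seen from the helper itself (internal functions, argument order per this diff; not public API and subject to change):

import numpy as np
from pandas import Index
from pandas.core.internals.construction import arrays_to_mgr

arrays = [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])]
columns = Index(["a", "b"])
index = Index(range(3))

# Same inputs, different backing manager depending on typ:
block_mgr = arrays_to_mgr(arrays, columns, index, columns, typ="block")  # BlockManager
array_mgr = arrays_to_mgr(arrays, columns, index, columns, typ="array")  # ArrayManager
# Any other value raises:
# ValueError: 'typ' needs to be one of {'block', 'array'}, got 'other'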
20 changes: 18 additions & 2 deletions pandas/tests/frame/constructors/test_from_records.py
@@ -6,6 +6,7 @@
import pytz

from pandas.compat import is_platform_little_endian
import pandas.util._test_decorators as td

from pandas import (
CategoricalIndex,
@@ -119,6 +120,8 @@ def test_from_records_sequencelike(self):
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_sequencelike_empty(self):
# empty case
result = DataFrame.from_records([], columns=["foo", "bar", "baz"])
assert len(result) == 0
@@ -185,7 +188,12 @@ def test_from_records_bad_index_column(self):
tm.assert_index_equal(df1.index, Index(df.C))

# should fail
msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)"
msg = "|".join(
[
r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)",
"Passed arrays should have the same length as the rows Index: 10 vs 1",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(df, index=[2])
with pytest.raises(KeyError, match=r"^2$"):
@@ -209,6 +217,7 @@ def __iter__(self):
expected = DataFrame.from_records(tups)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_len0_with_columns(self):
# GH#2633
result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
@@ -260,7 +269,12 @@ def test_from_records_to_records(self):
tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))

# wrong length
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
msg = "|".join(
[
r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
"Passed arrays should have the same length as the rows Index: 2 vs 1",
]
)
with pytest.raises(ValueError, match=msg):
DataFrame.from_records(arr, index=index[:-1])

@@ -387,6 +401,7 @@ def create_dict(order_id):
result = DataFrame.from_records(documents, index=["order_id", "quantity"])
assert result.index.names == ("order_id", "quantity")

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_misc_brokenness(self):
# GH#2179

Expand Down Expand Up @@ -425,6 +440,7 @@ def test_from_records_misc_brokenness(self):
)
tm.assert_series_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
def test_from_records_empty(self):
# GH#3562
result = DataFrame.from_records([], columns=["a", "b", "c"])
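The updated tests accept either the BlockManager or the ArrayManager error message by joining the alternatives into one regex, since pytest.raises(..., match=...) performs a regular-expression search. A small illustration of the pattern (standalone, not taken from the test file):

import pytest

msg = "|".join(
    [
        r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
        "Passed arrays should have the same length as the rows Index: 2 vs 1",
    ]
)

# Either wording satisfies the match:
with pytest.raises(ValueError, match=msg):
    raise ValueError(
        "Passed arrays should have the same length as the rows Index: 2 vs 1"
    )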