Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concatenate dictionary of objects along axis=1 #15160

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 85 additions & 27 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):

Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list or dictionary of DataFrame, Series, or Index
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
`axis=1` must be passed if a dictionary is passed.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
Expand Down Expand Up @@ -229,13 +230,28 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
letter number animal name
0 a 1 bird polly
1 b 2 monkey george

Combine a dictionary of DataFrame objects horizontally:

>>> d = {'first': df1, 'second': df2}
>>> cudf.concat(d, axis=1)
first second
letter number letter number
0 a 1 c 3
1 b 2 d 4
"""
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
raise ValueError("No objects to concatenate")

objs = [obj for obj in objs if obj is not None]
if isinstance(objs, dict):
objs = {k: obj for k, obj in objs.items() if obj is not None}
keys = list(objs)
objs = list(objs.values())
else:
objs = [obj for obj in objs if obj is not None]
keys = None

if not objs:
raise ValueError("All objects passed were None")
Expand All @@ -249,7 +265,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
# Return for single object
if len(objs) == 1:
obj = objs[0]

if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
Expand Down Expand Up @@ -280,6 +295,11 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
else:
if axis == 0:
result = obj.copy()
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)
else:
data = obj._data.copy(deep=True)
if isinstance(obj, cudf.Series) and obj.name is None:
Expand All @@ -288,6 +308,19 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result.columns
shwina marked this conversation as resolved.
Show resolved Hide resolved
]
)

result.columns = cudf.MultiIndex.from_product(
[keys, result.columns]
shwina marked this conversation as resolved.
Show resolved Hide resolved
)

if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
Expand Down Expand Up @@ -351,35 +384,54 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col
if keys is None:
for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
.unique()
)

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
)
# need to create a MultiIndex column
else:
for k, o in zip(keys, objs):
for name, col in o._data.items():
# the existing column might be multiindex
if not isinstance(name, tuple):
name = (name,)
if empty_inner:
df[(k, *name)] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[(k, *name)] = col

# MultiIndex construction here
result_columns = cudf.MultiIndex.from_tuples(df.columns)
shwina marked this conversation as resolved.
Show resolved Hide resolved

if ignore_index:
# with ignore_index the column names change to numbers
df.columns = pd.RangeIndex(len(result_columns.unique()))
df.columns = pd.RangeIndex(len(result_columns))
else:
df.columns = result_columns.unique()
df.columns = result_columns

if empty_inner:
# if join is inner and it contains an empty df
Expand All @@ -389,6 +441,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
return df

# If we get here, we are always concatenating along axis 0 (the rows).
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)

typ = list(typs)[0]
if len(typs) > 1:
if allowed_typs == typs:
Expand Down
35 changes: 35 additions & 0 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1889,3 +1889,38 @@ def test_concat_mixed_list_types_error(s1, s2):

with pytest.raises(NotImplementedError):
cudf.concat([s1, s2], ignore_index=True)


@pytest.mark.parametrize(
    "d",
    [
        # Two frames sharing the same column names.
        {
            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
        },
        # A dictionary holding a single frame.
        {"first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]})},
        # Three frames, the last one with a different length and column.
        {
            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
            "third": cudf.DataFrame({"C": [1, 2, 3]}),
        },
        # Series values instead of frames.
        {"first": cudf.Series([1, 2, 3]), "second": cudf.Series([4, 5, 6])},
    ],
)
def test_concat_dictionary(d):
    """Compare ``cudf.concat`` on a dictionary (axis=1) against pandas.

    The same dictionary, with every value converted through
    ``to_pandas()``, is passed to ``pd.concat`` and the pandas result is
    converted back with ``cudf.from_pandas`` to serve as the reference.
    """
    actual = cudf.concat(d, axis=1)

    pandas_objs = {}
    for key, obj in d.items():
        pandas_objs[key] = obj.to_pandas()
    reference = cudf.from_pandas(pd.concat(pandas_objs, axis=1))

    assert_eq(reference, actual)


def test_concat_dict_incorrect_type():
    """Check that a dictionary with an unsupported value type is rejected.

    Passing a dictionary whose only value is a ``cudf.Index`` to
    ``cudf.concat`` with ``axis=1`` is expected to raise ``TypeError``
    with a message naming the offending type.
    """
    import re

    d = {
        "first": cudf.Index([1, 2, 3]),
    }
    # ``match`` is interpreted as a regular expression, and the repr of a
    # type contains regex metacharacters (the dots in its qualified name),
    # so escape the message to require a literal match.
    expected_message = re.escape(
        f"cannot concatenate object of type {type(d['first'])}"
    )
    with pytest.raises(TypeError, match=expected_message):
        cudf.concat(d, axis=1)
Loading