Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concatenate dictionary of objects along axis=1 #15160

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 114 additions & 45 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):

Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list or dictionary of DataFrame, Series, or Index
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
`axis=1` must be passed if a dictionary is passed.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
Expand Down Expand Up @@ -229,13 +230,28 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
letter number animal name
0 a 1 bird polly
1 b 2 monkey george

Combine a dictionary of DataFrame objects horizontally:

>>> d = {'first': df1, 'second': df2}
>>> cudf.concat(d, axis=1)
first second
letter number letter number
0 a 1 c 3
1 b 2 d 4
"""
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
raise ValueError("No objects to concatenate")

objs = [obj for obj in objs if obj is not None]
if isinstance(objs, dict):
objs = {k: obj for k, obj in objs.items() if obj is not None}
keys = list(objs)
objs = list(objs.values())
else:
objs = [obj for obj in objs if obj is not None]
keys = None

if not objs:
raise ValueError("All objects passed were None")
Expand All @@ -246,10 +262,27 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}'
)

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
typs = set()
for o in objs:
if isinstance(o, cudf.MultiIndex):
typs.add(cudf.MultiIndex)
elif isinstance(o, cudf.BaseIndex):
typs.add(type(o))
elif isinstance(o, cudf.DataFrame):
typs.add(cudf.DataFrame)
elif isinstance(o, cudf.Series):
typs.add(cudf.Series)
else:
raise TypeError(f"cannot concatenate object of type {type(o)}")

allowed_typs = {cudf.Series, cudf.DataFrame}

# Return for single object
if len(objs) == 1:
obj = objs[0]

if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
Expand Down Expand Up @@ -280,39 +313,40 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
else:
if axis == 0:
result = obj.copy()
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)
else:
o_typ = typs.pop()
if o_typ not in allowed_typs:
raise TypeError(
f"cannot concatenate object of type {o_typ}"
)
data = obj._data.copy(deep=True)
if isinstance(obj, cudf.Series) and obj.name is None:
# If the Series has no name, pandas renames it to 0.
data[0] = data.pop(None)
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result._column_names
]
)

if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
return result
else:
return result.sort_index(axis=(1 - axis)) if sort else result

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
typs = set()
for o in objs:
if isinstance(o, cudf.MultiIndex):
typs.add(cudf.MultiIndex)
elif isinstance(o, cudf.BaseIndex):
typs.add(type(o))
elif isinstance(o, cudf.DataFrame):
typs.add(cudf.DataFrame)
elif isinstance(o, cudf.Series):
typs.add(cudf.Series)
else:
raise TypeError(f"cannot concatenate object of type {type(o)}")

allowed_typs = {cudf.Series, cudf.DataFrame}

# when axis is 1 (column) we can concat with Series and Dataframes
if axis == 1:
if not typs.issubset(allowed_typs):
Expand Down Expand Up @@ -351,35 +385,64 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col
# if the dictionary consists of only Series,
# it must be handled differently
only_series = len(typs) == 1 and cudf.Series in typs

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
)
if keys is None:
for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
.unique()
)

# need to create a MultiIndex column
else:
for k, o in zip(keys, objs):
for name, col in o._data.items():
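# If every object is a Series, use the dictionary key alone
# as the column label.
# If the existing column label is a tuple (MultiIndex columns),
# prepend the key to it.
# Otherwise pair the key with the label, explicitly casting the
# label to str; this handles DataFrames and Series being
# concatenated together, where labels may be ints.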
if only_series:
col_label = k
elif isinstance(name, tuple):
col_label = (k, *name)
else:
col_label = (k, str(name))
if empty_inner:
df[col_label] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[col_label] = col

if ignore_index:
# with ignore_index the column names change to numbers
df.columns = pd.RangeIndex(len(result_columns.unique()))
df.columns = pd.RangeIndex(len(result_columns))
elif not only_series:
df.columns = cudf.MultiIndex.from_tuples(df._column_names)
else:
df.columns = result_columns.unique()
pass

if empty_inner:
# if join is inner and it contains an empty df
Expand All @@ -389,6 +452,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
return df

# If we get here, we are always concatenating along axis 0 (the rows).
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)

typ = list(typs)[0]
if len(typs) > 1:
if allowed_typs == typs:
Expand Down
39 changes: 39 additions & 0 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1889,3 +1889,42 @@ def test_concat_mixed_list_types_error(s1, s2):

with pytest.raises(NotImplementedError):
cudf.concat([s1, s2], ignore_index=True)


@pytest.mark.parametrize(
    "d",
    [
        # Two DataFrames sharing the same column names.
        {
            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
        },
        # A single DataFrame.
        {"first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]})},
        # Three DataFrames, the last with a different length and column.
        {
            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
            "third": cudf.DataFrame({"C": [1, 2, 3]}),
        },
        # Only Series.
        {"first": cudf.Series([1, 2, 3]), "second": cudf.Series([4, 5, 6])},
        # A DataFrame mixed with a named Series.
        {
            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
            "second": cudf.Series([5, 6], name="C"),
        },
    ],
)
def test_concat_dictionary(d):
    """Check that ``cudf.concat`` on a dictionary with ``axis=1`` matches pandas.

    Each cudf object in ``d`` is converted to its pandas counterpart, the
    resulting dictionary is concatenated with ``pd.concat(..., axis=1)``, and
    the outcome is compared against ``cudf.concat(d, axis=1)``.
    """
    actual = cudf.concat(d, axis=1)

    # Build the reference result by running the same operation in pandas.
    pandas_objs = {key: obj.to_pandas() for key, obj in d.items()}
    expected = cudf.from_pandas(pd.concat(pandas_objs, axis=1))

    assert_eq(expected, actual)


def test_concat_dict_incorrect_type():
    """Check that a dictionary holding an Index is rejected with ``axis=1``.

    ``cudf.concat`` is expected to raise ``TypeError`` with a message that
    names the type of the offending object.
    """
    index_obj = cudf.Index([1, 2, 3])
    d = {"first": index_obj}
    expected_msg = f"cannot concatenate object of type {type(index_obj)}"

    with pytest.raises(TypeError, match=expected_msg):
        cudf.concat(d, axis=1)
Loading