Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow cudf.concat to process a dictionary of frames #15115 #15126

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8215,4 +8215,4 @@ def _from_dict_create_index(indexlist, namelist, library):
index = library.MultiIndex.from_tuples(indexlist, names=namelist)
else:
index = library.Index(indexlist, name=namelist[0])
return index
return index
109 changes: 82 additions & 27 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):

Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list or dictionary of DataFrame, Series, or Index
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
`axis=1` must be passed if a dictionary is passed.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
Expand Down Expand Up @@ -229,13 +230,28 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
letter number animal name
0 a 1 bird polly
1 b 2 monkey george

Combine a dictionary of DataFrame objects horizontally:

>>> d = {'first': df1, 'second': df2}
>>> cudf.concat(d, axis=1)
first second
letter number letter number
0 a 1 c 3
1 b 2 d 4
"""
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
raise ValueError("No objects to concatenate")

objs = [obj for obj in objs if obj is not None]
if isinstance(objs, dict):
objs = {k: obj for k, obj in objs.items() if obj is not None}
keys = list(objs)
objs = list(objs.values())
else:
objs = [obj for obj in objs if obj is not None]
keys = None

if not objs:
raise ValueError("All objects passed were None")
Expand All @@ -249,7 +265,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
# Return for single object
if len(objs) == 1:
obj = objs[0]

if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
Expand Down Expand Up @@ -280,6 +295,11 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
else:
if axis == 0:
result = obj.copy()
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)
else:
data = obj._data.copy(deep=True)
if isinstance(obj, cudf.Series) and obj.name is None:
Expand All @@ -288,6 +308,13 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[(k, *c) if isinstance(c, tuple) else (k, c) for c in result.columns]
)


if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
Expand Down Expand Up @@ -351,35 +378,56 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col
if keys is None:

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
.unique()
)

# need to create a MultiIndex column
else:
for k, o in zip(keys, objs):
for name, col in o._data.items():
# the existing column might be multiindex
if not isinstance(name, tuple):
name = (name,)
if empty_inner:
df[(k, *name)] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[(k, *name)] = col

# MultiIndex construction here
result_columns = cudf.MultiIndex.from_tuples(df.columns)

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
)

if ignore_index:
# with ignore_index the column names change to numbers
df.columns = pd.RangeIndex(len(result_columns.unique()))
df.columns = pd.RangeIndex(len(result_columns))
else:
df.columns = result_columns.unique()
df.columns = result_columns

if empty_inner:
# if join is inner and it contains an empty df
Expand All @@ -388,7 +436,14 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):

return df


# If we get here, we are always concatenating along axis 0 (the rows).
if keys is not None:
raise NotImplementedError(
"Concatenation along axis = 0 "
"when passing a dictionary is not supported yet."
)

typ = list(typs)[0]
if len(typs) > 1:
if allowed_typs == typs:
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5229,4 +5229,4 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
if equal_nan is True and a_col.null_count and b_col.null_count:
result_col[equal_nulls] = True

return Series(result_col, index=index)
return Series(result_col, index=index)
18 changes: 18 additions & 0 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,3 +1921,21 @@ def test_concat_mixed_list_types_error(s1, s2):

with pytest.raises(NotImplementedError):
cudf.concat([s1, s2], ignore_index=True)


def test_horizontal_concat_dictionary():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also add tests for concatenating dict-of-Series and dict-of-Index objects?

If those cases don't work currently - that's OK. We should at least raise a NotImplementedError.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add tests for these.


d = {
'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]}),
'second': cudf.DataFrame({'A': [5, 6], 'B': [7, 8]}),
}
Comment on lines +1928 to +1931
Copy link
Contributor

@shwina shwina Feb 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we parametrize to include a few additional test cases?

  • A single entry in d
  • A third entry in d with a different number of elements, e.g.,:
    d = {
        'first': pd.DataFrame({'A': [1, 2], 'B': [3, 4]}),
        'second': pd.DataFrame({'A': [5, 6], 'B': [7, 8]}),
        'third': pd.DataFrame({'C': [1, 2, 3]})
    }

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't single entry in d and a third entry in d with just 1 key-value pair the same thing? Are you saying there should be a test for d = {'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]})}?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry - I mistyped! I've edited my comment


# horizontal concat
result1 = cudf.concat(d, axis=1)
expected1 = cudf.DataFrame({
('first', 'A'): [1, 2],
('first', 'B'): [3, 4],
('second', 'A'): [5, 6],
('second', 'B'): [7, 8]
})
assert_eq(expected1, result1)
Loading