Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concatenate dictionary of objects along axis=1 #15623

Merged
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
7e89e43
Work from amanlai
er-eis Apr 30, 2024
1a767fb
Tests
er-eis Apr 30, 2024
9cebaa2
Remove extraneous testcase
er-eis Apr 30, 2024
0736e4e
Fix some legacy tests
er-eis Apr 30, 2024
f4c1e77
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis Apr 30, 2024
a55ca67
Address PR comments, add failing testcase
er-eis May 1, 2024
b08e862
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 1, 2024
8a3bfb0
Remove extraneous check
er-eis May 1, 2024
de91bd9
Simplify type check, ensure only index concat
er-eis May 1, 2024
29afda0
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 1, 2024
8047569
Simplify type check
er-eis May 1, 2024
a1e0949
Fix type check
er-eis May 1, 2024
3136955
Simplify type check
er-eis May 1, 2024
7a994d4
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 2, 2024
c813de7
Remove extraneous list cons
er-eis May 2, 2024
b3638d4
Split up large string
er-eis May 2, 2024
4cab646
Simplify BaseIndex type check
er-eis May 2, 2024
b1560d1
Simplify only series bool
er-eis May 2, 2024
546e6b7
Adjust test for axis=0 dict concat
er-eis May 2, 2024
b296e56
Better obj type check
er-eis May 2, 2024
c7c3b65
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 2, 2024
ea69fe9
Handle multiple MultiIndex column label types, stop cast to str
er-eis May 2, 2024
932fee5
Clarify comment
er-eis May 2, 2024
251a0b4
Refactor typs, simplify Series
er-eis May 2, 2024
e810c7e
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 2, 2024
37c17bb
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 2, 2024
7488d69
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 2, 2024
8a1840b
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 3, 2024
89ed5bd
Slightly looser checking
wence- May 3, 2024
689c526
We were right the first time.
wence- May 3, 2024
d7cedca
More brackets
wence- May 3, 2024
8bd0be7
OMG
wence- May 3, 2024
b5de816
Edit by fixed point iteration
wence- May 3, 2024
2124759
Pythonic object name set uniqueness
er-eis May 3, 2024
b5b9116
Avoid create GPU instances during test collection
er-eis May 3, 2024
0a5a91b
Merge branch 'branch-24.06' into er-eis/allow-concat-on-frame-dict
er-eis May 3, 2024
4ba8e3a
Dedent conditional
er-eis May 3, 2024
8bf95de
Can not -> Cannot
er-eis May 3, 2024
516d28b
Add 'columns' to tests
er-eis May 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 131 additions & 64 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):

Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list or dictionary of DataFrame, Series, or Index
er-eis marked this conversation as resolved.
Show resolved Hide resolved
axis : {0/'index', 1/'columns'}, default 0
The axis to concatenate along.
If `objs` is a dictionary, `axis=1` must be passed and the
dictionary must not contain Index objects.
join : {'inner', 'outer'}, default 'outer'
How to handle indexes on other axis (or axes).
ignore_index : bool, default False
Expand Down Expand Up @@ -231,27 +232,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
letter number animal name
0 a 1 bird polly
1 b 2 monkey george

Combine a dictionary of DataFrame objects horizontally:

>>> d = {'first': df1, 'second': df2}
>>> cudf.concat(d, axis=1)
first second
letter number letter number
0 a 1 c 3
1 b 2 d 4
"""
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
raise ValueError("No objects to concatenate")

objs = [obj for obj in objs if obj is not None]

if not objs:
raise ValueError("All objects passed were None")

axis = _AXIS_MAP.get(axis, None)
if axis is None:
raise ValueError(
f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}'
)

if isinstance(objs, dict):
if axis != 1:
er-eis marked this conversation as resolved.
Show resolved Hide resolved
raise NotImplementedError(
f"Can only concatenate dictionary input along axis=1, not {axis}"
)
objs = {k: obj for k, obj in objs.items() if obj is not None}
er-eis marked this conversation as resolved.
Show resolved Hide resolved
keys = list(objs)
objs = list(objs.values())
if any(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"cannot concatenate a dictionary containing indices"
)
else:
objs = [obj for obj in objs if obj is not None]
keys = None

if not objs:
raise ValueError("All objects passed were None")

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
allowed_typs = {
cudf.Series,
cudf.DataFrame,
cudf.BaseIndex,
}
if not all(isinstance(o, tuple(allowed_typs)) for o in objs):
raise TypeError(
f"can only concatenate objects which are instances of "
f"{allowed_typs}, instead received {[type(o) for o in objs]}"
)

if any(isinstance(o, cudf.BaseIndex) for o in objs):
if not all(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"when concatenating indices you must provide ONLY indices"
)

only_series = all(isinstance(o, cudf.Series) for o in objs)

# Return for single object
if len(objs) == 1:
obj = objs[0]

if ignore_index:
if axis == 1:
result = cudf.DataFrame._from_data(
Expand Down Expand Up @@ -290,34 +335,25 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
result = cudf.DataFrame._from_data(
data, index=obj.index.copy(deep=True)
)
if keys is not None:
if isinstance(result, cudf.DataFrame):
wence- marked this conversation as resolved.
Show resolved Hide resolved
k = keys[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
for c in result._column_names
]
)

if isinstance(result, cudf.Series) and axis == 0:
# sort has no effect for series concatted along axis 0
return result
else:
return result.sort_index(axis=(1 - axis)) if sort else result

# Retrieve the base types of `objs`. In order to support sub-types
# and object wrappers, we use `isinstance()` instead of comparing
# types directly
typs = set()
for o in objs:
if isinstance(o, cudf.MultiIndex):
typs.add(cudf.MultiIndex)
elif isinstance(o, cudf.BaseIndex):
typs.add(type(o))
elif isinstance(o, cudf.DataFrame):
typs.add(cudf.DataFrame)
elif isinstance(o, cudf.Series):
typs.add(cudf.Series)
else:
raise TypeError(f"cannot concatenate object of type {type(o)}")

allowed_typs = {cudf.Series, cudf.DataFrame}

# when axis is 1 (column) we can concat with Series and Dataframes
if axis == 1:
if not typs.issubset(allowed_typs):
if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs):
raise TypeError(
"Can only concatenate Series and DataFrame objects when axis=1"
)
Expand Down Expand Up @@ -353,35 +389,74 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
)
if keys is None:
for o in objs:
for name, col in o._data.items():
if name in df._data:
raise NotImplementedError(
f"A Column with duplicate name found: {name}, cuDF "
f"doesn't support having multiple columns with "
f"same names yet."
)
if empty_inner:
# if join is inner and it contains an empty df
# we return an empty df, hence creating an empty
# column with dtype metadata retained.
df[name] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[name] = col

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
.unique()
)

if ignore_index:
# with ignore_index the column names change to numbers
df.columns = pd.RangeIndex(len(result_columns.unique()))
# need to create a MultiIndex column
else:
# All levels in the multiindex label must have the same type
has_multiple_level_types = (
len(
set().union(*(map(type, obj._data.keys()) for obj in objs))
) > 1
wence- marked this conversation as resolved.
Show resolved Hide resolved
)
if has_multiple_level_types:
raise NotImplementedError(
"Can not construct a MultiIndex column with multiple "
er-eis marked this conversation as resolved.
Show resolved Hide resolved
"label types in cuDF at this time. You must convert "
"the labels to the same type."
)
for k, o in zip(keys, objs):
for name, col in o._data.items():
# If every input is a Series, use the dictionary key alone as the
# column label. Otherwise prepend the key to the existing label
# (already a tuple when the column is a MultiIndex label), which
# handles cases where DataFrames and Series are concatenated.
if only_series:
col_label = k
elif isinstance(name, tuple):
col_label = (k, *name)
else:
col_label = (k, name)
if empty_inner:
df[col_label] = cudf.core.column.column_empty_like(
col, newsize=0
)
else:
df[col_label] = col

if keys is None:
df.columns = result_columns.unique()
if ignore_index:
df.columns = cudf.RangeIndex(len(result_columns.unique()))
else:
if ignore_index:
# with ignore_index the column names change to numbers
df.columns = cudf.RangeIndex(len(result_columns))
elif not only_series:
df.columns = cudf.MultiIndex.from_tuples(df._column_names)
er-eis marked this conversation as resolved.
Show resolved Hide resolved

if empty_inner:
# if join is inner and it contains an empty df
Expand All @@ -391,18 +466,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
return df

# If we get here, we are always concatenating along axis 0 (the rows).
typ = list(typs)[0]
if len(typs) > 1:
if allowed_typs == typs:
# This block of code will run when `objs` has
# both Series & DataFrame kind of inputs.
_normalize_series_and_dataframe(objs, axis=axis)
typ = cudf.DataFrame
else:
raise TypeError(
f"`concat` cannot concatenate objects of "
f"types: {sorted([t.__name__ for t in typs])}."
)
typ = type(objs[0])
if len({type(o) for o in objs}) > 1:
_normalize_series_and_dataframe(objs, axis=axis)
typ = cudf.DataFrame

if typ is cudf.DataFrame:
old_objs = objs
Expand Down
Loading
Loading