diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 89abd7be0ba..b02f0ddafb3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -8215,4 +8215,4 @@ def _from_dict_create_index(indexlist, namelist, library): index = library.MultiIndex.from_tuples(indexlist, names=namelist) else: index = library.Index(indexlist, name=namelist[0]) - return index + return index \ No newline at end of file diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 2ef39e9357d..5b6ee35be31 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -120,9 +120,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): Parameters ---------- - objs : list of DataFrame, Series, or Index + objs : list or dictionary of DataFrame, Series, or Index axis : {0/'index', 1/'columns'}, default 0 The axis to concatenate along. + `axis=1` must be passed if a dictionary is passed. join : {'inner', 'outer'}, default 'outer' How to handle indexes on other axis (or axes). ignore_index : bool, default False @@ -229,13 +230,28 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): letter number animal name 0 a 1 bird polly 1 b 2 monkey george + + Combine a dictionary of DataFrame objects horizontally: + + >>> d = {'first': df1, 'second': df2} + >>> cudf.concat(d, axis=1) + first second + letter number letter number + 0 a 1 c 3 + 1 b 2 d 4 """ # TODO: Do we really need to have different error messages for an empty # list and a list of None? if not objs: raise ValueError("No objects to concatenate") - objs = [obj for obj in objs if obj is not None] + if isinstance(objs, dict): + objs = {k: obj for k, obj in objs.items() if obj is not None} + keys = list(objs) + objs = list(objs.values()) + else: + objs = [obj for obj in objs if obj is not None] + keys = None if not objs: raise ValueError("All objects passed were None") @@ -249,7 +265,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): # Return for single object if len(objs) == 1: obj = objs[0] - if ignore_index: if axis == 1: result = cudf.DataFrame._from_data( @@ -280,6 +295,11 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: if axis == 0: result = obj.copy() + if keys is not None: + raise NotImplementedError( + "Concatenation along axis = 0 " + "when passing a dictionary is not supported yet." + ) else: data = obj._data.copy(deep=True) if isinstance(obj, cudf.Series) and obj.name is None: @@ -288,6 +308,13 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): result = cudf.DataFrame._from_data( data, index=obj.index.copy(deep=True) ) + if keys is not None: + if isinstance(result, cudf.DataFrame): + k = keys[0] + result.columns = cudf.MultiIndex.from_tuples( + [(k, *c) if isinstance(c, tuple) else (k, c) for c in result.columns] + ) + if isinstance(result, cudf.Series) and axis == 0: # sort has no effect for series concatted along axis 0 @@ -351,35 +378,56 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): objs = _align_objs(objs, how=join, sort=sort) df.index = objs[0].index - for o in objs: - for name, col in o._data.items(): - if name in df._data: - raise NotImplementedError( - f"A Column with duplicate name found: {name}, cuDF " - f"doesn't support having multiple columns with " - f"same names yet." - ) - if empty_inner: - # if join is inner and it contains an empty df - # we return an empty df, hence creating an empty - # column with dtype metadata retained. - df[name] = cudf.core.column.column_empty_like( - col, newsize=0 - ) - else: - df[name] = col + if keys is None: + + for o in objs: + for name, col in o._data.items(): + if name in df._data: + raise NotImplementedError( + f"A Column with duplicate name found: {name}, cuDF " + f"doesn't support having multiple columns with " + f"same names yet." + ) + if empty_inner: + # if join is inner and it contains an empty df + # we return an empty df, hence creating an empty + # column with dtype metadata retained. + df[name] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[name] = col + + result_columns = ( + objs[0] + ._data.to_pandas_index() + .append([obj._data.to_pandas_index() for obj in objs[1:]]) + .unique() + ) + + # need to create a MultiIndex column + else: + for k, o in zip(keys, objs): + for name, col in o._data.items(): + # the existing column might be multiindex + if not isinstance(name, tuple): + name = (name,) + if empty_inner: + df[(k, *name)] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[(k, *name)] = col + + # MultiIndex construction here + result_columns = cudf.MultiIndex.from_tuples(df.columns) - result_columns = ( - objs[0] - ._data.to_pandas_index() - .append([obj._data.to_pandas_index() for obj in objs[1:]]) - ) if ignore_index: # with ignore_index the column names change to numbers - df.columns = pd.RangeIndex(len(result_columns.unique())) + df.columns = pd.RangeIndex(len(result_columns)) else: - df.columns = result_columns.unique() + df.columns = result_columns if empty_inner: # if join is inner and it contains an empty df @@ -388,7 +436,14 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): return df + # If we get here, we are always concatenating along axis 0 (the rows). + if keys is not None: + raise NotImplementedError( + "Concatenation along axis = 0 " + "when passing a dictionary is not supported yet." + ) + typ = list(typs)[0] if len(typs) > 1: if allowed_typs == typs: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3f51ecdf7dc..dc64fe694c1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5229,4 +5229,4 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): if equal_nan is True and a_col.null_count and b_col.null_count: result_col[equal_nulls] = True - return Series(result_col, index=index) + return Series(result_col, index=index) \ No newline at end of file diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 6e61675ef92..a5c2a6405c1 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1921,3 +1921,21 @@ def test_concat_mixed_list_types_error(s1, s2): with pytest.raises(NotImplementedError): cudf.concat([s1, s2], ignore_index=True) + + +def test_horizontal_concat_dictionary(): + + d = { + 'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]}), + 'second': cudf.DataFrame({'A': [5, 6], 'B': [7, 8]}), + } + + # horizontal concat + result1 = cudf.concat(d, axis=1) + expected1 = cudf.DataFrame({ + ('first', 'A'): [1, 2], + ('first', 'B'): [3, 4], + ('second', 'A'): [5, 6], + ('second', 'B'): [7, 8] + }) + assert_eq(expected1, result1) \ No newline at end of file