rapidsai · amanlai · Feb 27, 2024 · Feb 27, 2024 · Feb 28, 2024 · Feb 28, 2024
@@ -120,9 +120,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
 
     Parameters
     ----------
-    objs : list of DataFrame, Series, or Index
+    objs : list or dictionary of DataFrame, Series, or Index
     axis : {0/'index', 1/'columns'}, default 0
         The axis to concatenate along.
+        `axis=1` must be passed when `objs` is a dictionary;
+        concatenating a dictionary along axis 0 is not supported yet.
     join : {'inner', 'outer'}, default 'outer'
         How to handle indexes on other axis (or axes).
     ignore_index : bool, default False
@@ -229,13 +230,28 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
       letter  number  animal    name
     0      a       1    bird   polly
     1      b       2  monkey  george
+
+    Combine a dictionary of DataFrame objects horizontally:
+
+    >>> d = {'first': df1, 'second': df2}
+    >>> cudf.concat(d, axis=1)
+      first           second
+      letter  number  letter  number
+    0      a       1       c       3
+    1      b       2       d       4
     """
     # TODO: Do we really need to have different error messages for an empty
     # list and a list of None?
     if not objs:
         raise ValueError("No objects to concatenate")
 
-    objs = [obj for obj in objs if obj is not None]
+    if isinstance(objs, dict):
+        objs = {k: obj for k, obj in objs.items() if obj is not None}
+        keys = list(objs)
+        objs = list(objs.values())
+    else:
+        objs = [obj for obj in objs if obj is not None]
+        keys = None
 
     if not objs:
         raise ValueError("All objects passed were None")
@@ -249,7 +265,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
     # Return for single object
     if len(objs) == 1:
         obj = objs[0]
-
         if ignore_index:
             if axis == 1:
                 result = cudf.DataFrame._from_data(
@@ -280,6 +295,11 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         else:
             if axis == 0:
                 result = obj.copy()
+                if keys is not None:
+                    raise NotImplementedError(
+                        "Concatenation along axis = 0 "
+                        "when passing a dictionary is not supported yet."
+                    )
             else:
                 data = obj._data.copy(deep=True)
                 if isinstance(obj, cudf.Series) and obj.name is None:
@@ -288,6 +308,19 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 result = cudf.DataFrame._from_data(
                     data, index=obj.index.copy(deep=True)
                 )
+                if keys is not None:
+                    if isinstance(result, cudf.DataFrame):
+                        k = keys[0]
+                        result.columns = cudf.MultiIndex.from_tuples(
+                            [
+                                (k, *c) if isinstance(c, tuple) else (k, c)
+                                for c in result.columns
+                            ]
+                        )
+
+                    result.columns = cudf.MultiIndex.from_product(
+                        [keys, result.columns]
+                    )
 
         if isinstance(result, cudf.Series) and axis == 0:
             # sort has no effect for series concatted along axis 0
@@ -351,35 +384,54 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             objs = _align_objs(objs, how=join, sort=sort)
             df.index = objs[0].index
 
-        for o in objs:
-            for name, col in o._data.items():
-                if name in df._data:
-                    raise NotImplementedError(
-                        f"A Column with duplicate name found: {name}, cuDF "
-                        f"doesn't support having multiple columns with "
-                        f"same names yet."
-                    )
-                if empty_inner:
-                    # if join is inner and it contains an empty df
-                    # we return an empty df, hence creating an empty
-                    # column with dtype metadata retained.
-                    df[name] = cudf.core.column.column_empty_like(
-                        col, newsize=0
-                    )
-                else:
-                    df[name] = col
+        if keys is None:
+            for o in objs:
+                for name, col in o._data.items():
+                    if name in df._data:
+                        raise NotImplementedError(
+                            f"A Column with duplicate name found: {name}, cuDF "
+                            f"doesn't support having multiple columns with "
+                            f"same names yet."
+                        )
+                    if empty_inner:
+                        # if join is inner and it contains an empty df
+                        # we return an empty df, hence creating an empty
+                        # column with dtype metadata retained.
+                        df[name] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[name] = col
+
+            result_columns = (
+                objs[0]
+                ._data.to_pandas_index()
+                .append([obj._data.to_pandas_index() for obj in objs[1:]])
+                .unique()
+            )
 
-        result_columns = (
-            objs[0]
-            ._data.to_pandas_index()
-            .append([obj._data.to_pandas_index() for obj in objs[1:]])
-        )
+        # need to create a MultiIndex column
+        else:
+            for k, o in zip(keys, objs):
+                for name, col in o._data.items():
+                    # the existing column might be multiindex
+                    if not isinstance(name, tuple):
+                        name = (name,)
+                    if empty_inner:
+                        df[(k, *name)] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[(k, *name)] = col
+
+            # MultiIndex construction here
+            result_columns = cudf.MultiIndex.from_tuples(df._column_names)
 
         if ignore_index:
             # with ignore_index the column names change to numbers
-            df.columns = pd.RangeIndex(len(result_columns.unique()))
+            df.columns = pd.RangeIndex(len(result_columns))
         else:
-            df.columns = result_columns.unique()
+            df.columns = result_columns
 
         if empty_inner:
             # if join is inner and it contains an empty df
@@ -389,6 +441,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         return df
 
     # If we get here, we are always concatenating along axis 0 (the rows).
+    if keys is not None:
+        raise NotImplementedError(
+            "Concatenation along axis = 0 "
+            "when passing a dictionary is not supported yet."
+        )
+
     typ = list(typs)[0]
     if len(typs) > 1:
         if allowed_typs == typs:

@@ -1889,3 +1889,38 @@ def test_concat_mixed_list_types_error(s1, s2):
 
     with pytest.raises(NotImplementedError):
         cudf.concat([s1, s2], ignore_index=True)
+
+
+@pytest.mark.parametrize(
+    "d",
+    [
+        {
+            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
+            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
+        },
+        {"first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]})},
+        {
+            "first": cudf.DataFrame({"A": [1, 2], "B": [3, 4]}),
+            "second": cudf.DataFrame({"A": [5, 6], "B": [7, 8]}),
+            "third": cudf.DataFrame({"C": [1, 2, 3]}),
+        },
+        {"first": cudf.Series([1, 2, 3]), "second": cudf.Series([4, 5, 6])},
+    ],
+)
+def test_concat_dictionary(d):
+    """Check that a dictionary concatenated along axis=1 matches pandas.
+
+    Each parametrized case maps string keys to cudf objects; the same
+    mapping, converted to pandas, provides the reference result.
+    """
+    actual = cudf.concat(d, axis=1)
+
+    # Build the pandas reference from the same keys, in the same order.
+    pandas_objs = {}
+    for key, gpu_obj in d.items():
+        pandas_objs[key] = gpu_obj.to_pandas()
+    expected = cudf.from_pandas(pd.concat(pandas_objs, axis=1))
+
+    assert_eq(expected, actual)
+
+
+def test_concat_dict_incorrect_type():
+    """Check that a dictionary holding an Index raises TypeError on axis=1."""
+    import re
+
+    d = {
+        "first": cudf.Index([1, 2, 3]),
+    }
+    # ``match`` is a regular expression, and the type repr embedded in the
+    # message contains metacharacters such as ``.``; escape it so the
+    # message is compared literally.
+    expected_msg = re.escape(
+        f"cannot concatenate object of type {type(d['first'])}"
+    )
+    with pytest.raises(TypeError, match=expected_msg):
+        cudf.concat(d, axis=1)