Fix DataFrame initialization from list of dicts (#6632)

Closes #6628.

* Support instantiation from a list of dicts

Co-authored-by: GALI PREM SAGAR <[email protected]>
Co-authored-by: GALI PREM SAGAR <[email protected]>
rapidsai · Nov 13, 2020 · bd564a0 · bd564a0
1 parent 1d5eec6
commit bd564a0
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -117,6 +117,7 @@
 - PR #6728 Fix cudf python docs and associated build warnings
 - PR #6732 Fix cuDF benchmarks build with static Arrow lib and fix rapids-compose cuDF JNI build
 - PR #6742 Fix concat bug in dask_cudf Series/Index creation
+- PR #6632 Fix DataFrame initialization from list of dicts
 
 
 # cuDF 0.16.0 (21 Oct 2020)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -398,18 +398,24 @@ def _init_from_list_like(self, data, index=None, columns=None):
             index = as_index(index)
 
         self._index = as_index(index)
-        data = list(itertools.zip_longest(*data))
 
-        if columns is not None and len(data) == 0:
-            data = [
-                cudf.core.column.column_empty(row_count=0, dtype=None)
-                for _ in columns
-            ]
+        # list-of-dicts case
+        if len(data) > 0 and isinstance(data[0], dict):
+            data = DataFrame.from_pandas(pd.DataFrame(data))
+            self._data = data._data
+        else:
+            data = list(itertools.zip_longest(*data))
 
-        for col_name, col in enumerate(data):
-            self._data[col_name] = column.as_column(col)
+            if columns is not None and len(data) == 0:
+                data = [
+                    cudf.core.column.column_empty(row_count=0, dtype=None)
+                    for _ in columns
+                ]
 
-        self.columns = columns
+            for col_name, col in enumerate(data):
+                self._data[col_name] = column.as_column(col)
+        if columns:
+            self.columns = columns
 
     def _init_from_dict_like(self, data, index=None, columns=None):
         data = data.copy()

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -7892,6 +7892,24 @@ def test_dataframe_to_pandas_nullable_dtypes(df, expected_pdf):
     assert_eq(actual_pdf, expected_pdf)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}],
+        [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 6}],
+        [{"a": 1, "b": 2}, {"a": 1, "b": 5, "c": 6}],
+        [{"a": 1, "b": 2}, {"b": 5, "c": 6}],
+        [{}, {"a": 1, "b": 5, "c": 6}],
+        [{"a": 1, "b": 2, "c": 3}, {"a": 4.5, "b": 5.5, "c": 6.5}],
+    ],
+)
+def test_dataframe_init_from_list_of_dicts(data):
+    expect = pd.DataFrame(data)
+    got = gd.DataFrame(data)
+
+    assert_eq(expect, got)
+
+
 def test_dataframe_pipe():
     pdf = pd.DataFrame()
     gdf = gd.DataFrame()