Fix list and struct meta generation issue in dask-cudf (#10434)

Fixes: #8913 This PR fixes multiple code-paths that incorrectly handle `list` and `struct` types for metadata generation in `dask-cudf`. ```python >>> ddf = dask_cudf.from_cudf(df, 1) >>> ddf._meta_nonempty a b c 0 0 [[0, 1]] {'a': None, 'b': None} 1 1 [[0, 1]] {'a': None, 'b': None} >>> df a b c 0 1 [[1, 2]] {'a': 1, 'b': [[1, 2]]} 1 2 [[2, 3]] {'a': 2, 'b': [[2, 3]]} 2 3 None {'a': 3, 'b': None} >>> ddf._meta_nonempty.c.dtype StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))}) >>> df.c.dtype StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))}) >>> ddf._meta_nonempty.b.dtype ListDtype(ListDtype(int64)) >>> df.b.dtype ListDtype(ListDtype(int64)) ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Bradley Dice (https://github.com/bdice) URL: #10434
rapidsai · Mar 16, 2022 · 1649955 · 1649955
1 parent deb39db
commit 1649955
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 12 deletions.
diff --git a/codecov.yml b/codecov.yml
@@ -3,6 +3,9 @@ coverage:
   status:
     project: off
     patch: on
+      default:
+        target: auto
+        threshold: 0%
 
 github_checks:
     annotations: true
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
@@ -75,19 +75,41 @@ def _nonempty_index(idx):
     raise TypeError(f"Don't know how to handle index of type {type(idx)}")
 
 
+def _nest_list_data(data, leaf_type):
+    """
+    Helper for _get_non_empty_data which creates
+    nested list data
+    """
+    data = [data]
+    while isinstance(leaf_type, cudf.ListDtype):
+        leaf_type = leaf_type.element_type
+        data = [data]
+    return data
+
+
 @_dask_cudf_nvtx_annotate
 def _get_non_empty_data(s):
-    if isinstance(s._column, cudf.core.column.CategoricalColumn):
+    if isinstance(s, cudf.core.column.CategoricalColumn):
         categories = (
-            s._column.categories
-            if len(s._column.categories)
-            else [UNKNOWN_CATEGORIES]
+            s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
         )
         codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32")
-        ordered = s._column.ordered
+        ordered = s.ordered
         data = cudf.core.column.build_categorical_column(
             categories=categories, codes=codes, ordered=ordered
         )
+    elif isinstance(s, cudf.core.column.ListColumn):
+        leaf_type = s.dtype.leaf_type
+        if is_string_dtype(leaf_type):
+            data = ["cat", "dog"]
+        else:
+            data = np.array([0, 1], dtype=leaf_type).tolist()
+        data = _nest_list_data(data, s.dtype) * 2
+        data = cudf.core.column.as_column(data, dtype=s.dtype)
+    elif isinstance(s, cudf.core.column.StructColumn):
+        struct_dtype = s.dtype
+        data = [{key: None for key in struct_dtype.fields.keys()}] * 2
+        data = cudf.core.column.as_column(data, dtype=s.dtype)
     elif is_string_dtype(s.dtype):
         data = pa.array(["cat", "dog"])
     else:
@@ -107,7 +129,7 @@ def _get_non_empty_data(s):
 def _nonempty_series(s, idx=None):
     if idx is None:
         idx = _nonempty_index(s.index)
-    data = _get_non_empty_data(s)
+    data = _get_non_empty_data(s._column)
 
     return cudf.Series(data, name=s.name, index=idx)
 
@@ -120,11 +142,18 @@ def meta_nonempty_cudf(x):
     res = cudf.DataFrame(index=idx)
     for col in x._data.names:
         dtype = str(x._data[col].dtype)
-        if dtype not in columns_with_dtype:
-            columns_with_dtype[dtype] = cudf.core.column.as_column(
-                _get_non_empty_data(x[col])
-            )
-        res._data[col] = columns_with_dtype[dtype]
+        if dtype in ("list", "struct"):
+            # Not possible to hash and store list & struct types
+            # as they can contain different levels of nesting or
+            # fields.
+            res._data[col] = _get_non_empty_data(x._data[col])
+        else:
+            if dtype not in columns_with_dtype:
+                columns_with_dtype[dtype] = cudf.core.column.as_column(
+                    _get_non_empty_data(x._data[col])
+                )
+            res._data[col] = columns_with_dtype[dtype]
+
     return res
 
 

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+
 import glob
 import math
 import os
@@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data):
     # schema is not is passed through in older Dask versions
     ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True)
     dd.assert_eq(cudf.from_pandas(dfp), ddf2)
+
+
+def test_cudf_list_struct_write(tmpdir):
+    df = cudf.DataFrame(
+        {
+            "a": [1, 2, 3],
+            "b": [[[1, 2]], [[2, 3]], None],
+            "c": [[[["a", "z"]]], [[["b", "d", "e"]]], None],
+        }
+    )
+    df["d"] = df.to_struct()
+
+    ddf = dask_cudf.from_cudf(df, 3)
+    temp_file = str(tmpdir.join("list_struct.parquet"))
+
+    ddf.to_parquet(temp_file)
+    new_ddf = dask_cudf.read_parquet(temp_file)
+    dd.assert_eq(df, new_ddf)