From 1649955e141e140b3f6e914716b31cafbd9778ed Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 15 Mar 2022 21:54:29 -0500
Subject: [PATCH] Fix `list` and `struct` meta generation issue in `dask-cudf`
 (#10434)

Fixes: #8913

This PR fixes multiple code-paths that incorrectly handle `list` and `struct` types for metadata generation in `dask-cudf`.

```python
>>> ddf = dask_cudf.from_cudf(df, 1)
>>> ddf._meta_nonempty
   a         b                       c
0  0  [[0, 1]]  {'a': None, 'b': None}
1  1  [[0, 1]]  {'a': None, 'b': None}
>>> df
   a         b                        c
0  1  [[1, 2]]  {'a': 1, 'b': [[1, 2]]}
1  2  [[2, 3]]  {'a': 2, 'b': [[2, 3]]}
2  3      None      {'a': 3, 'b': None}
>>> ddf._meta_nonempty.c.dtype
StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))})
>>> df.c.dtype
StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))})
>>> ddf._meta_nonempty.b.dtype
ListDtype(ListDtype(int64))
>>> df.b.dtype
ListDtype(ListDtype(int64))
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10434
---
 codecov.yml                                   |  3 ++
 python/dask_cudf/dask_cudf/backends.py        | 51 +++++++++++++++----
 .../dask_cudf/io/tests/test_parquet.py        | 21 +++++++-
 3 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/codecov.yml b/codecov.yml
index 4eaa5b066c4..f9d0f906807 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -3,6 +3,9 @@ coverage:
   status:
     project: off
     patch: on
+      default:
+        target: auto
+        threshold: 0%
 
 github_checks:
     annotations: true
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index bd9a8fc2769..505430b5cfe 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -75,19 +75,41 @@ def _nonempty_index(idx):
     raise TypeError(f"Don't know how to handle index of type {type(idx)}")
 
 
+def _nest_list_data(data, leaf_type):
+    """
+    Helper for _get_non_empty_data which creates
+    nested list data
+    """
+    data = [data]
+    while isinstance(leaf_type, cudf.ListDtype):
+        leaf_type = leaf_type.element_type
+        data = [data]
+    return data
+
+
 @_dask_cudf_nvtx_annotate
 def _get_non_empty_data(s):
-    if isinstance(s._column, cudf.core.column.CategoricalColumn):
+    if isinstance(s, cudf.core.column.CategoricalColumn):
         categories = (
-            s._column.categories
-            if len(s._column.categories)
-            else [UNKNOWN_CATEGORIES]
+            s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
         )
         codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32")
-        ordered = s._column.ordered
+        ordered = s.ordered
         data = cudf.core.column.build_categorical_column(
             categories=categories, codes=codes, ordered=ordered
         )
+    elif isinstance(s, cudf.core.column.ListColumn):
+        leaf_type = s.dtype.leaf_type
+        if is_string_dtype(leaf_type):
+            data = ["cat", "dog"]
+        else:
+            data = np.array([0, 1], dtype=leaf_type).tolist()
+        data = _nest_list_data(data, s.dtype) * 2
+        data = cudf.core.column.as_column(data, dtype=s.dtype)
+    elif isinstance(s, cudf.core.column.StructColumn):
+        struct_dtype = s.dtype
+        data = [{key: None for key in struct_dtype.fields.keys()}] * 2
+        data = cudf.core.column.as_column(data, dtype=s.dtype)
     elif is_string_dtype(s.dtype):
         data = pa.array(["cat", "dog"])
     else:
@@ -107,7 +129,7 @@ def _get_non_empty_data(s):
 def _nonempty_series(s, idx=None):
     if idx is None:
         idx = _nonempty_index(s.index)
-    data = _get_non_empty_data(s)
+    data = _get_non_empty_data(s._column)
 
     return cudf.Series(data, name=s.name, index=idx)
 
@@ -120,11 +142,18 @@ def meta_nonempty_cudf(x):
     res = cudf.DataFrame(index=idx)
     for col in x._data.names:
         dtype = str(x._data[col].dtype)
-        if dtype not in columns_with_dtype:
-            columns_with_dtype[dtype] = cudf.core.column.as_column(
-                _get_non_empty_data(x[col])
-            )
-        res._data[col] = columns_with_dtype[dtype]
+        if dtype in ("list", "struct"):
+            # Not possible to hash and store list & struct types
+            # as they can contain different levels of nesting or
+            # fields.
+            res._data[col] = _get_non_empty_data(x._data[col])
+        else:
+            if dtype not in columns_with_dtype:
+                columns_with_dtype[dtype] = cudf.core.column.as_column(
+                    _get_non_empty_data(x._data[col])
+                )
+            res._data[col] = columns_with_dtype[dtype]
+
     return res
 
 
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index f5c1e53258e..c5d3cf293fd 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+
 import glob
 import math
 import os
@@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data):
     # schema is not is passed through in older Dask versions
     ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True)
     dd.assert_eq(cudf.from_pandas(dfp), ddf2)
+
+
+def test_cudf_list_struct_write(tmpdir):
+    df = cudf.DataFrame(
+        {
+            "a": [1, 2, 3],
+            "b": [[[1, 2]], [[2, 3]], None],
+            "c": [[[["a", "z"]]], [[["b", "d", "e"]]], None],
+        }
+    )
+    df["d"] = df.to_struct()
+
+    ddf = dask_cudf.from_cudf(df, 3)
+    temp_file = str(tmpdir.join("list_struct.parquet"))
+
+    ddf.to_parquet(temp_file)
+    new_ddf = dask_cudf.read_parquet(temp_file)
+    dd.assert_eq(df, new_ddf)