Skip to content

Commit

Permalink
Fix list and struct meta generation issue in dask-cudf (#10434)
Browse files Browse the repository at this point in the history
Fixes: #8913 

This PR fixes multiple code-paths that incorrectly handle `list` and `struct` types for metadata generation in `dask-cudf`.

```python
>>> ddf = dask_cudf.from_cudf(df, 1)
>>> ddf._meta_nonempty
   a         b                       c
0  0  [[0, 1]]  {'a': None, 'b': None}
1  1  [[0, 1]]  {'a': None, 'b': None}
>>> df
   a         b                        c
0  1  [[1, 2]]  {'a': 1, 'b': [[1, 2]]}
1  2  [[2, 3]]  {'a': 2, 'b': [[2, 3]]}
2  3      None      {'a': 3, 'b': None}
>>> ddf._meta_nonempty.c.dtype
StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))})
>>> df.c.dtype
StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))})
>>> ddf._meta_nonempty.b.dtype
ListDtype(ListDtype(int64))
>>> df.b.dtype
ListDtype(ListDtype(int64))
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: #10434
  • Loading branch information
galipremsagar authored Mar 16, 2022
1 parent deb39db commit 1649955
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 12 deletions.
3 changes: 3 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ coverage:
status:
project: off
patch: on
default:
target: auto
threshold: 0%

github_checks:
annotations: true
51 changes: 40 additions & 11 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,41 @@ def _nonempty_index(idx):
raise TypeError(f"Don't know how to handle index of type {type(idx)}")


def _nest_list_data(data, leaf_type):
"""
Helper for _get_non_empty_data which creates
nested list data
"""
data = [data]
while isinstance(leaf_type, cudf.ListDtype):
leaf_type = leaf_type.element_type
data = [data]
return data


@_dask_cudf_nvtx_annotate
def _get_non_empty_data(s):
if isinstance(s._column, cudf.core.column.CategoricalColumn):
if isinstance(s, cudf.core.column.CategoricalColumn):
categories = (
s._column.categories
if len(s._column.categories)
else [UNKNOWN_CATEGORIES]
s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
)
codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32")
ordered = s._column.ordered
ordered = s.ordered
data = cudf.core.column.build_categorical_column(
categories=categories, codes=codes, ordered=ordered
)
elif isinstance(s, cudf.core.column.ListColumn):
leaf_type = s.dtype.leaf_type
if is_string_dtype(leaf_type):
data = ["cat", "dog"]
else:
data = np.array([0, 1], dtype=leaf_type).tolist()
data = _nest_list_data(data, s.dtype) * 2
data = cudf.core.column.as_column(data, dtype=s.dtype)
elif isinstance(s, cudf.core.column.StructColumn):
struct_dtype = s.dtype
data = [{key: None for key in struct_dtype.fields.keys()}] * 2
data = cudf.core.column.as_column(data, dtype=s.dtype)
elif is_string_dtype(s.dtype):
data = pa.array(["cat", "dog"])
else:
Expand All @@ -107,7 +129,7 @@ def _get_non_empty_data(s):
def _nonempty_series(s, idx=None):
if idx is None:
idx = _nonempty_index(s.index)
data = _get_non_empty_data(s)
data = _get_non_empty_data(s._column)

return cudf.Series(data, name=s.name, index=idx)

Expand All @@ -120,11 +142,18 @@ def meta_nonempty_cudf(x):
res = cudf.DataFrame(index=idx)
for col in x._data.names:
dtype = str(x._data[col].dtype)
if dtype not in columns_with_dtype:
columns_with_dtype[dtype] = cudf.core.column.as_column(
_get_non_empty_data(x[col])
)
res._data[col] = columns_with_dtype[dtype]
if dtype in ("list", "struct"):
# Not possible to hash and store list & struct types
# as they can contain different levels of nesting or
# fields.
res._data[col] = _get_non_empty_data(x._data[col])
else:
if dtype not in columns_with_dtype:
columns_with_dtype[dtype] = cudf.core.column.as_column(
_get_non_empty_data(x._data[col])
)
res._data[col] = columns_with_dtype[dtype]

return res


Expand Down
21 changes: 20 additions & 1 deletion python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import glob
import math
import os
Expand Down Expand Up @@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data):
# schema is not is passed through in older Dask versions
ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True)
dd.assert_eq(cudf.from_pandas(dfp), ddf2)


def test_cudf_list_struct_write(tmpdir):
df = cudf.DataFrame(
{
"a": [1, 2, 3],
"b": [[[1, 2]], [[2, 3]], None],
"c": [[[["a", "z"]]], [[["b", "d", "e"]]], None],
}
)
df["d"] = df.to_struct()

ddf = dask_cudf.from_cudf(df, 3)
temp_file = str(tmpdir.join("list_struct.parquet"))

ddf.to_parquet(temp_file)
new_ddf = dask_cudf.read_parquet(temp_file)
dd.assert_eq(df, new_ddf)

0 comments on commit 1649955

Please sign in to comment.