From 1649955e141e140b3f6e914716b31cafbd9778ed Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 15 Mar 2022 21:54:29 -0500 Subject: [PATCH] Fix `list` and `struct` meta generation issue in `dask-cudf` (#10434) Fixes: #8913 This PR fixes multiple code-paths that incorrectly handle `list` and `struct` types for metadata generation in `dask-cudf`. ```python >>> ddf = dask_cudf.from_cudf(df, 1) >>> ddf._meta_nonempty a b c 0 0 [[0, 1]] {'a': None, 'b': None} 1 1 [[0, 1]] {'a': None, 'b': None} >>> df a b c 0 1 [[1, 2]] {'a': 1, 'b': [[1, 2]]} 1 2 [[2, 3]] {'a': 2, 'b': [[2, 3]]} 2 3 None {'a': 3, 'b': None} >>> ddf._meta_nonempty.c.dtype StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))}) >>> df.c.dtype StructDtype({'a': dtype('int64'), 'b': ListDtype(ListDtype(int64))}) >>> ddf._meta_nonempty.b.dtype ListDtype(ListDtype(int64)) >>> df.b.dtype ListDtype(ListDtype(int64)) ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10434 --- codecov.yml | 3 ++ python/dask_cudf/dask_cudf/backends.py | 51 +++++++++++++++---- .../dask_cudf/io/tests/test_parquet.py | 21 +++++++- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/codecov.yml b/codecov.yml index 4eaa5b066c4..f9d0f906807 100644 --- a/codecov.yml +++ b/codecov.yml @@ -3,6 +3,9 @@ coverage: status: project: off patch: on + default: + target: auto + threshold: 0% github_checks: annotations: true diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index bd9a8fc2769..505430b5cfe 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -75,19 +75,41 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +def _nest_list_data(data, leaf_type): + """ + Helper for _get_non_empty_data which creates + nested list data + """ + data = [data] + while isinstance(leaf_type, cudf.ListDtype): + leaf_type = leaf_type.element_type + data = [data] + return data + + @_dask_cudf_nvtx_annotate def _get_non_empty_data(s): - if isinstance(s._column, cudf.core.column.CategoricalColumn): + if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( - s._column.categories - if len(s._column.categories) - else [UNKNOWN_CATEGORIES] + s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] ) codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32") - ordered = s._column.ordered + ordered = s.ordered data = cudf.core.column.build_categorical_column( categories=categories, codes=codes, ordered=ordered ) + elif isinstance(s, cudf.core.column.ListColumn): + leaf_type = s.dtype.leaf_type + if is_string_dtype(leaf_type): + data = ["cat", "dog"] + else: + data = np.array([0, 1], dtype=leaf_type).tolist() + data = _nest_list_data(data, s.dtype) * 2 + data = cudf.core.column.as_column(data, dtype=s.dtype) + elif isinstance(s, cudf.core.column.StructColumn): + struct_dtype = s.dtype + data = [{key: None for key in struct_dtype.fields.keys()}] * 2 + data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): data = pa.array(["cat", "dog"]) else: @@ -107,7 +129,7 @@ def _get_non_empty_data(s): def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) - data = _get_non_empty_data(s) + data = _get_non_empty_data(s._column) return cudf.Series(data, name=s.name, index=idx) @@ -120,11 +142,18 @@ def meta_nonempty_cudf(x): res = cudf.DataFrame(index=idx) for col in x._data.names: dtype = str(x._data[col].dtype) - if dtype not in columns_with_dtype: - columns_with_dtype[dtype] = cudf.core.column.as_column( - _get_non_empty_data(x[col]) - ) - res._data[col] = columns_with_dtype[dtype] + if dtype in ("list", "struct"): + # Not possible to hash and store list & struct types + # as they can contain different levels of nesting or + # fields. + res._data[col] = _get_non_empty_data(x._data[col]) + else: + if dtype not in columns_with_dtype: + columns_with_dtype[dtype] = cudf.core.column.as_column( + _get_non_empty_data(x._data[col]) + ) + res._data[col] = columns_with_dtype[dtype] + return res diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index f5c1e53258e..c5d3cf293fd 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import glob import math import os @@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data): # schema is not is passed through in older Dask versions ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True) dd.assert_eq(cudf.from_pandas(dfp), ddf2) + + +def test_cudf_list_struct_write(tmpdir): + df = cudf.DataFrame( + { + "a": [1, 2, 3], + "b": [[[1, 2]], [[2, 3]], None], + "c": [[[["a", "z"]]], [[["b", "d", "e"]]], None], + } + ) + df["d"] = df.to_struct() + + ddf = dask_cudf.from_cudf(df, 3) + temp_file = str(tmpdir.join("list_struct.parquet")) + + ddf.to_parquet(temp_file) + new_ddf = dask_cudf.read_parquet(temp_file) + dd.assert_eq(df, new_ddf)