diff --git a/codecov.yml b/codecov.yml index 4eaa5b066c4..f9d0f906807 100644 --- a/codecov.yml +++ b/codecov.yml @@ -3,6 +3,9 @@ coverage: status: project: off patch: on + default: + target: auto + threshold: 0% github_checks: annotations: true diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index bd9a8fc2769..505430b5cfe 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -75,19 +75,41 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +def _nest_list_data(data, leaf_type): + """ + Helper for _get_non_empty_data which creates + nested list data + """ + data = [data] + while isinstance(leaf_type, cudf.ListDtype): + leaf_type = leaf_type.element_type + data = [data] + return data + + @_dask_cudf_nvtx_annotate def _get_non_empty_data(s): - if isinstance(s._column, cudf.core.column.CategoricalColumn): + if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( - s._column.categories - if len(s._column.categories) - else [UNKNOWN_CATEGORIES] + s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] ) codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32") - ordered = s._column.ordered + ordered = s.ordered data = cudf.core.column.build_categorical_column( categories=categories, codes=codes, ordered=ordered ) + elif isinstance(s, cudf.core.column.ListColumn): + leaf_type = s.dtype.leaf_type + if is_string_dtype(leaf_type): + data = ["cat", "dog"] + else: + data = np.array([0, 1], dtype=leaf_type).tolist() + data = _nest_list_data(data, s.dtype) * 2 + data = cudf.core.column.as_column(data, dtype=s.dtype) + elif isinstance(s, cudf.core.column.StructColumn): + struct_dtype = s.dtype + data = [{key: None for key in struct_dtype.fields.keys()}] * 2 + data = cudf.core.column.as_column(data, dtype=s.dtype) elif is_string_dtype(s.dtype): data = pa.array(["cat", "dog"]) else: @@ -107,7 +129,7 @@ def _get_non_empty_data(s): def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) - data = _get_non_empty_data(s) + data = _get_non_empty_data(s._column) return cudf.Series(data, name=s.name, index=idx) @@ -120,11 +142,18 @@ def meta_nonempty_cudf(x): res = cudf.DataFrame(index=idx) for col in x._data.names: dtype = str(x._data[col].dtype) - if dtype not in columns_with_dtype: - columns_with_dtype[dtype] = cudf.core.column.as_column( - _get_non_empty_data(x[col]) - ) - res._data[col] = columns_with_dtype[dtype] + if dtype in ("list", "struct"): + # Not possible to hash and store list & struct types + # as they can contain different levels of nesting or + # fields. + res._data[col] = _get_non_empty_data(x._data[col]) + else: + if dtype not in columns_with_dtype: + columns_with_dtype[dtype] = cudf.core.column.as_column( + _get_non_empty_data(x._data[col]) + ) + res._data[col] = columns_with_dtype[dtype] + return res diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index f5c1e53258e..c5d3cf293fd 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import glob import math import os @@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data): # schema is not is passed through in older Dask versions ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True) dd.assert_eq(cudf.from_pandas(dfp), ddf2) + + +def test_cudf_list_struct_write(tmpdir): + df = cudf.DataFrame( + { + "a": [1, 2, 3], + "b": [[[1, 2]], [[2, 3]], None], + "c": [[[["a", "z"]]], [[["b", "d", "e"]]], None], + } + ) + df["d"] = df.to_struct() + + ddf = dask_cudf.from_cudf(df, 3) + temp_file = str(tmpdir.join("list_struct.parquet")) + + ddf.to_parquet(temp_file) + new_ddf = dask_cudf.read_parquet(temp_file) + dd.assert_eq(df, new_ddf)