Skip to content

Commit

Permalink
Merge branch 'branch-22.04' into fea-byte-pair-encoder
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Mar 16, 2022
2 parents 060077b + 1649955 commit cdee746
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 12 deletions.
3 changes: 3 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ coverage:
status:
project: off
patch: on
default:
target: auto
threshold: 0%

github_checks:
annotations: true
51 changes: 40 additions & 11 deletions python/dask_cudf/dask_cudf/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,41 @@ def _nonempty_index(idx):
raise TypeError(f"Don't know how to handle index of type {type(idx)}")


def _nest_list_data(data, leaf_type):
"""
Helper for _get_non_empty_data which creates
nested list data
"""
data = [data]
while isinstance(leaf_type, cudf.ListDtype):
leaf_type = leaf_type.element_type
data = [data]
return data


@_dask_cudf_nvtx_annotate
def _get_non_empty_data(s):
if isinstance(s._column, cudf.core.column.CategoricalColumn):
if isinstance(s, cudf.core.column.CategoricalColumn):
categories = (
s._column.categories
if len(s._column.categories)
else [UNKNOWN_CATEGORIES]
s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
)
codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32")
ordered = s._column.ordered
ordered = s.ordered
data = cudf.core.column.build_categorical_column(
categories=categories, codes=codes, ordered=ordered
)
elif isinstance(s, cudf.core.column.ListColumn):
leaf_type = s.dtype.leaf_type
if is_string_dtype(leaf_type):
data = ["cat", "dog"]
else:
data = np.array([0, 1], dtype=leaf_type).tolist()
data = _nest_list_data(data, s.dtype) * 2
data = cudf.core.column.as_column(data, dtype=s.dtype)
elif isinstance(s, cudf.core.column.StructColumn):
struct_dtype = s.dtype
data = [{key: None for key in struct_dtype.fields.keys()}] * 2
data = cudf.core.column.as_column(data, dtype=s.dtype)
elif is_string_dtype(s.dtype):
data = pa.array(["cat", "dog"])
else:
Expand All @@ -107,7 +129,7 @@ def _get_non_empty_data(s):
def _nonempty_series(s, idx=None):
if idx is None:
idx = _nonempty_index(s.index)
data = _get_non_empty_data(s)
data = _get_non_empty_data(s._column)

return cudf.Series(data, name=s.name, index=idx)

Expand All @@ -120,11 +142,18 @@ def meta_nonempty_cudf(x):
res = cudf.DataFrame(index=idx)
for col in x._data.names:
dtype = str(x._data[col].dtype)
if dtype not in columns_with_dtype:
columns_with_dtype[dtype] = cudf.core.column.as_column(
_get_non_empty_data(x[col])
)
res._data[col] = columns_with_dtype[dtype]
if dtype in ("list", "struct"):
# Not possible to hash and store list & struct types
# as they can contain different levels of nesting or
# fields.
res._data[col] = _get_non_empty_data(x._data[col])
else:
if dtype not in columns_with_dtype:
columns_with_dtype[dtype] = cudf.core.column.as_column(
_get_non_empty_data(x._data[col])
)
res._data[col] = columns_with_dtype[dtype]

return res


Expand Down
21 changes: 20 additions & 1 deletion python/dask_cudf/dask_cudf/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import glob
import math
import os
Expand Down Expand Up @@ -513,3 +514,21 @@ def test_cudf_dtypes_from_pandas(tmpdir, data):
# schema is not is passed through in older Dask versions
ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True)
dd.assert_eq(cudf.from_pandas(dfp), ddf2)


def test_cudf_list_struct_write(tmpdir):
df = cudf.DataFrame(
{
"a": [1, 2, 3],
"b": [[[1, 2]], [[2, 3]], None],
"c": [[[["a", "z"]]], [[["b", "d", "e"]]], None],
}
)
df["d"] = df.to_struct()

ddf = dask_cudf.from_cudf(df, 3)
temp_file = str(tmpdir.join("list_struct.parquet"))

ddf.to_parquet(temp_file)
new_ddf = dask_cudf.read_parquet(temp_file)
dd.assert_eq(df, new_ddf)

0 comments on commit cdee746

Please sign in to comment.