Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add struct accessor to dask-cudf #8874

Merged
merged 25 commits into from
Aug 24, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
fd6625d
allocate correct bufer for children
shaneding Jul 7, 2021
f50b129
update child buffer creation
shaneding Jul 7, 2021
9175e5e
added test cases
shaneding Jul 7, 2021
be44b91
remove comment
shaneding Jul 8, 2021
b6c2410
Implemented StructMethods struct accessor
Jul 19, 2021
c3912b6
Fixed example to use dask_cudf in field method comments
Jul 19, 2021
394b8e7
Partial fix for incorrect metadata in struct accessor field method
Jul 20, 2021
42eec5b
Merge remote-tracking branch 'origin/branch-21.08' into struct-accessor
Jul 20, 2021
2e41f40
Added better handling of key types in struct field() accessor method
Jul 21, 2021
10f8a85
Fixed KeyError handling for field() method in Struct Accessor
Jul 22, 2021
5bff6cb
Added testing of struct series creation and struct.field() method
Jul 27, 2021
3bc0214
Modified testing of accessor field method
Jul 27, 2021
9f88244
Modified struct accessor testing further
Jul 27, 2021
b80097c
Update python/dask_cudf/dask_cudf/tests/test_accessor.py
NV-jpt Jul 28, 2021
7fcd16d
Moved return statement out of try/catch block
Jul 28, 2021
dec19f2
Merge branch 'struct-accessor' of https://github.com/NV-jpt/cudf into…
Jul 28, 2021
5c90ba9
Merge remote-tracking branch 'parent/branch-21.10' into struct-accessor
Jul 28, 2021
cde7808
Removed test case with nonexistent field - need to design another tes…
Jul 29, 2021
d30afbd
Implemented some suggestions from Rick Zamora; thank you, Rickgit status
Jul 29, 2021
99a18d5
Added better error handling and tests
Aug 9, 2021
1a8270f
Consolidated try-except blocks into single block, and implemented app…
Aug 13, 2021
c63e5c9
Merge remote-tracking branch 'parent/branch-21.10' into struct-accessor
Aug 18, 2021
bb52b2c
Allow errors from invalid field keys to be thrown from cudf, instead …
Aug 18, 2021
c9af377
Cleaned up tests through removal of repeat definitions of struct data…
Aug 18, 2021
b552833
Removal of commented-out import statement
Aug 18, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,7 +1345,13 @@ def column_empty(
dtype = pandas_dtype(dtype)
children = () # type: Tuple[ColumnBase, ...]

if is_categorical_dtype(dtype):
if is_struct_dtype(dtype):
data = None
children = tuple(
column_empty(row_count, dtype.fields[f])
for f in dtype.fields.keys()
)
elif is_categorical_dtype(dtype):
rjzamora marked this conversation as resolved.
Show resolved Hide resolved
data = None
children = (
build_column(
Expand Down
48 changes: 48 additions & 0 deletions python/dask_cudf/dask_cudf/accessors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,53 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

class StructMethods:
def __init__(self, d_series):
self.d_series = d_series
def field(self, key):
"""
Extract children of the specified struct column
in the Series
Parameters
----------
key: int or str
index/position or field name of the respective
struct column
Returns
-------
Series
Examples
--------
>>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
>>> ds = dask_cudf.from_cudf(s, 2)
>>> ds.struct.field(0).compute()
0 1
1 3
dtype: int64
>>> ds.struct.field('a').compute()
0 1
1 3
dtype: int64
"""
try:
typ = self.d_series._meta.dtype.fields[key]
return self.d_series.map_partitions(
lambda s: s.struct.field(key),
meta=self.d_series._meta._constructor([], dtype=typ),
)
NV-jpt marked this conversation as resolved.
Show resolved Hide resolved
except KeyError as e:
if isinstance(key, int):
key_list = [dict_key for dict_key in self.d_series._meta.dtype.fields.keys()]
typ_key = key_list[key]
typ = self.d_series._meta.dtype.fields[typ_key]
return self.d_series.map_partitions(
lambda s: s.struct.field(key),
meta=self.d_series._meta._constructor([], dtype=typ),
)
else:
print('Field "' + str(key) + '" is not found in the set of existing keys.')
raise e



class ListMethods:
def __init__(self, d_series):
Expand Down
6 changes: 5 additions & 1 deletion python/dask_cudf/dask_cudf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from cudf import _lib as libcudf

from dask_cudf import sorting
from dask_cudf.accessors import ListMethods
from dask_cudf.accessors import ListMethods, StructMethods

DASK_VERSION = LooseVersion(dask.__version__)

Expand Down Expand Up @@ -414,6 +414,10 @@ def groupby(self, *args, **kwargs):
def list(self):
return ListMethods(self)

@property
def struct(self):
return StructMethods(self)


class Index(Series, dd.core.Index):
_partition_type = cudf.Index
Expand Down
74 changes: 74 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,3 +438,77 @@ def test_sorting(data, ascending, na_position, ignore_index):
.reset_index(drop=True)
)
assert_eq(expect, got)

rjzamora marked this conversation as resolved.
Show resolved Hide resolved
#############################################################################
# Struct Accessor #
#############################################################################
NV-jpt marked this conversation as resolved.
Show resolved Hide resolved



@pytest.mark.parametrize(
"data",
[
[{"a":5, "b":10},
{"a":3, "b":7},
{"a":-3, "b":11}],

[{"a":None, "b":1},
{"a":None, "b":0},
{"a":-3, "b":None}],

[{'a':1, 'b':2}],

[{'b':3, 'c':4}],

],
)
def test_create_struct_series(data):
expect = pd.Series(data)
ds_got = dgd.from_cudf(Series(data), 2)
assert_eq(expect, ds_got.compute())

@pytest.mark.parametrize(
"data",
[

[{"a":5, "b":10},
{"a":3, "b":7},
{"a":-3, "b":11}],

[{"a":None, "b":1},
{"a":None, "b":0},
{"a":-3, "b":None}],

[{'a':1, 'b':2}],

[{'b':3, 'c':4}],

],
)
def test_struct_field_a(data):
expect = Series(data).struct.field('a')
ds_got = dgd.from_cudf(Series(data), 2).struct.field('a')
assert_eq(expect, ds_got.compute())

rjzamora marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize(
"data",
[

[{"a":5, "b":10},
{"a":3, "b":7},
{"a":-3, "b":11}],

[{"a":None, "b":1},
{"a":None, "b":0},
{"a":-3, "b":None}],

[{'a':1, 'b':2}],

[{'b':3, 'c':4}],

],
)
def test_struct_field_zero(data):
expect = Series(data).struct.field(0)
ds_got = dgd.from_cudf(Series(data), 2).struct.field(0)
assert_eq(expect, ds_got.compute())
36 changes: 36 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytest

import cudf

import dask_cudf


@pytest.mark.parametrize(
"data, column",
[
(
{
"a": [{"a": [1, 2, 3, 4], "b": "Hello world"}, {}, {"a": []}],
"b": [1, 2, 3],
"c": ["rapids", "cudf", "hi"],
},
"a",
),
(
{"a": [{}, {}, {}], "b": [1, 2, 3], "c": ["rapids", "cudf", "hi"]},
"a",
),
(
{
"a": [{}, {}, {}],
"b": [{"a": 1}, {"b": 5}, {"c": "Hello"}],
"c": ["rapids", "cudf", "hi"],
},
"b",
),
],
)
def test_select_struct(data, column):
df = cudf.DataFrame(data)
ddf = dask_cudf.from_cudf(df, 2)
assert df[column].to_arrow() == ddf[column].compute().to_arrow()