Skip to content

Commit

Permalink
Add dseries.struct.explode (#9086)
Browse files Browse the repository at this point in the history
Closes #8660 

Per the discussion in #8872, this PR adds an `explode` method to the struct accessor. It provides a lateral view of a struct-typed series: a DataFrame with one column per struct field.

Example: 
```python
>>> import cudf, dask_cudf as dgd
>>> ds = dgd.from_cudf(cudf.Series(
...     [{'a': 42, 'b': 'str1', 'c': [-1]},
...      {'a': 0,  'b': 'str2', 'c': [400, 500]},
...      {'a': 7,  'b': '',     'c': []}]), npartitions=2)
>>> ds.struct.explode().compute()
    a     b           c
0  42  str1        [-1]
1   0  str2  [400, 500]
2   7                []
```

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: #9086
  • Loading branch information
isVoid authored Sep 22, 2021
1 parent 08cbbcd commit 10fd071
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
26 changes: 26 additions & 0 deletions python/dask_cudf/dask_cudf/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,32 @@ def field(self, key):
meta=self.d_series._meta._constructor([], dtype=typ),
)

def explode(self):
    """
    Return a DataFrame view of the struct column, one column per field.

    ``struct.explode()`` is applied to every partition of the wrapped
    series through ``map_partitions``; the ``meta`` passed along is
    obtained by exploding the series' own ``_meta``.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> import cudf, dask_cudf
    >>> ds = dask_cudf.from_cudf(cudf.Series(
    ...     [{'a': 42, 'b': 'str1', 'c': [-1]},
    ...      {'a': 0,  'b': 'str2', 'c': [400, 500]},
    ...      {'a': 7,  'b': '',     'c': []}]), npartitions=2)
    >>> ds.struct.explode().compute()
        a     b           c
    0  42  str1        [-1]
    1   0  str2  [400, 500]
    2   7                []
    """
    series = self.d_series
    # Explode the meta object the same way as the partitions so the
    # declared output matches what each partition produces.
    exploded_meta = series._meta.struct.explode()
    return series.map_partitions(
        lambda part: part.struct.explode(),
        meta=exploded_meta,
    )


class ListMethods:
def __init__(self, d_series):
Expand Down
15 changes: 15 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,18 @@ def test_dask_struct_field_Int_Error(data):

with pytest.raises(IndexError):
got.struct.field(1000).compute()


@pytest.mark.parametrize(
    "data",
    [
        [{}, {}, {}],
        [{"a": 100, "b": "abc"}, {"a": 42, "b": "def"}, {"a": -87, "b": ""}],
        [{"a": [1, 2, 3], "b": {"c": 101}}, {"a": [4, 5], "b": {"c": 102}}],
    ],
)
def test_struct_explode(data):
    """Check the two-partition ``struct.explode()`` against plain ``Series``.

    The parametrized inputs cover empty structs, flat structs with
    scalar fields, and structs holding list and nested-struct fields.
    """
    # Reference result: explode the Series directly.
    expected = Series(data).struct.explode()

    # Same data split over two partitions, exploded through the accessor.
    partitioned = dgd.from_cudf(Series(data), npartitions=2)
    actual = partitioned.struct.explode().compute()

    assert_eq(expected, actual)

0 comments on commit 10fd071

Please sign in to comment.