Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add struct.explode() method #8729

Merged
merged 28 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ab35f7d
Add initial Buffer.copy()
shwina Apr 29, 2021
72197a2
Add Buffer copy tests
shwina Apr 29, 2021
9b4611b
Docstring
shwina Apr 29, 2021
8d64f84
Initial refactor of ColumnMethods
shwina May 20, 2021
7d9fcc5
More refactoring
shwina May 20, 2021
8911fea
More refactoring
shwina May 20, 2021
2a32405
More refactoring
shwina May 20, 2021
b713e13
parent can never be None
shwina May 20, 2021
50e6fa3
Redundant docstring
shwina May 20, 2021
9d91246
MyPy fix
shwina May 20, 2021
4e7d481
Merge branch 'branch-21.06' of https://github.com/rapidsai/cudf into …
shwina May 25, 2021
a2bd07a
Fix leaves method
shwina May 26, 2021
6142f96
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
shwina Jul 12, 2021
d9583d0
Merge branch 'column-methods-cleanup' of github.com:shwina/cudf into …
shwina Jul 12, 2021
e55c4e8
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
shwina Jul 13, 2021
487378c
Move copyright
shwina Jul 13, 2021
e014082
Add a to_struct() method
shwina Jul 13, 2021
a1fc9f2
Merge branch 'column-methods-cleanup' into struct-explode
shwina Jul 13, 2021
cb78803
Add struct.explode() method
shwina Jul 13, 2021
d99e9ea
Rearrange some logic in as_column
shwina Jul 14, 2021
599effa
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
shwina Jul 15, 2021
07f27e6
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
shwina Jul 19, 2021
e60c008
Make copies when exploding a struct column
shwina Jul 19, 2021
448c83b
Also copy in to_struct()
shwina Jul 20, 2021
9710c06
Test it
shwina Jul 20, 2021
a9f097d
Add Notes section to docstrings. Explicit deep=True in call to `copy()`.
shwina Jul 20, 2021
7b26d51
Another deep=True
shwina Jul 20, 2021
f699e2a
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
shwina Jul 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2037,13 +2037,20 @@ def as_column(
np_type = None
try:
if dtype is not None:
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
if is_list_dtype(dtype):
data = pa.array(arbitrary)
if type(data) not in (pa.ListArray, pa.NullArray):
raise ValueError(
"Cannot create list column from given data"
)
return as_column(data, nan_as_null=nan_as_null)
elif isinstance(
dtype, cudf.StructDtype
) and not isinstance(dtype, cudf.IntervalDtype):
data = pa.array(arbitrary, type=dtype.to_arrow())
return as_column(data, nan_as_null=nan_as_null)
if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
data = pa.array(
arbitrary,
Expand All @@ -2065,14 +2072,11 @@ def as_column(
data
)
dtype = pd.api.types.pandas_dtype(dtype)
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
np_type = np.dtype(dtype).type
if np_type == np.bool_:
pa_type = pa.bool_()
else:
np_type = np.dtype(dtype).type
if np_type == np.bool_:
pa_type = pa.bool_()
else:
pa_type = np_to_pa_dtype(np.dtype(dtype))
pa_type = np_to_pa_dtype(np.dtype(dtype))
data = as_column(
pa.array(
arbitrary,
Expand Down
34 changes: 34 additions & 0 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ class StructMethods(ColumnMethods):
Struct methods for Series
"""

_column: StructColumn
vyasr marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, parent=None):
if not is_struct_dtype(parent.dtype):
raise AttributeError(
Expand Down Expand Up @@ -181,3 +183,35 @@ def field(self, key):
return self._return_or_inplace(self._column.children[pos])
else:
return self._return_or_inplace(self._column.children[key])

def explode(self):
    """
    Return a DataFrame whose columns are the fields of this struct Series.

    Each child column of the underlying struct column becomes one
    column of the result, keyed by the corresponding field name of
    the struct dtype.

    Returns
    -------
    DataFrame

    Notes
    -----
    A deep copy of every child column is made, so the returned
    DataFrame does not share column objects with this struct Series.

    Examples
    --------
    >>> s
    0    {'a': 1, 'b': 'x'}
    1    {'a': 2, 'b': 'y'}
    2    {'a': 3, 'b': 'z'}
    3    {'a': 4, 'b': 'a'}
    dtype: struct

    >>> s.struct.explode()
       a  b
    0  1  x
    1  2  y
    2  3  z
    3  4  a
    """
    field_names = self._column.dtype.fields
    children = self._column.children
    return cudf.DataFrame._from_data(
        cudf.core.column_accessor.ColumnAccessor(
            {
                # Copy each child: handing out the struct's own children
                # would let in-place edits of the result leak back into
                # the source Series.
                name: col.copy(deep=True)
                for name, col in zip(field_names, children)
            }
        )
    )
23 changes: 23 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7477,6 +7477,29 @@ def to_dict(self, orient="dict", into=dict):
"`.to_pandas().to_dict()` to construct a Python dictionary."
)

def to_struct(self, name=None, index=None):
    """
    Return a struct Series composed of the columns of the DataFrame.

    Parameters
    ----------
    name: optional
        Name of the resulting Series
    index: optional
        Index of the resulting Series. If not provided, the index
        of the DataFrame is used.

    Returns
    -------
    Series

    Notes
    -----
    A deep copy of the struct column is made before it is wrapped in
    the resulting Series, rather than handing out the column built
    directly over this DataFrame's columns.
    """
    col = cudf.core.column.build_struct_column(
        names=self._data.names, children=self._data.columns, size=len(self)
    )
    if index is None:
        index = self.index
    return cudf.Series._from_data(
        # ``col`` was built directly over this DataFrame's columns;
        # copy it so the struct Series does not alias them.
        cudf.core.column_accessor.ColumnAccessor(
            {name: col.copy(deep=True)}
        ),
        index=as_index(index),
        name=name,
    )

def keys(self):
"""
Get the columns.
Expand Down
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,20 @@ def test_struct_scalar_host_construction(data):
def test_struct_scalar_null():
slr = cudf.Scalar(cudf.NA, dtype=StructDtype)
assert slr.device_value.value is cudf.NA


def test_struct_explode():
    """Check ``struct.explode()`` on an empty and a two-field struct Series."""
    # A struct Series with no fields explodes to an empty DataFrame.
    empty_structs = cudf.Series([], dtype=cudf.StructDtype({}))
    expected_empty = cudf.DataFrame({})
    assert_eq(expected_empty, empty_structs.struct.explode())

    # Each struct field becomes a column named after that field.
    records = [
        {"a": 1, "b": "x"},
        {"a": 2, "b": "y"},
        {"a": 3, "b": "z"},
        {"a": 4, "b": "a"},
    ]
    struct_series = cudf.Series(records)
    expected_frame = cudf.DataFrame(
        {"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}
    )
    assert_eq(expected_frame, struct_series.struct.explode())