From cdcc91c697089c7b477cfcbf6333ed91d27425b8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 20 Jul 2021 19:23:52 -0400 Subject: [PATCH] Add `struct.explode()` method (#8729) Part of #8660. Note that the issue is asking for this feature in _dask-cudf_, which this PR does not implement. Depends on: #8306 Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/8729 --- python/cudf/cudf/core/column/struct.py | 37 ++++++++++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 9 +++++-- python/cudf/cudf/tests/test_struct.py | 26 ++++++++++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 776a6f5efd6..6988128606e 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -150,6 +150,8 @@ class StructMethods(ColumnMethods): Struct methods for Series """ + _column: StructColumn + def __init__(self, parent=None): if not is_struct_dtype(parent.dtype): raise AttributeError( @@ -190,3 +192,38 @@ def field(self, key): return self._return_or_inplace(self._column.children[pos]) else: return self._return_or_inplace(self._column.children[key]) + + def explode(self): + """ + Return a DataFrame whose columns are the fields of this struct Series. + + Notes + ----- + Note that a copy of the columns is made. + + Examples + -------- + >>> s + 0 {'a': 1, 'b': 'x'} + 1 {'a': 2, 'b': 'y'} + 2 {'a': 3, 'b': 'z'} + 3 {'a': 4, 'b': 'a'} + dtype: struct + + >>> s.struct.explode() + a b + 0 1 x + 1 2 y + 2 3 z + 3 4 a + """ + return cudf.DataFrame._from_data( + cudf.core.column_accessor.ColumnAccessor( + { + name: col.copy(deep=True) + for name, col in zip( + self._column.dtype.fields, self._column.children + ) + } + ) + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d7733920fb4..e9e3cd71ddb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7563,18 +7563,23 @@ def to_dict(self, orient="dict", into=dict): def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. - Note that no copies of the data are made. Parameters ---------- name: optional Name of the resulting Series + + Notes + ----- + Note that a copy of the columns is made. """ col = cudf.core.column.build_struct_column( names=self._data.names, children=self._data.columns, size=len(self) ) return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor({name: col}), + cudf.core.column_accessor.ColumnAccessor( + {name: col.copy(deep=True)} + ), index=self.index, name=name, ) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 8638be72f71..e9d340185ec 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -166,6 +166,28 @@ def test_struct_scalar_null(): assert slr.device_value.value is cudf.NA +def test_struct_explode(): + s = cudf.Series([], dtype=cudf.StructDtype({})) + expect = cudf.DataFrame({}) + assert_eq(expect, s.struct.explode()) + + s = cudf.Series( + [ + {"a": 1, "b": "x"}, + {"a": 2, "b": "y"}, + {"a": 3, "b": "z"}, + {"a": 4, "b": "a"}, + ] + ) + expect = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}) + got = s.struct.explode() + assert_eq(expect, got) + + # check that a copy was made: + got["a"][0] = 5 + assert_eq(s.struct.explode(), expect) + + def test_dataframe_to_struct(): df = cudf.DataFrame() expect = cudf.Series(dtype=cudf.StructDtype({})) @@ -179,6 +201,10 @@ def test_dataframe_to_struct(): got = df.to_struct() assert_eq(expect, got) + # check that a copy was made: + df["a"][0] = 5 + assert_eq(got, expect) + @pytest.mark.parametrize( "series, start, end",