From bf46f3c231bed9f12c6c1b5fccddfa98d649979e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 20 Aug 2021 15:51:33 -0700 Subject: [PATCH 1/3] initial --- python/dask_cudf/dask_cudf/accessors.py | 31 +++++++++++++++++++ python/dask_cudf/dask_cudf/core.py | 6 +++- .../dask_cudf/tests/test_accessor.py | 15 +++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 04d3e20b844..f7af28c2922 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,6 +1,37 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +class StructMethods: + def __init__(self, d_series): + self.d_series = d_series + + def explode(self): + """ + Creates a dataframe view of the struct column, one column per field. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf, dask_cudf as dgd + >>> ds = dgd.from_cudf(cudf.Series( + ... [{'a': 42, 'b': 'str1', 'c': [-1]}, + ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, + ... {'a': 7, 'b': '', 'c': []}]), npartitions=2) + >>> ds.struct.explode().compute() + a b c + 0 42 str1 [-1] + 1 0 str2 [400, 500] + 2 7 [] + """ + return self.d_series.map_partitions( + lambda s: s.struct.explode(), + meta=self.d_series._meta.struct.explode(), + ) + + class ListMethods: def __init__(self, d_series): self.d_series = d_series diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 1a632907047..f1fb408b0d1 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -27,7 +27,7 @@ from cudf import _lib as libcudf from dask_cudf import sorting -from dask_cudf.accessors import ListMethods +from dask_cudf.accessors import ListMethods, StructMethods DASK_VERSION = LooseVersion(dask.__version__) @@ -414,6 +414,10 @@ def groupby(self, *args, **kwargs): def list(self): return ListMethods(self) + @property + def struct(self): + return StructMethods(self) + class Index(Series, dd.core.Index): _partition_type = cudf.Index diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 342f2b60180..8cbaca55090 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -438,3 +438,18 @@ def test_sorting(data, ascending, na_position, ignore_index): .reset_index(drop=True) ) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [{}, {}, {}], + [{"a": 100, "b": "abc"}, {"a": 42, "b": "def"}, {"a": -87, "b": ""}], + [{"a": [1, 2, 3], "b": {"c": 101}}, {"a": [4, 5], "b": {"c": 102}}], + ], +) +def test_struct_explode(data): + expect = pd.DataFrame(data) + got = dgd.from_cudf(Series(data), 2).struct.explode().compute() + + assert_eq(expect, got) From bd9be975331cf39df544ac03319a01f31fb9c201 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 24 Aug 2021 14:33:04 -0700 Subject: [PATCH 2/3] style --- python/dask_cudf/dask_cudf/accessors.py | 2 +- python/dask_cudf/dask_cudf/tests/test_accessor.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 2f69b84ebd4..34626dc42ba 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -36,7 +36,7 @@ def field(self, key): lambda s: s.struct.field(key), meta=self.d_series._meta._constructor([], dtype=typ), ) - + def explode(self): """ Creates a dataframe view of the struct column, one column per field. diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index fe9498e0f03..8a98cdebc84 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -501,6 +501,7 @@ def test_dask_struct_field_Int_Error(data): with pytest.raises(IndexError): got.struct.field(1000).compute() + @pytest.mark.parametrize( "data", [ From 61bc39d506c5af8cb55925b2ec29141c05784cd5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 25 Aug 2021 11:54:38 -0700 Subject: [PATCH 3/3] Review comments --- python/dask_cudf/dask_cudf/accessors.py | 4 ++-- python/dask_cudf/dask_cudf/tests/test_accessor.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 34626dc42ba..1c21fca51c8 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -47,8 +47,8 @@ def explode(self): Examples -------- - >>> import cudf, dask_cudf as dgd - >>> ds = dgd.from_cudf(cudf.Series( + >>> import cudf, dask_cudf + >>> ds = dask_cudf.from_cudf(cudf.Series( ... [{'a': 42, 'b': 'str1', 'c': [-1]}, ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, ... {'a': 7, 'b': '', 'c': []}]), npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 8a98cdebc84..2c02afd96a9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -511,7 +511,7 @@ def test_dask_struct_field_Int_Error(data): ], ) def test_struct_explode(data): - expect = pd.DataFrame(data) - got = dgd.from_cudf(Series(data), 2).struct.explode().compute() + expect = Series(data).struct.explode() + got = dgd.from_cudf(Series(data), 2).struct.explode() - assert_eq(expect, got) + assert_eq(expect, got.compute())