From 4ab09dfe9f7a5bb592442468ca5c904bbef84c72 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 14:24:14 -0400 Subject: [PATCH 1/4] Add `.list.astype()` to cast list leaves to specified dtype --- python/cudf/cudf/core/column/lists.py | 53 ++++++++++++++++++++++----- python/cudf/cudf/tests/test_list.py | 10 +++++ 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b383f7bc321..fd399a62277 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -299,16 +299,29 @@ def as_string_column( """ Create a strings column from a list column """ - # Convert the leaf child column to strings column + lc = self._transform_leaves(lambda col: col.as_string_column(dtype)) + + # Separator strings to match the Python format + separators = as_column([", ", "[", "]"]) + + # Call libcudf to format the list column + return format_list_column(lc, separators) + + def _transform_leaves(self, func, *args, **kwargs): + # return a new list column with the same nested structure + # as ``self``, but with the leaf column transformed + # by applying ``func`` to it + cc: List[ListColumn] = [] c: ColumnBase = self + while isinstance(c, ListColumn): cc.insert(0, c) c = c.children[1] - s = c.as_string_column(dtype) + + lc = func(c, *args, **kwargs) # Rebuild the list column replacing just the leaf child - lc = s for c in cc: o = c.children[0] lc = cudf.core.column.ListColumn( # type: ignore @@ -319,12 +332,7 @@ def as_string_column( null_count=c.null_count, children=(o, lc), ) - - # Separator strings to match the Python format - separators = as_column([", ", "[", "]"]) - - # Call libcudf to format the list column - return format_list_column(lc, separators) + return lc class ListMethods(ColumnMethods): @@ -715,3 +723,30 @@ def concat(self, dropna=True) -> ParentType: "of nesting" ) return self._return_or_inplace(result) + + def astype(self, dtype): + 
""" + Return a new list Series with the leaf values cast + to the specified data type. + + Parameters + ---------- + dtype: data type to cast leaf values to + + Returns + ------- + A new Series of lists + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([[1, 2], [3, 4]]) + >>> s.dtype + ListDtype(int64) + >>> s2 = s.list.astype("float64") + >>> s2.dtype + ListDtype(float64) + """ + return self._return_or_inplace( + self._column._transform_leaves(lambda col: col.astype(dtype)) + ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index cf53a3525ef..4b7a1e03511 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -759,3 +759,13 @@ def test_listcol_setitem_retain_dtype(): # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ df2 = df1.copy() assert df2["a"].dtype == df["a"].dtype + + +def test_list_astype(): + s = cudf.Series([[1, 2], [3, 4]]) + s2 = s.list.astype("float64") + assert s2.dtype == cudf.ListDtype("float64") + + s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) + s2 = s.list.astype("string") + assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) From 0f22f2fb6696a1435f273acbd79f9779c4ba708d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 14:28:42 -0400 Subject: [PATCH 2/4] Lose the `import` --- python/cudf/cudf/core/column/lists.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index fd399a62277..158cedbffd9 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -739,7 +739,6 @@ def astype(self, dtype): Examples -------- - >>> import cudf >>> s = cudf.Series([[1, 2], [3, 4]]) >>> s.dtype ListDtype(int64) From 0cf1b71e9ec00db2702f6a8e61c000aee73db3a5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 20 Apr 2022 14:43:05 -0400 Subject: [PATCH 3/4] Use *args, not closure ---
python/cudf/cudf/core/column/lists.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 158cedbffd9..df6aaa91a2b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -299,7 +299,9 @@ def as_string_column( """ Create a strings column from a list column """ - lc = self._transform_leaves(lambda col: col.as_string_column(dtype)) + lc = self._transform_leaves( + lambda col, dtype: col.as_string_column(dtype), dtype + ) # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) @@ -747,5 +749,7 @@ def astype(self, dtype): ListDtype(float64) """ return self._return_or_inplace( - self._column._transform_leaves(lambda col: col.astype(dtype)) + self._column._transform_leaves( + lambda col, dtype: col.astype(dtype), dtype + ) ) From 53ca86203ba0629d430eaa0ff4852b3181f1c855 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 21 Apr 2022 14:43:27 -0400 Subject: [PATCH 4/4] Also test leaf values --- python/cudf/cudf/tests/test_list.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 4b7a1e03511..c21e1a0f61f 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -765,7 +765,9 @@ def test_list_astype(): s = cudf.Series([[1, 2], [3, 4]]) s2 = s.list.astype("float64") assert s2.dtype == cudf.ListDtype("float64") + assert_eq(s.list.leaves.astype("float64"), s2.list.leaves) s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) s2 = s.list.astype("string") assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) + assert_eq(s.list.leaves.astype("string"), s2.list.leaves)