Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cudf.Series constructor to handle list of sequences #8735

Merged
merged 13 commits into from
Jul 20, 2021
Merged
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,6 @@ def as_column(
mask = bools_to_mask(as_column(mask).unary_operator("not"))

data = data.set_mask(mask)

else:
try:
data = as_column(
Expand Down Expand Up @@ -2098,6 +2097,17 @@ def as_column(
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
)
):
return cudf.core.column.ListColumn.from_sequences(
arbitrary
)
else:
data = as_column(
_construct_array(arbitrary, dtype),
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import pickle
from typing import Sequence

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -278,6 +279,42 @@ def leaves(self):
else:
return self.elements

@classmethod
def from_sequences(
    cls, arbitrary: Sequence[ColumnLike]
) -> "cudf.core.column.ListColumn":
    """
    Create a list column from a sequence of column-like sequences.

    Each element of ``arbitrary`` becomes one row of the result. An
    element for which ``cudf._lib.scalar._is_null_host_scalar`` is true
    becomes a null row that contributes no child values; every other
    element is converted with ``as_column`` and appended to the shared
    child data column.

    Parameters
    ----------
    arbitrary : Sequence[ColumnLike]
        The rows of the list column, in order.

    Returns
    -------
    ListColumn
        Column with ``len(arbitrary)`` rows whose children are the
        int32 offsets column and the concatenated data column.
    """
    data_col = column.column_empty(0)
    mask_col = []
    offset_col = [0]
    offset = 0

    # Build Data, Mask & Offsets
    for data in arbitrary:
        is_valid = not cudf._lib.scalar._is_null_host_scalar(data)
        mask_col.append(is_valid)
        if is_valid:
            data_col = data_col.append(as_column(data))
            offset += len(data)
        # A null row repeats the running offset, i.e. a zero-length slice.
        offset_col.append(offset)

    offset_col = column.as_column(offset_col, dtype="int32")

    # Build ListColumn
    return cls(
        size=len(arbitrary),
        dtype=cudf.ListDtype(data_col.dtype),
        mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
        offset=0,
        # Keep the reported null count consistent with the mask built
        # above instead of hard-coding 0 when null rows are present.
        null_count=mask_col.count(False),
        children=(offset_col, data_col),
    )


class ListMethods(ColumnMethods):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __init__(
if isinstance(data, dict):
index = data.keys()
data = column.as_column(
data.values(), nan_as_null=nan_as_null, dtype=dtype
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

if data is None:
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from string import ascii_letters, digits

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1203,3 +1204,29 @@ def test_explode(data, ignore_index, p_index):
assert_eq(expect, got, check_dtype=False)
else:
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "data, expected",
    [
        # Two Series of different lengths, no nulls.
        (
            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
            cudf.Series([[1, 2, 3], [10, 20]]),
        ),
        # A None row between two Series; the last Series holds a NaN.
        (
            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
        ),
        # CuPy arrays as rows, with cudf.NA as the null marker.
        (
            [cp.array([5, 6]), cudf.NA, cp.array([1])],
            cudf.Series([[5, 6], None, [1]]),
        ),
        # Several leading None rows before the first real row.
        (
            [None, None, None, None, None, cudf.Series([10, 20])],
            cudf.Series([None, None, None, None, None, [10, 20]]),
        ),
    ],
)
def test_nested_series_from_sequence_data(data, expected):
    """
    A Series built from a list of column-like objects must equal the
    Series built from the equivalent nested Python lists.
    """
    assert_eq(cudf.Series(data), expected)
14 changes: 14 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,17 @@ def test_groupby_agg_redirect(aggregations):
)
def test_is_supported(arg):
assert _is_supported(arg, {"supported"}) is False


def test_groupby_unique_lists():
    """
    Check that ``groupby(...).unique()`` on a dask_cudf frame agrees
    with both the dask (pandas-backed) result and the plain cudf result.
    """
    df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
    ddf = dd.from_pandas(df, 2)
    gdf = cudf.from_pandas(df)
    gddf = dask_cudf.from_cudf(gdf, 2)

    # dask (pandas-backed) vs dask_cudf, both computed
    dd.assert_eq(
        ddf.groupby("a").b.unique().compute(),
        gddf.groupby("a").b.unique().compute(),
    )
    # plain cudf vs dask_cudf computed
    dd.assert_eq(
        gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(),
    )