Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cudf.Series constructor to handle list of sequences #8735

Merged
merged 13 commits into from
Jul 20, 2021
Merged
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,6 @@ def as_column(
mask = bools_to_mask(as_column(mask).unary_operator("not"))

data = data.set_mask(mask)

else:
try:
data = as_column(
Expand Down Expand Up @@ -2098,6 +2097,17 @@ def as_column(
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
)
):
return cudf.core.column.ListColumn.from_sequences(
arbitrary
)
else:
data = as_column(
_construct_array(arbitrary, dtype),
Expand Down
41 changes: 41 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import pickle
from typing import Sequence, cast

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -278,6 +279,46 @@ def leaves(self):
else:
return self.elements

@classmethod
def from_sequences(cls, arbitrary: Sequence[ColumnLike]):
    """
    Create a list column from a sequence of column-like sequences.

    Each element of ``arbitrary`` becomes one row of the resulting list
    column. Elements recognised as null host scalars by
    ``cudf._lib.scalar._is_null_host_scalar`` become null rows of
    length zero; every other element is converted with ``as_column``
    and appended to the shared child data column.

    Parameters
    ----------
    arbitrary : Sequence[ColumnLike]
        The rows of the list column, in order.

    Returns
    -------
    An instance of ``cls`` with ``len(arbitrary)`` rows whose children
    are ``(offsets, data)``.
    """
    data_col = column.column_empty(0)
    mask_col = []
    lengths_col = []

    # Build the child data column, the per-row validity flags and the
    # per-row lengths in a single pass over the input.
    # NOTE(review): ``append`` is re-applied to the accumulated column on
    # every iteration; presumably this copies the data each time - confirm
    # whether a single concatenation would be preferable for long inputs.
    for data in arbitrary:
        if cudf._lib.scalar._is_null_host_scalar(data):
            mask_col.append(False)
            lengths_col.append(0)
        else:
            mask_col.append(True)
            data_col = data_col.append(as_column(data))
            lengths_col.append(len(data))

    # Build offsets: a leading 0 followed by the row lengths, turned into
    # running totals by the "sum" scan.
    offset_col = column.column_empty(
        row_count=len(arbitrary) + 1, dtype="int32"
    )
    offset_col[0] = 0
    offset_col[1:] = lengths_col
    offset_col = cast(
        cudf.core.column.NumericalColumn, offset_col
    )._apply_scan_op("sum")

    # Build the list column. Report the real number of null rows rather
    # than a hard-coded 0, so the count agrees with the mask built from
    # ``mask_col``.
    res = cls(
        size=len(arbitrary),
        dtype=cudf.ListDtype(data_col.dtype),
        mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
        offset=0,
        null_count=mask_col.count(False),
        children=(offset_col, data_col),
    )
    return res


class ListMethods(ColumnMethods):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __init__(
if isinstance(data, dict):
index = data.keys()
data = column.as_column(
data.values(), nan_as_null=nan_as_null, dtype=dtype
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

if data is None:
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from string import ascii_letters, digits

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1203,3 +1204,29 @@ def test_explode(data, ignore_index, p_index):
assert_eq(expect, got, check_dtype=False)
else:
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "data, expected",
    [
        # Plain list of Series objects of differing lengths.
        (
            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
            cudf.Series([[1, 2, 3], [10, 20]]),
        ),
        # A None row between Series, one of which contains NaN.
        (
            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
        ),
        # CuPy arrays with a cudf.NA row in between.
        (
            [cp.array([5, 6]), cudf.NA, cp.array([1])],
            cudf.Series([[5, 6], None, [1]]),
        ),
        # Several leading None rows before the first real sequence.
        (
            [None, None, None, None, None, cudf.Series([10, 20])],
            cudf.Series([None, None, None, None, None, [10, 20]]),
        ),
    ],
)
def test_nested_series_from_sequence_data(data, expected):
    """
    Check that ``cudf.Series`` built from a list of column-like
    sequences matches the same Series built from nested Python lists.
    """
    assert_eq(cudf.Series(data), expected)
14 changes: 14 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,17 @@ def test_groupby_agg_redirect(aggregations):
)
def test_is_supported(arg):
assert _is_supported(arg, {"supported"}) is False


def test_groupby_unique_lists():
    """
    Compare ``groupby("a").b.unique()`` across dask (pandas-backed),
    cudf and dask_cudf for the same small frame split in 2 partitions.
    """
    pdf = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
    dask_pdf = dd.from_pandas(pdf, 2)
    gpu_df = cudf.from_pandas(pdf)
    dask_gpu_df = dask_cudf.from_cudf(gpu_df, 2)

    # dask-on-pandas result versus dask_cudf result.
    cpu_unique = dask_pdf.groupby("a").b.unique().compute()
    dask_gpu_unique = dask_gpu_df.groupby("a").b.unique().compute()
    dd.assert_eq(cpu_unique, dask_gpu_unique)

    # Single-GPU cudf result versus a freshly computed dask_cudf result.
    gpu_unique = gpu_df.groupby("a").b.unique()
    dask_gpu_unique_again = dask_gpu_df.groupby("a").b.unique().compute()
    dd.assert_eq(gpu_unique, dask_gpu_unique_again)