Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cudf.Series constructor to handle list of sequences #8735

Merged
merged 13 commits into from
Jul 20, 2021
Merged
53 changes: 52 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,6 @@ def as_column(
mask = bools_to_mask(as_column(mask).unary_operator("not"))

data = data.set_mask(mask)

else:
try:
data = as_column(
Expand Down Expand Up @@ -2098,6 +2097,15 @@ def as_column(
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, list)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
)
):
return _create_list_column_from_sequences_list(arbitrary)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
else:
data = as_column(
_construct_array(arbitrary, dtype),
Expand Down Expand Up @@ -2371,3 +2379,46 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
) from e
raise
return col


def _create_list_column_from_sequences_list(arbitrary: List[ColumnLike]):
    """
    Assemble one list column from a list of column-like sequences.

    Every entry of ``arbitrary`` becomes one row of the result. The first
    entry counts as a valid row only when it is column-like; each later
    entry is a null row when it is a null host scalar, and is otherwise
    converted with ``as_column`` and appended to the flattened child data.

    Parameters
    ----------
    arbitrary : List[ColumnLike]
        The rows to pack. ``arbitrary[0]`` is read unconditionally, so the
        list must not be empty.

    Returns
    -------
    ListColumn
        A column with ``len(arbitrary)`` rows whose children are the
        int32 offsets column and the concatenated row data.
    """
    first_row = arbitrary[0]
    if cudf.utils.dtypes.is_column_like(first_row):
        child_values = as_column(first_row)
        row_is_valid = [True]
    else:
        # Anything that is not column-like in the leading position is
        # recorded as a null row backed by an empty child column.
        child_values = column_empty(row_count=0)
        row_is_valid = [False]
    row_sizes = [len(child_values)]

    # Walk the remaining rows, growing the child data and recording the
    # validity flag and length of each row.
    for row in arbitrary[1:]:
        row_is_null = cudf._lib.scalar._is_null_host_scalar(row)
        row_is_valid.append(not row_is_null)
        if row_is_null:
            row_sizes.append(0)
            continue
        child_values = child_values.append(as_column(row))
        row_sizes.append(len(row))

    # Offsets are the running total of the row lengths, led by a zero.
    row_count = len(arbitrary)
    offsets = column_empty(row_count=row_count + 1, dtype="int32")
    offsets[0] = 0
    offsets[1:] = row_sizes
    offsets = cast(cudf.core.column.NumericalColumn, offsets)._apply_scan_op(
        "sum"
    )

    validity_mask = cudf._lib.transform.bools_to_mask(as_column(row_is_valid))

    # NOTE(review): null_count is passed as 0 even though the mask may mark
    # rows as null -- confirm the column recomputes it from the mask.
    return cudf.core.column.ListColumn(
        size=row_count,
        dtype=cudf.ListDtype(child_values.dtype),
        mask=validity_mask,
        offset=0,
        null_count=0,
        children=(offsets, child_values),
    )
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __init__(
if isinstance(data, dict):
index = data.keys()
data = column.as_column(
data.values(), nan_as_null=nan_as_null, dtype=dtype
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

if data is None:
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from string import ascii_letters, digits

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1203,3 +1204,29 @@ def test_explode(data, ignore_index, p_index):
assert_eq(expect, got, check_dtype=False)
else:
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "data, expected",
    [
        # Every row is a cudf.Series.
        (
            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
            cudf.Series([[1, 2, 3], [10, 20]]),
        ),
        # A None row in the middle, and a NaN inside the last row.
        (
            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
        ),
        # cupy arrays as rows, with cudf.NA marking the null row.
        (
            [cp.array([5, 6]), cudf.NA, cp.array([1])],
            cudf.Series([[5, 6], None, [1]]),
        ),
        # Several leading None rows before the first real sequence.
        (
            [None, None, None, None, None, cudf.Series([10, 20])],
            cudf.Series([None, None, None, None, None, [10, 20]]),
        ),
    ],
)
def test_nested_series_from_sequence_data(data, expected):
    """A list of sequences passed to cudf.Series matches the nested-list form."""
    assert_eq(cudf.Series(data), expected)
14 changes: 14 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,17 @@ def test_groupby_agg_redirect(aggregations):
)
def test_is_supported(arg):
assert _is_supported(arg, {"supported"}) is False


def test_groupby_unique_lists():
    """groupby("a").b.unique() agrees across dask, cudf and dask_cudf."""
    pandas_frame = pd.DataFrame(
        {"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}
    )
    dask_frame = dd.from_pandas(pandas_frame, 2)
    cudf_frame = cudf.from_pandas(pandas_frame)
    dask_cudf_frame = dask_cudf.from_cudf(cudf_frame, 2)

    # dask-on-pandas versus dask-on-cudf.
    from_dask = dask_frame.groupby("a").b.unique().compute()
    from_dask_cudf = dask_cudf_frame.groupby("a").b.unique().compute()
    dd.assert_eq(from_dask, from_dask_cudf)

    # Plain cudf versus a fresh dask-on-cudf computation.
    from_cudf = cudf_frame.groupby("a").b.unique()
    from_dask_cudf_again = dask_cudf_frame.groupby("a").b.unique().compute()
    dd.assert_eq(from_cudf, from_dask_cudf_again)