Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cudf.Series constructor to handle list of sequences #8735

Merged
merged 13 commits into from
Jul 20, 2021
Merged
47 changes: 46 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2026,7 +2026,12 @@ def as_column(
mask = bools_to_mask(as_column(mask).unary_operator("not"))

data = data.set_mask(mask)

elif (
isinstance(arbitrary, list)
and len(arbitrary) > 0
and cudf.utils.dtypes.is_column_like(arbitrary[0])
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
):
return _create_list_column_from_sequences_list(arbitrary)
else:
try:
data = as_column(
Expand Down Expand Up @@ -2371,3 +2376,43 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
) from e
raise
return col


def _create_list_column_from_sequences_list(arbitrary: List[ColumnLike]):
    """
    Build a list column from a list of column-like sequences.

    Each element of ``arbitrary`` becomes one row of the result. The
    first element is always converted with ``as_column``; every later
    element that ``_is_null_host_scalar`` reports as null becomes a
    masked-out row of length zero, and every other element is converted
    with ``as_column`` and appended to the shared child data column.

    Parameters
    ----------
    arbitrary : list of column-like
        Must be non-empty: ``arbitrary[0]`` is read unconditionally.

    Returns
    -------
    ListColumn
        Column with ``len(arbitrary)`` rows whose children are the
        int32 offsets column and the concatenated data column.
    """
    # The first row seeds the child column; it is not null-checked here.
    leaf_values = as_column(arbitrary[0])
    row_sizes = [len(leaf_values)]
    row_valid = [True]

    # Remaining rows: record validity and size, growing the child data.
    for item in arbitrary[1:]:
        if cudf._lib.scalar._is_null_host_scalar(item):
            row_valid.append(False)
            row_sizes.append(0)
            continue
        row_valid.append(True)
        leaf_values = leaf_values.append(as_column(item))
        row_sizes.append(len(item))

    # Offsets are the running sum of [0, size_0, size_1, ...].
    n_rows = len(arbitrary)
    offsets = cudf.core.column.column_empty(
        row_count=n_rows + 1, dtype="int32"
    )
    offsets[0] = 0
    offsets[1:] = row_sizes
    offsets = cast(
        cudf.core.column.NumericalColumn, offsets
    )._apply_scan_op("sum")

    # Assemble the list column from offsets, child data and validity.
    list_dtype = cudf.ListDtype(leaf_values.dtype)
    validity_mask = cudf._lib.transform.bools_to_mask(as_column(row_valid))
    return cudf.core.column.ListColumn(
        size=n_rows,
        dtype=list_dtype,
        mask=validity_mask,
        offset=0,
        null_count=0,
        children=(offsets, leaf_values),
    )
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __init__(
if isinstance(data, dict):
index = data.keys()
data = column.as_column(
data.values(), nan_as_null=nan_as_null, dtype=dtype
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

if data is None:
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from string import ascii_letters, digits

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1203,3 +1204,25 @@ def test_explode(data, ignore_index, p_index):
assert_eq(expect, got, check_dtype=False)
else:
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "data, expected",
    [
        # Two cudf.Series of different lengths.
        (
            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
            cudf.Series([[1, 2, 3], [10, 20]]),
        ),
        # A None entry between Series, plus a NaN inside the last one.
        (
            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
        ),
        # cupy arrays with a cudf.NA entry in the middle.
        (
            [cp.array([5, 6]), cudf.NA, cp.array([1])],
            cudf.Series([[5, 6], None, [1]]),
        ),
    ],
)
def test_nested_series_from_sequence_data(data, expected):
    """Check that ``cudf.Series`` built from ``data`` equals ``expected``."""
    assert_eq(cudf.Series(data), expected)
17 changes: 14 additions & 3 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
import dask
from dask import dataframe as dd

import cudf
from cudf.core._compat import PANDAS_GE_120

import dask_cudf
from dask_cudf.groupby import _is_supported

import cudf
from cudf.core._compat import PANDAS_GE_120


@pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"])
def test_groupby_basic_aggs(aggregation):
Expand Down Expand Up @@ -580,3 +580,14 @@ def test_groupby_agg_redirect(aggregations):
)
def test_is_supported(arg):
assert _is_supported(arg, {"supported"}) is False


def test_groupby_unique_lists():
    """Compare ``groupby("a").b.unique()`` between dask and dask_cudf."""
    pdf = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})

    # Same data, two partitions each, on the pandas and cudf backends.
    dask_frame = dd.from_pandas(pdf, 2)
    cudf_frame = cudf.from_pandas(pdf)
    dask_cudf_frame = dask_cudf.from_cudf(cudf_frame, 2)

    expected = dask_frame.groupby("a").b.unique().compute()
    got = dask_cudf_frame.groupby("a").b.unique().compute()
    dd.assert_eq(expected, got)