Skip to content

Commit

Permalink
Refactor cudf.Series.__init__ (#14450)
Browse files Browse the repository at this point in the history
I found a few issues related to reindexing  and name priority when `index=` and `name=` are given, so added unit tests for those.

Additionally added some typing in `from_pandas` methods 

The main approach here is to collect the potential column from the `if/elif` branchs first, call `super().__init__({name: columns}, index=index)`, and then apply the additional keywords to the result

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #14450
  • Loading branch information
mroeschke authored Jan 8, 2024
1 parent 79d5070 commit c4e5e8c
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 83 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -1836,7 +1836,7 @@ def __array_function__(self, func, types, args, kwargs):
return NotImplemented

@classmethod
def from_pandas(cls, index, nan_as_null=no_default):
def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
"""
Convert from a Pandas Index.
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -1601,7 +1601,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:

@classmethod
@_cudf_nvtx_annotate
def from_pandas(cls, multiindex, nan_as_null=no_default):
def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
"""
Convert from a Pandas MultiIndex
Expand Down
131 changes: 53 additions & 78 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -57,7 +57,6 @@
TimeDeltaColumn,
arange,
as_column,
column,
full,
)
from cudf.core.column.categorical import (
Expand Down Expand Up @@ -202,7 +201,6 @@ def __getitem__(self, arg):

@_cudf_nvtx_annotate
def __setitem__(self, key, value):
from cudf.core.column import column

if isinstance(key, tuple):
key = list(key)
Expand Down Expand Up @@ -264,7 +262,7 @@ def __setitem__(self, key, value):
self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype)
)
):
value = column.as_column(value)
value = as_column(value)

if (
(
Expand Down Expand Up @@ -568,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None):
4 14
dtype: int64
"""
col = column.as_column(data).set_mask(mask)
col = as_column(data).set_mask(mask)
return cls(data=col)

@_cudf_nvtx_annotate
Expand All @@ -593,73 +591,33 @@ def __init__(
"to silence this warning.",
FutureWarning,
)
if isinstance(data, pd.Series):
if name is None:
name = data.name
if isinstance(data.index, pd.MultiIndex):
index = cudf.from_pandas(data.index)
else:
index = as_index(data.index)
elif isinstance(data, pd.Index):
if name is None:
name = data.name
data = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
elif isinstance(data, BaseIndex):
if name is None:
name = data.name
data = data._values
if dtype is not None:
data = data.astype(dtype)
index_from_data = None
name_from_data = None
if data is None:
data = {}

if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
if copy:
data = data.copy(deep=True)
name_from_data = data.name
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
if isinstance(data, (pd.Series, Series)):
index_from_data = as_index(data.index)
elif isinstance(data, ColumnAccessor):
raise TypeError(
"Use cudf.Series._from_data for constructing a Series from "
"ColumnAccessor"
)

if isinstance(data, Series):
if index is not None:
data = data.reindex(index)
else:
index = data._index
if name is None:
name = data.name
data = data._column
if copy:
data = data.copy(deep=True)
if dtype is not None:
data = data.astype(dtype)

if isinstance(data, dict):
elif isinstance(data, dict):
if not data:
current_index = RangeIndex(0)
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
index_from_data = RangeIndex(0)
else:
current_index = data.keys()
if index is not None:
series = Series(
list(data.values()),
nan_as_null=nan_as_null,
dtype=dtype,
index=current_index,
)
new_index = as_index(index)
if not series.index.equals(new_index):
series = series.reindex(new_index)
data = series._column
index = series._index
else:
data = column.as_column(
column = as_column(
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)
index = current_index
if data is None:
if index is not None:
data = column.column_empty(
row_count=len(index), dtype=None, masked=True
)
else:
data = {}

if not isinstance(data, ColumnBase):
index_from_data = as_index(list(data.keys()))
else:
# Using `getattr_static` to check if
# `data` is on device memory and perform
# a deep copy later. This is different
Expand All @@ -677,25 +635,42 @@ def __init__(
)
is property
)
data = column.as_column(
column = as_column(
data,
nan_as_null=nan_as_null,
dtype=dtype,
length=len(index) if index is not None else None,
)
if copy and has_cai:
data = data.copy(deep=True)
else:
if dtype is not None:
data = data.astype(dtype)
column = column.copy(deep=True)

if index is not None and not isinstance(index, BaseIndex):
index = as_index(index)
assert isinstance(column, ColumnBase)

if dtype is not None:
column = column.astype(dtype)

assert isinstance(data, ColumnBase)
if name_from_data is not None and name is None:
name = name_from_data

super().__init__({name: data})
self._index = RangeIndex(len(data)) if index is None else index
if index is not None:
index = as_index(index)

if index_from_data is not None:
first_index = index_from_data
second_index = index
elif index is None:
first_index = RangeIndex(len(column))
second_index = None
else:
first_index = index
second_index = None

super().__init__({name: column}, index=first_index)
if second_index is not None:
# TODO: This there a better way to do this?
reindexed = self.reindex(index=second_index, copy=False)
self._data = reindexed._data
self._index = second_index
self._check_data_index_length_match()

@classmethod
Expand All @@ -717,7 +692,7 @@ def __contains__(self, item):

@classmethod
@_cudf_nvtx_annotate
def from_pandas(cls, s, nan_as_null=no_default):
def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
"""
Convert from a Pandas Series.
Expand Down Expand Up @@ -760,7 +735,7 @@ def from_pandas(cls, s, nan_as_null=no_default):
False if cudf.get_option("mode.pandas_compatible") else None
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
warnings.simplefilter("ignore", FutureWarning)
result = cls(s, nan_as_null=nan_as_null)
return result

Expand Down Expand Up @@ -5250,16 +5225,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
b = b.reindex(a.index)
index = as_index(a.index)

a_col = column.as_column(a)
a_col = as_column(a)
a_array = cupy.asarray(a_col.data_array_view(mode="read"))

b_col = column.as_column(b)
b_col = as_column(b)
b_array = cupy.asarray(b_col.data_array_view(mode="read"))

result = cupy.isclose(
a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan
)
result_col = column.as_column(result)
result_col = as_column(result)

if a_col.null_count and b_col.null_count:
a_nulls = a_col.isnull()
Expand Down
41 changes: 40 additions & 1 deletion python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import decimal
import hashlib
Expand Down Expand Up @@ -2683,3 +2683,42 @@ def test_series_duplicate_index_reindex():
lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
)


@pytest.mark.parametrize(
"klass", [cudf.Series, cudf.Index, pd.Series, pd.Index]
)
def test_series_from_named_object_name_priority(klass):
result = cudf.Series(klass([1], name="a"), name="b")
assert result.name == "b"


@pytest.mark.parametrize(
"data",
[
{"a": 1, "b": 2, "c": 3},
cudf.Series([1, 2, 3], index=list("abc")),
pd.Series([1, 2, 3], index=list("abc")),
],
)
def test_series_from_object_with_index_index_arg_reindex(data):
result = cudf.Series(data, index=list("bca"))
expected = cudf.Series([2, 3, 1], index=list("bca"))
assert_eq(result, expected)


@pytest.mark.parametrize(
"data",
[
{0: 1, 1: 2, 2: 3},
cudf.Series([1, 2, 3]),
cudf.Index([1, 2, 3]),
pd.Series([1, 2, 3]),
pd.Index([1, 2, 3]),
[1, 2, 3],
],
)
def test_series_dtype_astypes(data):
result = cudf.Series(data, dtype="float64")
expected = cudf.Series([1.0, 2.0, 3.0])
assert_eq(result, expected)

0 comments on commit c4e5e8c

Please sign in to comment.