Refactor cudf.Series.__init__ (#14450)

I found a few issues related to reindexing and name priority when `index=` and `name=` are given, so added unit tests for those. Additionally added some typing in `from_pandas` methods The main approach here is to collect the potential column from the `if/elif` branchs first, call `super().__init__({name: columns}, index=index)`, and then apply the additional keywords to the result Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: #14450
rapidsai · Jan 8, 2024 · c4e5e8c · c4e5e8c
1 parent 79d5070
commit c4e5e8c
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 83 deletions.
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1836,7 +1836,7 @@ def __array_function__(self, func, types, args, kwargs):
             return NotImplemented
 
     @classmethod
-    def from_pandas(cls, index, nan_as_null=no_default):
+    def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
         """
         Convert from a Pandas Index.
 

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1601,7 +1601,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_pandas(cls, multiindex, nan_as_null=no_default):
+    def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         """
         Convert from a Pandas MultiIndex
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -57,7 +57,6 @@
     TimeDeltaColumn,
     arange,
     as_column,
-    column,
     full,
 )
 from cudf.core.column.categorical import (
@@ -202,7 +201,6 @@ def __getitem__(self, arg):
 
     @_cudf_nvtx_annotate
     def __setitem__(self, key, value):
-        from cudf.core.column import column
 
         if isinstance(key, tuple):
             key = list(key)
@@ -264,7 +262,7 @@ def __setitem__(self, key, value):
                 self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype)
             )
         ):
-            value = column.as_column(value)
+            value = as_column(value)
 
         if (
             (
@@ -568,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None):
         4      14
         dtype: int64
         """
-        col = column.as_column(data).set_mask(mask)
+        col = as_column(data).set_mask(mask)
         return cls(data=col)
 
     @_cudf_nvtx_annotate
@@ -593,73 +591,33 @@ def __init__(
                 "to silence this warning.",
                 FutureWarning,
             )
-        if isinstance(data, pd.Series):
-            if name is None:
-                name = data.name
-            if isinstance(data.index, pd.MultiIndex):
-                index = cudf.from_pandas(data.index)
-            else:
-                index = as_index(data.index)
-        elif isinstance(data, pd.Index):
-            if name is None:
-                name = data.name
-            data = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
-        elif isinstance(data, BaseIndex):
-            if name is None:
-                name = data.name
-            data = data._values
-            if dtype is not None:
-                data = data.astype(dtype)
+        index_from_data = None
+        name_from_data = None
+        if data is None:
+            data = {}
+
+        if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
+            if copy:
+                data = data.copy(deep=True)
+            name_from_data = data.name
+            column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
+            if isinstance(data, (pd.Series, Series)):
+                index_from_data = as_index(data.index)
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
                 "ColumnAccessor"
             )
-
-        if isinstance(data, Series):
-            if index is not None:
-                data = data.reindex(index)
-            else:
-                index = data._index
-            if name is None:
-                name = data.name
-            data = data._column
-            if copy:
-                data = data.copy(deep=True)
-            if dtype is not None:
-                data = data.astype(dtype)
-
-        if isinstance(data, dict):
+        elif isinstance(data, dict):
             if not data:
-                current_index = RangeIndex(0)
+                column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
+                index_from_data = RangeIndex(0)
             else:
-                current_index = data.keys()
-            if index is not None:
-                series = Series(
-                    list(data.values()),
-                    nan_as_null=nan_as_null,
-                    dtype=dtype,
-                    index=current_index,
-                )
-                new_index = as_index(index)
-                if not series.index.equals(new_index):
-                    series = series.reindex(new_index)
-                data = series._column
-                index = series._index
-            else:
-                data = column.as_column(
+                column = as_column(
                     list(data.values()), nan_as_null=nan_as_null, dtype=dtype
                 )
-                index = current_index
-        if data is None:
-            if index is not None:
-                data = column.column_empty(
-                    row_count=len(index), dtype=None, masked=True
-                )
-            else:
-                data = {}
-
-        if not isinstance(data, ColumnBase):
+                index_from_data = as_index(list(data.keys()))
+        else:
             # Using `getattr_static` to check if
             # `data` is on device memory and perform
             # a deep copy later. This is different
@@ -677,25 +635,42 @@ def __init__(
                 )
                 is property
             )
-            data = column.as_column(
+            column = as_column(
                 data,
                 nan_as_null=nan_as_null,
                 dtype=dtype,
                 length=len(index) if index is not None else None,
             )
             if copy and has_cai:
-                data = data.copy(deep=True)
-        else:
-            if dtype is not None:
-                data = data.astype(dtype)
+                column = column.copy(deep=True)
 
-        if index is not None and not isinstance(index, BaseIndex):
-            index = as_index(index)
+        assert isinstance(column, ColumnBase)
+
+        if dtype is not None:
+            column = column.astype(dtype)
 
-        assert isinstance(data, ColumnBase)
+        if name_from_data is not None and name is None:
+            name = name_from_data
 
-        super().__init__({name: data})
-        self._index = RangeIndex(len(data)) if index is None else index
+        if index is not None:
+            index = as_index(index)
+
+        if index_from_data is not None:
+            first_index = index_from_data
+            second_index = index
+        elif index is None:
+            first_index = RangeIndex(len(column))
+            second_index = None
+        else:
+            first_index = index
+            second_index = None
+
+        super().__init__({name: column}, index=first_index)
+        if second_index is not None:
+            # TODO: This there a better way to do this?
+            reindexed = self.reindex(index=second_index, copy=False)
+            self._data = reindexed._data
+            self._index = second_index
         self._check_data_index_length_match()
 
     @classmethod
@@ -717,7 +692,7 @@ def __contains__(self, item):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_pandas(cls, s, nan_as_null=no_default):
+    def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         """
         Convert from a Pandas Series.
 
@@ -760,7 +735,7 @@ def from_pandas(cls, s, nan_as_null=no_default):
                 False if cudf.get_option("mode.pandas_compatible") else None
             )
         with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
+            warnings.simplefilter("ignore", FutureWarning)
             result = cls(s, nan_as_null=nan_as_null)
         return result
 
@@ -5250,16 +5225,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
         b = b.reindex(a.index)
         index = as_index(a.index)
 
-    a_col = column.as_column(a)
+    a_col = as_column(a)
     a_array = cupy.asarray(a_col.data_array_view(mode="read"))
 
-    b_col = column.as_column(b)
+    b_col = as_column(b)
     b_array = cupy.asarray(b_col.data_array_view(mode="read"))
 
     result = cupy.isclose(
         a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan
     )
-    result_col = column.as_column(result)
+    result_col = as_column(result)
 
     if a_col.null_count and b_col.null_count:
         a_nulls = a_col.isnull()

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import decimal
 import hashlib
@@ -2683,3 +2683,42 @@ def test_series_duplicate_index_reindex():
         lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
         rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
     )
+
+
+@pytest.mark.parametrize(
+    "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index]
+)
+def test_series_from_named_object_name_priority(klass):
+    result = cudf.Series(klass([1], name="a"), name="b")
+    assert result.name == "b"
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"a": 1, "b": 2, "c": 3},
+        cudf.Series([1, 2, 3], index=list("abc")),
+        pd.Series([1, 2, 3], index=list("abc")),
+    ],
+)
+def test_series_from_object_with_index_index_arg_reindex(data):
+    result = cudf.Series(data, index=list("bca"))
+    expected = cudf.Series([2, 3, 1], index=list("bca"))
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {0: 1, 1: 2, 2: 3},
+        cudf.Series([1, 2, 3]),
+        cudf.Index([1, 2, 3]),
+        pd.Series([1, 2, 3]),
+        pd.Index([1, 2, 3]),
+        [1, 2, 3],
+    ],
+)
+def test_series_dtype_astypes(data):
+    result = cudf.Series(data, dtype="float64")
+    expected = cudf.Series([1.0, 2.0, 3.0])
+    assert_eq(result, expected)