From c4e5e8cb65272c00bdedf6ee5d33744fc2725c53 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 12:21:54 -1000 Subject: [PATCH] Refactor cudf.Series.__init__ (#14450) I found a few issues related to reindexing and name priority when `index=` and `name=` are given, so added unit tests for those. Additionally added some typing in `from_pandas` methods The main approach here is to collect the potential column from the `if/elif` branchs first, call `super().__init__({name: columns}, index=index)`, and then apply the additional keywords to the result Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14450 --- python/cudf/cudf/core/_base_index.py | 4 +- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/core/series.py | 131 +++++++++++--------------- python/cudf/cudf/tests/test_series.py | 41 +++++++- 4 files changed, 97 insertions(+), 83 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index fcfe8a21f05..8d2506403d4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1836,7 +1836,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented @classmethod - def from_pandas(cls, index, nan_as_null=no_default): + def from_pandas(cls, index: pd.Index, nan_as_null=no_default): """ Convert from a Pandas Index. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4f98a878792..489f0e74dd6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1601,7 +1601,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: @classmethod @_cudf_nvtx_annotate - def from_pandas(cls, multiindex, nan_as_null=no_default): + def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): """ Convert from a Pandas MultiIndex diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3562c83e797..fcb4e77f6a5 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -57,7 +57,6 @@ TimeDeltaColumn, arange, as_column, - column, full, ) from cudf.core.column.categorical import ( @@ -202,7 +201,6 @@ def __getitem__(self, arg): @_cudf_nvtx_annotate def __setitem__(self, key, value): - from cudf.core.column import column if isinstance(key, tuple): key = list(key) @@ -264,7 +262,7 @@ def __setitem__(self, key, value): self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype) ) ): - value = column.as_column(value) + value = as_column(value) if ( ( @@ -568,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None): 4 14 dtype: int64 """ - col = column.as_column(data).set_mask(mask) + col = as_column(data).set_mask(mask) return cls(data=col) @_cudf_nvtx_annotate @@ -593,73 +591,33 @@ def __init__( "to silence this warning.", FutureWarning, ) - if isinstance(data, pd.Series): - if name is None: - name = data.name - if isinstance(data.index, pd.MultiIndex): - index = cudf.from_pandas(data.index) - else: - index = as_index(data.index) - elif isinstance(data, pd.Index): - if name is None: - name = data.name - data = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - elif isinstance(data, BaseIndex): - if name is None: - name = data.name - data = data._values - if dtype is not None: - data = data.astype(dtype) + index_from_data = None + name_from_data = None + if data is None: + data = {} + + if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)): + if copy: + data = data.copy(deep=True) + name_from_data = data.name + column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) + if isinstance(data, (pd.Series, Series)): + index_from_data = as_index(data.index) elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " "ColumnAccessor" ) - - if isinstance(data, Series): - if index is not None: - data = data.reindex(index) - else: - index = data._index - if name is None: - name = data.name - data = data._column - if copy: - data = data.copy(deep=True) - if dtype is not None: - data = data.astype(dtype) - - if isinstance(data, dict): + elif isinstance(data, dict): if not data: - current_index = RangeIndex(0) + column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) + index_from_data = RangeIndex(0) else: - current_index = data.keys() - if index is not None: - series = Series( - list(data.values()), - nan_as_null=nan_as_null, - dtype=dtype, - index=current_index, - ) - new_index = as_index(index) - if not series.index.equals(new_index): - series = series.reindex(new_index) - data = series._column - index = series._index - else: - data = column.as_column( + column = as_column( list(data.values()), nan_as_null=nan_as_null, dtype=dtype ) - index = current_index - if data is None: - if index is not None: - data = column.column_empty( - row_count=len(index), dtype=None, masked=True - ) - else: - data = {} - - if not isinstance(data, ColumnBase): + index_from_data = as_index(list(data.keys())) + else: # Using `getattr_static` to check if # `data` is on device memory and perform # a deep copy later. This is different @@ -677,25 +635,42 @@ def __init__( ) is property ) - data = column.as_column( + column = as_column( data, nan_as_null=nan_as_null, dtype=dtype, length=len(index) if index is not None else None, ) if copy and has_cai: - data = data.copy(deep=True) - else: - if dtype is not None: - data = data.astype(dtype) + column = column.copy(deep=True) - if index is not None and not isinstance(index, BaseIndex): - index = as_index(index) + assert isinstance(column, ColumnBase) + + if dtype is not None: + column = column.astype(dtype) - assert isinstance(data, ColumnBase) + if name_from_data is not None and name is None: + name = name_from_data - super().__init__({name: data}) - self._index = RangeIndex(len(data)) if index is None else index + if index is not None: + index = as_index(index) + + if index_from_data is not None: + first_index = index_from_data + second_index = index + elif index is None: + first_index = RangeIndex(len(column)) + second_index = None + else: + first_index = index + second_index = None + + super().__init__({name: column}, index=first_index) + if second_index is not None: + # TODO: This there a better way to do this? + reindexed = self.reindex(index=second_index, copy=False) + self._data = reindexed._data + self._index = second_index self._check_data_index_length_match() @classmethod @@ -717,7 +692,7 @@ def __contains__(self, item): @classmethod @_cudf_nvtx_annotate - def from_pandas(cls, s, nan_as_null=no_default): + def from_pandas(cls, s: pd.Series, nan_as_null=no_default): """ Convert from a Pandas Series. @@ -760,7 +735,7 @@ def from_pandas(cls, s, nan_as_null=no_default): False if cudf.get_option("mode.pandas_compatible") else None ) with warnings.catch_warnings(): - warnings.simplefilter("ignore") + warnings.simplefilter("ignore", FutureWarning) result = cls(s, nan_as_null=nan_as_null) return result @@ -5250,16 +5225,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): b = b.reindex(a.index) index = as_index(a.index) - a_col = column.as_column(a) + a_col = as_column(a) a_array = cupy.asarray(a_col.data_array_view(mode="read")) - b_col = column.as_column(b) + b_col = as_column(b) b_array = cupy.asarray(b_col.data_array_view(mode="read")) result = cupy.isclose( a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan ) - result_col = column.as_column(result) + result_col = as_column(result) if a_col.null_count and b_col.null_count: a_nulls = a_col.isnull() diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 39da34fa89c..248ac201e12 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import decimal import hashlib @@ -2683,3 +2683,42 @@ def test_series_duplicate_index_reindex(): lfunc_args_and_kwargs=([10, 11, 12, 13], {}), rfunc_args_and_kwargs=([10, 11, 12, 13], {}), ) + + +@pytest.mark.parametrize( + "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] +) +def test_series_from_named_object_name_priority(klass): + result = cudf.Series(klass([1], name="a"), name="b") + assert result.name == "b" + + +@pytest.mark.parametrize( + "data", + [ + {"a": 1, "b": 2, "c": 3}, + cudf.Series([1, 2, 3], index=list("abc")), + pd.Series([1, 2, 3], index=list("abc")), + ], +) +def test_series_from_object_with_index_index_arg_reindex(data): + result = cudf.Series(data, index=list("bca")) + expected = cudf.Series([2, 3, 1], index=list("bca")) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + {0: 1, 1: 2, 2: 3}, + cudf.Series([1, 2, 3]), + cudf.Index([1, 2, 3]), + pd.Series([1, 2, 3]), + pd.Index([1, 2, 3]), + [1, 2, 3], + ], +) +def test_series_dtype_astypes(data): + result = cudf.Series(data, dtype="float64") + expected = cudf.Series([1.0, 2.0, 3.0]) + assert_eq(result, expected)