diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9d14d4bde7f..c6aba0d360a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -705,7 +705,6 @@ def __init__( self._data = new_df._data self._index = new_df._index - self._check_data_index_length_match() elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -714,7 +713,7 @@ def __init__( self._init_from_list_like( data, index=index, columns=columns ) - + self._check_data_index_length_match() else: if not is_dict_like(data): raise TypeError("data must be list or dict-like") @@ -722,19 +721,11 @@ def __init__( self._init_from_dict_like( data, index=index, columns=columns, nan_as_null=nan_as_null ) + self._check_data_index_length_match() if dtype: self._data = self.astype(dtype)._data - def _check_data_index_length_match(df: DataFrame) -> None: - # Validate that the number of rows in the data matches the index if the - # data is not empty. This is a helper for the constructor. - if df._data.nrows > 0 and df._data.nrows != len(df._index): - raise ValueError( - f"Shape of passed values is {df.shape}, indices imply " - f"({len(df._index)}, {df._num_columns})" - ) - @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2992cb005e5..9ce7c1ac1d9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -441,6 +441,15 @@ def _scan(self, op, axis=None, skipna=True): results[name] = getattr(result_col, op)() return self._from_data(results, self._index) + def _check_data_index_length_match(self: T) -> None: + # Validate that the number of rows in the data matches the index if the + # data is not empty. This is a helper for the constructor. + if self._data.nrows > 0 and self._data.nrows != len(self._index): + raise ValueError( + f"Length of values ({self._data.nrows}) does not " + f"match length of index ({len(self._index)})" + ) + def copy(self: T, deep: bool = True) -> T: """Make a copy of this object's indices and data. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cf6a4b62726..9e07d135926 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -346,7 +346,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): as null/NaN). Operations between Series (`+`, `-`, `/`, `*`, `**`) align - values based on their associated index values-– they need + values based on their associated index values, they need not be the same length. The result index will be the sorted union of the two indexes. @@ -360,7 +360,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): index : array-like or Index (1d) Values must be hashable and have the same length as data. Non-unique index values are allowed. Will - default to RangeIndex (0, 1, 2, …, n) if not provided. + default to RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index sequence are used, the index will override the keys found in the dict. @@ -536,11 +536,24 @@ def __init__( data = data.astype(dtype) if isinstance(data, dict): - index = data.keys() - data = column.as_column( - list(data.values()), nan_as_null=nan_as_null, dtype=dtype - ) - + current_index = data.keys() + if index is not None: + series = Series( + list(data.values()), + nan_as_null=nan_as_null, + dtype=dtype, + index=current_index, + ) + new_index = as_index(index) + if not series.index.equals(new_index): + series = series.reindex(new_index) + data = series._column + index = series._index + else: + data = column.as_column( + list(data.values()), nan_as_null=nan_as_null, dtype=dtype + ) + index = current_index if data is None: if index is not None: data = column.column_empty( @@ -571,6 +584,7 @@ def __init__( data, nan_as_null=nan_as_null, dtype=dtype, + length=len(index) if index is not None else None, ) if copy and has_cai: data = data.copy(deep=True) @@ -585,6 +599,7 @@ def __init__( super().__init__({name: data}) self._index = RangeIndex(len(data)) if index is None else index + self._check_data_index_length_match() @classmethod @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 24a0b69a48b..0489329d801 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -19,6 +19,7 @@ import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes +from cudf.api.types import is_scalar from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -371,7 +372,9 @@ def assert_column_memory_ne( def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): # Wrapper around pd.Series using a float64 default dtype for empty data. - if dtype is None and (data is None or len(data) == 0): + if dtype is None and ( + data is None or (not is_scalar(data) and len(data) == 0) + ): dtype = "float64" return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 609f5eb488b..70b7f74017c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10074,16 +10074,37 @@ def test_dataframe_init_from_scalar_and_lists(data): assert_eq(expected, actual) -def test_dataframe_init_length_error(): +@pytest.mark.parametrize( + "data,index", + [ + ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ([[10, 11], [12, 13]], ["a", "b", "c"]), + ], +) +def test_dataframe_init_length_error(data, index): assert_exceptions_equal( lfunc=pd.DataFrame, rfunc=cudf.DataFrame, lfunc_args_and_kwargs=( [], - {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}}, + {"data": data, "index": index}, ), rfunc_args_and_kwargs=( [], - {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}}, + {"data": data, "index": index}, ), ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 7123069d5b8..3bb669dede6 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2144,3 +2144,44 @@ def test_series_copy(data, copy): assert_eq(psr, gsr) assert_eq(new_psr, new_gsr) + + +@pytest.mark.parametrize( + "data", + [ + {"a": 1, "b": 2, "c": 24, "d": 1010}, + {"a": 1}, + ], +) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) +def test_series_init_dict_with_index(data, index): + pandas_series = pd.Series(data, index=index) + cudf_series = cudf.Series(data, index=index) + + assert_eq(pandas_series, cudf_series) + + +@pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) +def test_series_init_scalar_with_index(data, index): + pandas_series = _create_pandas_series(data, index=index) + cudf_series = cudf.Series(data, index=index) + + assert_eq( + pandas_series, + cudf_series, + check_index_type=False if data is None and index is None else True, + ) + + +def test_series_init_error(): + assert_exceptions_equal( + lfunc=pd.Series, + rfunc=cudf.Series, + lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), + rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), + )