Fix Series and DataFrame constructors to validate index lengths (#…

…13122) Fixes: #12999, #13056 This PR fixes the `Series` and `DataFrame` constructors to validate the `data` & `index` lengths. This also contains fixes where `index` was being ignored in certain cases. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #13122
rapidsai · Apr 14, 2023 · 2d70331 · 2d70331
1 parent 4f0c46e
commit 2d70331
Show file tree

Hide file tree

Showing 6 changed files with 102 additions and 22 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -705,7 +705,6 @@ def __init__(
 
                     self._data = new_df._data
                     self._index = new_df._index
-                    self._check_data_index_length_match()
                 elif len(data) > 0 and isinstance(data[0], Series):
                     self._init_from_series_list(
                         data=data, columns=columns, index=index
@@ -714,27 +713,19 @@ def __init__(
                     self._init_from_list_like(
                         data, index=index, columns=columns
                     )
-
+                self._check_data_index_length_match()
             else:
                 if not is_dict_like(data):
                     raise TypeError("data must be list or dict-like")
 
                 self._init_from_dict_like(
                     data, index=index, columns=columns, nan_as_null=nan_as_null
                 )
+                self._check_data_index_length_match()
 
         if dtype:
             self._data = self.astype(dtype)._data
 
-    def _check_data_index_length_match(df: DataFrame) -> None:
-        # Validate that the number of rows in the data matches the index if the
-        # data is not empty. This is a helper for the constructor.
-        if df._data.nrows > 0 and df._data.nrows != len(df._index):
-            raise ValueError(
-                f"Shape of passed values is {df.shape}, indices imply "
-                f"({len(df._index)}, {df._num_columns})"
-            )
-
     @_cudf_nvtx_annotate
     def _init_from_series_list(self, data, columns, index):
         if index is None:

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -441,6 +441,15 @@ def _scan(self, op, axis=None, skipna=True):
             results[name] = getattr(result_col, op)()
         return self._from_data(results, self._index)
 
+    def _check_data_index_length_match(self: T) -> None:
+        # Validate that the number of rows in the data matches the index if the
+        # data is not empty. This is a helper for the constructor.
+        if self._data.nrows > 0 and self._data.nrows != len(self._index):
+            raise ValueError(
+                f"Length of values ({self._data.nrows}) does not "
+                f"match length of index ({len(self._index)})"
+            )
+
     def copy(self: T, deep: bool = True) -> T:
         """Make a copy of this object's indices and data.
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -346,7 +346,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
     as null/NaN).
 
     Operations between Series (`+`, `-`, `/`, `*`, `**`) align
-    values based on their associated index values-– they need
+    values based on their associated index values, they need
     not be the same length. The result index will be the
     sorted union of the two indexes.
 
@@ -360,7 +360,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
     index : array-like or Index (1d)
         Values must be hashable and have the same length
         as data. Non-unique index values are allowed. Will
-        default to RangeIndex (0, 1, 2, …, n) if not provided.
+        default to RangeIndex (0, 1, 2, ..., n) if not provided.
         If both a dict and index sequence are used, the index will
         override the keys found in the dict.
 
@@ -536,11 +536,24 @@ def __init__(
                 data = data.astype(dtype)
 
         if isinstance(data, dict):
-            index = data.keys()
-            data = column.as_column(
-                list(data.values()), nan_as_null=nan_as_null, dtype=dtype
-            )
-
+            current_index = data.keys()
+            if index is not None:
+                series = Series(
+                    list(data.values()),
+                    nan_as_null=nan_as_null,
+                    dtype=dtype,
+                    index=current_index,
+                )
+                new_index = as_index(index)
+                if not series.index.equals(new_index):
+                    series = series.reindex(new_index)
+                data = series._column
+                index = series._index
+            else:
+                data = column.as_column(
+                    list(data.values()), nan_as_null=nan_as_null, dtype=dtype
+                )
+                index = current_index
         if data is None:
             if index is not None:
                 data = column.column_empty(
@@ -571,6 +584,7 @@ def __init__(
                 data,
                 nan_as_null=nan_as_null,
                 dtype=dtype,
+                length=len(index) if index is not None else None,
             )
             if copy and has_cai:
                 data = data.copy(deep=True)
@@ -585,6 +599,7 @@ def __init__(
 
         super().__init__({name: data})
         self._index = RangeIndex(len(data)) if index is None else index
+        self._check_data_index_length_match()
 
     @classmethod
     @_cudf_nvtx_annotate

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
@@ -19,6 +19,7 @@
 
 import cudf
 from cudf._lib.null_mask import bitmask_allocation_size_bytes
+from cudf.api.types import is_scalar
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string
 from cudf.core.udf.strings_typing import StringView, string_view, udf_string
@@ -371,7 +372,9 @@ def assert_column_memory_ne(
 
 def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
     # Wrapper around pd.Series using a float64 default dtype for empty data.
-    if dtype is None and (data is None or len(data) == 0):
+    if dtype is None and (
+        data is None or (not is_scalar(data) and len(data) == 0)
+    ):
         dtype = "float64"
     return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)
 

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -10074,16 +10074,37 @@ def test_dataframe_init_from_scalar_and_lists(data):
     assert_eq(expected, actual)
 
 
-def test_dataframe_init_length_error():
+@pytest.mark.parametrize(
+    "data,index",
+    [
+        ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None),
+        (
+            {
+                "a": [1, 2, 3],
+                "b": ["x", "y", "z"],
+            },
+            [10, 11],
+        ),
+        (
+            {
+                "a": [1, 2, 3],
+                "b": ["x", "y", "z"],
+            },
+            [10, 11],
+        ),
+        ([[10, 11], [12, 13]], ["a", "b", "c"]),
+    ],
+)
+def test_dataframe_init_length_error(data, index):
     assert_exceptions_equal(
         lfunc=pd.DataFrame,
         rfunc=cudf.DataFrame,
         lfunc_args_and_kwargs=(
             [],
-            {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
+            {"data": data, "index": index},
         ),
         rfunc_args_and_kwargs=(
             [],
-            {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
+            {"data": data, "index": index},
         ),
     )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -2144,3 +2144,44 @@ def test_series_copy(data, copy):
 
     assert_eq(psr, gsr)
     assert_eq(new_psr, new_gsr)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"a": 1, "b": 2, "c": 24, "d": 1010},
+        {"a": 1},
+    ],
+)
+@pytest.mark.parametrize(
+    "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
+)
+def test_series_init_dict_with_index(data, index):
+    pandas_series = pd.Series(data, index=index)
+    cudf_series = cudf.Series(data, index=index)
+
+    assert_eq(pandas_series, cudf_series)
+
+
+@pytest.mark.parametrize("data", ["abc", None, 1, 3.7])
+@pytest.mark.parametrize(
+    "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
+)
+def test_series_init_scalar_with_index(data, index):
+    pandas_series = _create_pandas_series(data, index=index)
+    cudf_series = cudf.Series(data, index=index)
+
+    assert_eq(
+        pandas_series,
+        cudf_series,
+        check_index_type=False if data is None and index is None else True,
+    )
+
+
+def test_series_init_error():
+    assert_exceptions_equal(
+        lfunc=pd.Series,
+        rfunc=cudf.Series,
+        lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
+        rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
+    )