Skip to content

Commit

Permalink
Fix Series and DataFrame constructors to validate index lengths (#…
Browse files Browse the repository at this point in the history
…13122)

Fixes: #12999, #13056

This PR fixes the `Series` and `DataFrame` constructors to validate the `data` & `index` lengths. This also contains fixes where `index` was being ignored in certain cases.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #13122
  • Loading branch information
galipremsagar authored Apr 14, 2023
1 parent 4f0c46e commit 2d70331
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 22 deletions.
13 changes: 2 additions & 11 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,6 @@ def __init__(

self._data = new_df._data
self._index = new_df._index
self._check_data_index_length_match()
elif len(data) > 0 and isinstance(data[0], Series):
self._init_from_series_list(
data=data, columns=columns, index=index
Expand All @@ -714,27 +713,19 @@ def __init__(
self._init_from_list_like(
data, index=index, columns=columns
)

self._check_data_index_length_match()
else:
if not is_dict_like(data):
raise TypeError("data must be list or dict-like")

self._init_from_dict_like(
data, index=index, columns=columns, nan_as_null=nan_as_null
)
self._check_data_index_length_match()

if dtype:
self._data = self.astype(dtype)._data

def _check_data_index_length_match(df: DataFrame) -> None:
# Validate that the number of rows in the data matches the index if the
# data is not empty. This is a helper for the constructor.
if df._data.nrows > 0 and df._data.nrows != len(df._index):
raise ValueError(
f"Shape of passed values is {df.shape}, indices imply "
f"({len(df._index)}, {df._num_columns})"
)

@_cudf_nvtx_annotate
def _init_from_series_list(self, data, columns, index):
if index is None:
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,15 @@ def _scan(self, op, axis=None, skipna=True):
results[name] = getattr(result_col, op)()
return self._from_data(results, self._index)

def _check_data_index_length_match(self: T) -> None:
# Validate that the number of rows in the data matches the index if the
# data is not empty. This is a helper for the constructor.
if self._data.nrows > 0 and self._data.nrows != len(self._index):
raise ValueError(
f"Length of values ({self._data.nrows}) does not "
f"match length of index ({len(self._index)})"
)

def copy(self: T, deep: bool = True) -> T:
"""Make a copy of this object's indices and data.
Expand Down
29 changes: 22 additions & 7 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
as null/NaN).
Operations between Series (`+`, `-`, `/`, `*`, `**`) align
values based on their associated index values-– they need
values based on their associated index values, they need
not be the same length. The result index will be the
sorted union of the two indexes.
Expand All @@ -360,7 +360,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
index : array-like or Index (1d)
Values must be hashable and have the same length
as data. Non-unique index values are allowed. Will
default to RangeIndex (0, 1, 2, , n) if not provided.
default to RangeIndex (0, 1, 2, ..., n) if not provided.
If both a dict and index sequence are used, the index will
override the keys found in the dict.
Expand Down Expand Up @@ -536,11 +536,24 @@ def __init__(
data = data.astype(dtype)

if isinstance(data, dict):
index = data.keys()
data = column.as_column(
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

current_index = data.keys()
if index is not None:
series = Series(
list(data.values()),
nan_as_null=nan_as_null,
dtype=dtype,
index=current_index,
)
new_index = as_index(index)
if not series.index.equals(new_index):
series = series.reindex(new_index)
data = series._column
index = series._index
else:
data = column.as_column(
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)
index = current_index
if data is None:
if index is not None:
data = column.column_empty(
Expand Down Expand Up @@ -571,6 +584,7 @@ def __init__(
data,
nan_as_null=nan_as_null,
dtype=dtype,
length=len(index) if index is not None else None,
)
if copy and has_cai:
data = data.copy(deep=True)
Expand All @@ -585,6 +599,7 @@ def __init__(

super().__init__({name: data})
self._index = RangeIndex(len(data)) if index is None else index
self._check_data_index_length_match()

@classmethod
@_cudf_nvtx_annotate
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import cudf
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf.api.types import is_scalar
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string
from cudf.core.udf.strings_typing import StringView, string_view, udf_string
Expand Down Expand Up @@ -371,7 +372,9 @@ def assert_column_memory_ne(

def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
# Wrapper around pd.Series using a float64 default dtype for empty data.
if dtype is None and (data is None or len(data) == 0):
if dtype is None and (
data is None or (not is_scalar(data) and len(data) == 0)
):
dtype = "float64"
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)

Expand Down
27 changes: 24 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10074,16 +10074,37 @@ def test_dataframe_init_from_scalar_and_lists(data):
assert_eq(expected, actual)


def test_dataframe_init_length_error():
@pytest.mark.parametrize(
"data,index",
[
({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None),
(
{
"a": [1, 2, 3],
"b": ["x", "y", "z"],
},
[10, 11],
),
(
{
"a": [1, 2, 3],
"b": ["x", "y", "z"],
},
[10, 11],
),
([[10, 11], [12, 13]], ["a", "b", "c"]),
],
)
def test_dataframe_init_length_error(data, index):
assert_exceptions_equal(
lfunc=pd.DataFrame,
rfunc=cudf.DataFrame,
lfunc_args_and_kwargs=(
[],
{"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
{"data": data, "index": index},
),
rfunc_args_and_kwargs=(
[],
{"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
{"data": data, "index": index},
),
)
41 changes: 41 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2144,3 +2144,44 @@ def test_series_copy(data, copy):

assert_eq(psr, gsr)
assert_eq(new_psr, new_gsr)


@pytest.mark.parametrize(
"data",
[
{"a": 1, "b": 2, "c": 24, "d": 1010},
{"a": 1},
],
)
@pytest.mark.parametrize(
"index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
)
def test_series_init_dict_with_index(data, index):
pandas_series = pd.Series(data, index=index)
cudf_series = cudf.Series(data, index=index)

assert_eq(pandas_series, cudf_series)


@pytest.mark.parametrize("data", ["abc", None, 1, 3.7])
@pytest.mark.parametrize(
"index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
)
def test_series_init_scalar_with_index(data, index):
pandas_series = _create_pandas_series(data, index=index)
cudf_series = cudf.Series(data, index=index)

assert_eq(
pandas_series,
cudf_series,
check_index_type=False if data is None and index is None else True,
)


def test_series_init_error():
assert_exceptions_equal(
lfunc=pd.Series,
rfunc=cudf.Series,
lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
)

0 comments on commit 2d70331

Please sign in to comment.