Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix Series and DataFrame constructors to validate index lengths #13122

Merged
merged 6 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,6 @@ def __init__(

self._data = new_df._data
self._index = new_df._index
self._check_data_index_length_match()
elif len(data) > 0 and isinstance(data[0], Series):
self._init_from_series_list(
data=data, columns=columns, index=index
Expand All @@ -714,27 +713,19 @@ def __init__(
self._init_from_list_like(
data, index=index, columns=columns
)

self._check_data_index_length_match()
else:
if not is_dict_like(data):
raise TypeError("data must be list or dict-like")

self._init_from_dict_like(
data, index=index, columns=columns, nan_as_null=nan_as_null
)
self._check_data_index_length_match()

if dtype:
self._data = self.astype(dtype)._data

def _check_data_index_length_match(df: DataFrame) -> None:
# Validate that the number of rows in the data matches the index if the
# data is not empty. This is a helper for the constructor.
if df._data.nrows > 0 and df._data.nrows != len(df._index):
raise ValueError(
f"Shape of passed values is {df.shape}, indices imply "
f"({len(df._index)}, {df._num_columns})"
)

@_cudf_nvtx_annotate
def _init_from_series_list(self, data, columns, index):
if index is None:
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,15 @@ def _scan(self, op, axis=None, skipna=True):
results[name] = getattr(result_col, op)()
return self._from_data(results, self._index)

def _check_data_index_length_match(self: T) -> None:
# Validate that the number of rows in the data matches the index if the
# data is not empty. This is a helper for the constructor.
if self._data.nrows > 0 and self._data.nrows != len(self._index):
raise ValueError(
f"Length of values ({self._data.nrows}) does not "
f"match length of index ({len(self._index)})"
)

def copy(self: T, deep: bool = True) -> T:
"""Make a copy of this object's indices and data.

Expand Down
29 changes: 22 additions & 7 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
as null/NaN).

Operations between Series (`+`, `-`, `/`, `*`, `**`) align
values based on their associated index values-– they need
values based on their associated index values, they need
not be the same length. The result index will be the
sorted union of the two indexes.

Expand All @@ -360,7 +360,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
index : array-like or Index (1d)
Values must be hashable and have the same length
as data. Non-unique index values are allowed. Will
default to RangeIndex (0, 1, 2, , n) if not provided.
default to RangeIndex (0, 1, 2, ..., n) if not provided.
If both a dict and index sequence are used, the index will
override the keys found in the dict.

Expand Down Expand Up @@ -536,11 +536,24 @@ def __init__(
data = data.astype(dtype)

if isinstance(data, dict):
index = data.keys()
data = column.as_column(
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

current_index = data.keys()
if index is not None:
series = Series(
list(data.values()),
nan_as_null=nan_as_null,
dtype=dtype,
index=current_index,
)
new_index = as_index(index)
if not series.index.equals(new_index):
series = series.reindex(new_index)
data = series._column
index = series._index
vyasr marked this conversation as resolved.
Show resolved Hide resolved
else:
data = column.as_column(
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)
index = current_index
if data is None:
if index is not None:
data = column.column_empty(
Expand Down Expand Up @@ -571,6 +584,7 @@ def __init__(
data,
nan_as_null=nan_as_null,
dtype=dtype,
length=len(index) if index is not None else None,
)
if copy and has_cai:
data = data.copy(deep=True)
Expand All @@ -585,6 +599,7 @@ def __init__(

super().__init__({name: data})
self._index = RangeIndex(len(data)) if index is None else index
self._check_data_index_length_match()

@classmethod
@_cudf_nvtx_annotate
Expand Down
5 changes: 4 additions & 1 deletion python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import cudf
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf.api.types import is_scalar
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string
from cudf.core.udf.strings_typing import StringView, string_view, udf_string
Expand Down Expand Up @@ -371,7 +372,9 @@ def assert_column_memory_ne(

def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
# Wrapper around pd.Series using a float64 default dtype for empty data.
if dtype is None and (data is None or len(data) == 0):
if dtype is None and (
data is None or (not is_scalar(data) and len(data) == 0)
):
dtype = "float64"
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)

Expand Down
27 changes: 24 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10074,16 +10074,37 @@ def test_dataframe_init_from_scalar_and_lists(data):
assert_eq(expected, actual)


def test_dataframe_init_length_error():
@pytest.mark.parametrize(
"data,index",
[
({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None),
(
{
"a": [1, 2, 3],
"b": ["x", "y", "z"],
},
[10, 11],
),
(
{
"a": [1, 2, 3],
"b": ["x", "y", "z"],
},
[10, 11],
),
([[10, 11], [12, 13]], ["a", "b", "c"]),
],
)
def test_dataframe_init_length_error(data, index):
assert_exceptions_equal(
lfunc=pd.DataFrame,
rfunc=cudf.DataFrame,
lfunc_args_and_kwargs=(
[],
{"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
{"data": data, "index": index},
),
rfunc_args_and_kwargs=(
[],
{"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}},
{"data": data, "index": index},
),
)
41 changes: 41 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2144,3 +2144,44 @@ def test_series_copy(data, copy):

assert_eq(psr, gsr)
assert_eq(new_psr, new_gsr)


@pytest.mark.parametrize(
"data",
[
{"a": 1, "b": 2, "c": 24, "d": 1010},
{"a": 1},
],
)
@pytest.mark.parametrize(
"index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
)
def test_series_init_dict_with_index(data, index):
pandas_series = pd.Series(data, index=index)
cudf_series = cudf.Series(data, index=index)

assert_eq(pandas_series, cudf_series)


@pytest.mark.parametrize("data", ["abc", None, 1, 3.7])
@pytest.mark.parametrize(
"index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]]
)
def test_series_init_scalar_with_index(data, index):
pandas_series = _create_pandas_series(data, index=index)
cudf_series = cudf.Series(data, index=index)

assert_eq(
pandas_series,
cudf_series,
check_index_type=False if data is None and index is None else True,
)


def test_series_init_error():
assert_exceptions_equal(
lfunc=pd.Series,
rfunc=cudf.Series,
lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}),
)