Skip to content

Commit

Permalink
Fix type of empty Index and raise warning in Series constructor (#…
Browse files Browse the repository at this point in the history
…14116)

Fixes: #14091 
This PR fixes the dtype of an `Index` constructed from empty input so that it defaults to `str` instead of `float64`. It also adds a deprecation warning to the `Series` constructor for empty input without an explicit dtype, to match pandas.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #14116
  • Loading branch information
galipremsagar authored Sep 20, 2023
1 parent 7b0693f commit f7ca051
Show file tree
Hide file tree
Showing 12 changed files with 148 additions and 63 deletions.
21 changes: 14 additions & 7 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import Index, RangeIndex
from cudf.core.index import RangeIndex, as_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.core.series import Series
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column


def factorize(
Expand Down Expand Up @@ -95,7 +96,13 @@ def factorize(

return_cupy_array = isinstance(values, cp.ndarray)

values = Series(values)
if not can_convert_to_column(values):
raise TypeError(
"'values' can only be a Series, Index, or CuPy array, "
f"got {type(values)}"
)

values = as_column(values)

if na_sentinel is None:
na_sentinel = (
Expand Down Expand Up @@ -128,22 +135,22 @@ def factorize(
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel is None or use_na_sentinel:
cats = values._column.dropna()
cats = values.dropna()
else:
cats = values._column
cats = values

cats = cats.unique().astype(values.dtype)

if sort:
cats = cats.sort_values()

labels = values._column._label_encoding(
labels = values._label_encoding(
cats=cats,
na_sentinel=Scalar(na_sentinel),
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else Index(cats)
return labels, cats.values if return_cupy_array else as_index(cats)


def _linear_interpolation(column, index=None):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5607,7 +5607,7 @@ def quantile(
result.name = q
return result

result.index = list(map(float, qs))
result.index = cudf.Index(list(map(float, qs)), dtype="float64")
return result

@_cudf_nvtx_annotate
Expand Down
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
List,
MutableMapping,
Optional,
Sequence,
Tuple,
Type,
Union,
Expand Down Expand Up @@ -3467,14 +3468,23 @@ def __new__(
"tupleize_cols != True is not yet supported"
)

return as_index(
res = as_index(
data,
copy=copy,
dtype=dtype,
name=name,
nan_as_null=nan_as_null,
**kwargs,
)
if (
isinstance(data, Sequence)
and not isinstance(data, range)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
return res.astype("str")
return res

@classmethod
@_cudf_nvtx_annotate
Expand Down
32 changes: 29 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@
import warnings
from collections import abc
from shutil import get_terminal_size
from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union
from typing import (
Any,
Dict,
MutableMapping,
Optional,
Sequence,
Set,
Tuple,
Union,
)

import cupy
import numpy as np
Expand Down Expand Up @@ -500,6 +509,18 @@ def __init__(
copy=False,
nan_as_null=True,
):
if (
isinstance(data, Sequence)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
warnings.warn(
"The default dtype for empty Series will be 'object' instead "
"of 'float64' in a future version. Specify a dtype explicitly "
"to silence this warning.",
FutureWarning,
)
if isinstance(data, pd.Series):
if name is None:
name = data.name
Expand Down Expand Up @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None):
3 NaN
dtype: float64
"""
return cls(s, nan_as_null=nan_as_null)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
result = cls(s, nan_as_null=nan_as_null)
return result

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -2642,7 +2666,9 @@ def mode(self, dropna=True):
if len(val_counts) > 0:
val_counts = val_counts[val_counts == val_counts.iloc[0]]

return Series(val_counts.index.sort_values(), name=self.name)
return Series._from_data(
{self.name: val_counts.index.sort_values()}, name=self.name
)

@_cudf_nvtx_annotate
def round(self, decimals=0, how="half_even"):
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,15 +397,32 @@ def assert_column_memory_ne(
raise AssertionError("lhs and rhs holds the same memory.")


def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
# Wrapper around pd.Series using a float64 default dtype for empty data.
def _create_pandas_series_float64_default(
data=None, index=None, dtype=None, *args, **kwargs
):
# Wrapper around pd.Series using a float64
# default dtype for empty data to silence warnings.
# TODO: Remove this in pandas-2.0 upgrade
if dtype is None and (
data is None or (not is_scalar(data) and len(data) == 0)
):
dtype = "float64"
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)


def _create_cudf_series_float64_default(
    data=None, index=None, dtype=None, *args, **kwargs
):
    """Build a ``cudf.Series``, defaulting empty input to ``float64``.

    When ``dtype`` is not given and ``data`` is ``None`` or an empty
    non-scalar, the dtype is pinned to ``"float64"`` to silence the
    empty-Series default-dtype warning. Any other input is passed through
    unchanged.

    Extra positional and keyword arguments are forwarded to ``cudf.Series``.
    """
    # TODO: Remove this in pandas-2.0 upgrade
    if dtype is None and (
        data is None or (not is_scalar(data) and len(data) == 0)
    ):
        dtype = "float64"
    # Forward the leading arguments positionally: combining ``data=data``
    # with ``*args`` would put the extras in the ``data`` slot and raise
    # "got multiple values for argument 'data'".
    return cudf.Series(data, index, dtype, *args, **kwargs)


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
Expand Down
19 changes: 10 additions & 9 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ALL_TYPES,
DATETIME_TYPES,
NUMERIC_TYPES,
_create_cudf_series_float64_default,
assert_eq,
assert_exceptions_equal,
assert_neq,
Expand Down Expand Up @@ -2000,8 +2001,8 @@ def test_series_shape():


def test_series_shape_empty():
ps = pd.Series(dtype="float64")
cs = cudf.Series([])
ps = pd.Series([], dtype="float64")
cs = cudf.Series([], dtype="float64")

assert ps.shape == cs.shape

Expand Down Expand Up @@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type):
@pytest.mark.parametrize("num_elements", [0, 2, 10, 100])
def test_series_all_valid_nan(num_elements):
data = [np.nan] * num_elements
sr = cudf.Series(data, nan_as_null=False)
sr = _create_cudf_series_float64_default(data, nan_as_null=False)
np.testing.assert_equal(sr.null_count, 0)


Expand Down Expand Up @@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe():


def test_as_column_types():
col = column.as_column(cudf.Series([]))
col = column.as_column(cudf.Series([], dtype="float64"))
assert_eq(col.dtype, np.dtype("float64"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float64"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="float32")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32")
assert_eq(col.dtype, np.dtype("float32"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float32"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="str")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="str")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="str"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="object")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="object")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="object"))
Expand Down Expand Up @@ -4469,7 +4470,7 @@ def test_create_dataframe_column():
)
def test_series_values_host_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series_float64_default(data)

np.testing.assert_array_equal(pds.values, gds.values_host)

Expand All @@ -4492,7 +4493,7 @@ def test_series_values_host_property(data):
)
def test_series_values_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series_float64_default(data)
gds_vals = gds.values
assert isinstance(gds_vals, cupy.ndarray)
np.testing.assert_array_equal(gds_vals.get(), pds.values)
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import _create_pandas_series, assert_eq
from cudf.testing._utils import (
_create_pandas_series_float64_default,
assert_eq,
)


@pytest.mark.parametrize(
Expand All @@ -22,7 +25,7 @@
@pytest.mark.parametrize("inplace", [True, False])
def test_dropna_series(data, nulls, inplace):

psr = _create_pandas_series(data)
psr = _create_pandas_series_float64_default(data)

if len(data) > 0:
if nulls == "one":
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import cudf
from cudf import concat
from cudf.testing._utils import (
_create_pandas_series,
_create_pandas_series_float64_default,
assert_eq,
assert_exceptions_equal,
)
Expand Down Expand Up @@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset):
],
)
def test_drop_duplicates_series(data, keep):
pds = _create_pandas_series(data)
pds = _create_pandas_series_float64_default(data)
gds = cudf.from_pandas(pds)

assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))
Expand Down
16 changes: 12 additions & 4 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
SIGNED_INTEGER_TYPES,
SIGNED_TYPES,
UNSIGNED_TYPES,
_create_pandas_series,
_create_cudf_series_float64_default,
_create_pandas_series_float64_default,
assert_column_memory_eq,
assert_column_memory_ne,
assert_eq,
Expand Down Expand Up @@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other):
actual = gd_data.equals(np.array(gd_other))
assert_eq(expected, actual)

expected = pd_data.equals(_create_pandas_series(pd_other))
actual = gd_data.equals(cudf.Series(gd_other))
expected = pd_data.equals(_create_pandas_series_float64_default(pd_other))
actual = gd_data.equals(_create_cudf_series_float64_default(gd_other))
assert_eq(expected, actual)

expected = pd_data.astype("category").equals(pd_other)
Expand Down Expand Up @@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
],
)
def test_isin_index(data, values):
psr = _create_pandas_series(data)
psr = _create_pandas_series_float64_default(data)
gsr = cudf.Series.from_pandas(psr)

got = gsr.index.isin(values)
Expand Down Expand Up @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype):
assert_eq(pidx, gidx)


def test_empty_index_init():
    """Empty ``cudf.Index`` must match empty ``pd.Index``."""
    expected = pd.Index([])
    actual = cudf.Index([])

    assert_eq(expected, actual)


@pytest.mark.parametrize(
"data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
)
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

import cudf
from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140
from cudf.testing._utils import _create_pandas_series, assert_eq
from cudf.testing._utils import (
_create_pandas_series_float64_default,
assert_eq,
)
from cudf.testing.dataset_generator import rand_dataframe


Expand Down Expand Up @@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center):
elif nulls == "all":
data = [np.nan] * len(data)

psr = _create_pandas_series(data, index=index)
psr = _create_pandas_series_float64_default(data, index=index)
gsr = cudf.Series(psr)
for window_size in range(1, len(data) + 1):
for min_periods in range(1, window_size + 1):
Expand Down Expand Up @@ -313,7 +316,7 @@ def test_rolling_getitem_window():
@pytest.mark.parametrize("center", [True, False])
def test_rollling_series_numba_udf_basic(data, index, center):

psr = _create_pandas_series(data, index=index)
psr = _create_pandas_series_float64_default(data, index=index)
gsr = cudf.from_pandas(psr)

def some_func(A):
Expand Down
Loading

0 comments on commit f7ca051

Please sign in to comment.