Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate 1D pandas object handling in as_column #14394

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.api import types as pd_types

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.core.dtypes import ( # noqa: F401
_BaseDtype,
dtype,
Expand Down Expand Up @@ -444,15 +445,31 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool:
)


def _is_pandas_nullable_extension_dtype(dtype_to_check):
def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool:
if isinstance(
dtype_to_check, pd.api.extensions.ExtensionDtype
) and not isinstance(dtype_to_check, pd.core.dtypes.dtypes.PandasDtype):
if isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
dtype_to_check,
(
pd.UInt8Dtype,
pd.UInt16Dtype,
pd.UInt32Dtype,
pd.UInt64Dtype,
pd.Int8Dtype,
pd.Int16Dtype,
pd.Int32Dtype,
pd.Int64Dtype,
pd.Float32Dtype,
pd.Float64Dtype,
pd.BooleanDtype,
pd.StringDtype,
),
) or (PANDAS_GE_150 and isinstance(dtype_to_check, pd.ArrowDtype)):
return True
elif isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
elif isinstance(dtype_to_check, pd.IntervalDtype):
return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype)
return False


Expand Down
247 changes: 105 additions & 142 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,69 +2033,121 @@ def as_column(

return col

elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
if isinstance(arbitrary, pd.Series):
if isinstance(
arbitrary.array, pd.core.arrays.masked.BaseMaskedArray
):
return as_column(arbitrary.array)
elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
return as_column(pa.array(arbitrary.array, from_pandas=True))
elif isinstance(arbitrary.dtype, pd.SparseDtype):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
if is_categorical_dtype(arbitrary.dtype):
if isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
elif isinstance(
arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray)
):
if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary.dtype).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
elif (
isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
or (
isinstance(arbitrary.dtype, pd.IntervalDtype)
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary.dtype, pd.CategoricalDtype)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
elif arbitrary.dtype.kind in ("f"):
arb_dtype = np.dtype(arbitrary.dtype)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
Comment on lines +2075 to +2076
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: What happens to this type of data if we're not in pandas-compat mode? And why is it not supported if we are?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens to this type of data if we're not in pandas-compat mode?

We currently convert this correctly to a corresponding cudf type

In [1]: import cudf; import pandas as pd

In [2]: cudf.from_pandas(pd.Series([1], dtype=pd.Int64Dtype()))
Out[2]: 
0    1
dtype: int64

And why is it not supported if we are?

We disallowed it for cudf.pandas since currently we cannot roundtrip back to pandas correctly, because cudf doesn't keep track of whether, e.g., a numpy.int64 or a pandas.Int64Dtype was passed in

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, makes sense, thanks.

if isinstance(arbitrary, (pd.Series, pd.Index)):
arbitrary = arbitrary.array
wence- marked this conversation as resolved.
Show resolved Hide resolved
data = as_column(
cupy.asarray(arbitrary, dtype=arb_dtype),
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif arbitrary.dtype.kind in ("u", "i"):
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
):
# TODO: Validate subtype?
data = as_column(
cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif isinstance(arbitrary.dtype, pd.PeriodDtype):
elif isinstance(
arbitrary.dtype, pd.api.extensions.ExtensionDtype
) and not isinstance(arbitrary, pd.arrays.PandasArray):
raise NotImplementedError(
"cuDF does not yet support `PeriodDtype`"
"Custom pandas ExtensionDtypes are not supported"
)
else:
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
if arbitrary.dtype == cudf.dtype("object") and cudf.dtype(
pyarrow_array.type.to_pandas_dtype()
) != cudf.dtype(arbitrary.dtype):
elif arbitrary.dtype.kind in "fiubmM":
if isinstance(arbitrary, pd.arrays.PandasArray):
arbitrary = np.array(arbitrary)
arb_dtype = np.dtype(arbitrary.dtype)
if arb_dtype.kind == "f" and arb_dtype.itemsize == 2:
# float16
arbitrary = arbitrary.astype(np.dtype(np.float32))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Because cudf doesn't support float16 natively? Does this potentially cause problems in cudf-pandas mode since the dtype will not be preserved.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I believe so I was migrating this code below here

               arb_dtype = (
                    cudf.dtype("float32")
                    if arbitrary.dtype == "float16"
                    else cudf.dtype(arbitrary.dtype)
                )

pandas also does not support float16 (which was made more intentional by raising in pandas 2.0; before, it would sometimes coerce to float32 as well), so I don't think the dtype preservation here is too much of an issue

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion:

Given that the pandas-2 behaviour raises, let us take this opportunity to add a deprecation warning here so that we can also raise once we're supporting pandas-2. (See https://docs.rapids.ai/api/cudf/nightly/developer_guide/contributing_guide/#deprecating-and-removing-code)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah so my generalization about float16 in pandas 2.0 isn't entirely correct.

Only pandas Index objects will disallow float16 (IIRC there's no hashtable implementation for this type), but Series and DataFrame objects will continue to allow float16.

Copy link
Contributor

@wence- wence- Nov 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, but you can't do many things with float16 series (e.g. merge doesn't work).

I think I would rather raise here (as we do at the moment for non-pandas data):

import cudf
cudf.Series([1, 2, 3], dtype="float16")
# TypeError: Unsupported type float16

Rather than silently upcasting. WDYT? I realise this would be a breaking change (since we currently do upcast in from_pandas).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah agreed raising here is better than silent upcasting, I'll make this raise and mark as a breaking change

elif arb_dtype.kind in "mM":
# not supported by cupy
arbitrary = np.asarray(arbitrary)
else:
arbitrary = cupy.asarray(arbitrary)
data = as_column(
arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
)
elif arbitrary.dtype.kind == "O":
if isinstance(arbitrary, pd.arrays.PandasArray):
# infer_dtype does not handle PandasArray
arbitrary = np.array(arbitrary, dtype=object)
inferred_dtype = infer_dtype(arbitrary)
if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
raise MixedTypeError("Cannot create column with mixed types")
elif inferred_dtype in (
"bytes",
"floating",
"integer",
"boolean",
"datetime",
"timedelta",
"datetime64",
"timedelta64",
):
# Types that would be coerced but should be kept as "object"
# e.g. pandas.Series([1], dtype=object)
raise TypeError(
f"Cannot convert a object type of {inferred_dtype}"
wence- marked this conversation as resolved.
Show resolved Hide resolved
)
# TODO: nan_as_na interaction here with from_pandas
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we open an issue documenting the various TODOs in data ingest so that we have an overview of what is still in-progress? It's unclear to me how much of this needs new implementation in cudf and how much needs thinking about the boundaries.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes definitely. Once I get all the existing tests passing I'll make an issue of the edge cases to handle

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

pyarrow_array = pa.array(
arbitrary,
from_pandas=True,
)
if isinstance(pyarrow_array.type, pa.Decimal128Type):
pyarrow_type = cudf.Decimal128Dtype.from_arrow(
pyarrow_array.type
)
else:
pyarrow_type = arbitrary.dtype
data = as_column(pyarrow_array, dtype=pyarrow_type)
data = as_column(
pyarrow_array,
dtype=pyarrow_type,
nan_as_null=nan_as_null,
length=length,
)
else:
raise NotImplementedError(
f"{type(arbitrary).__name__} with "
f"{type(arbitrary.dtype).__name__} is not supported."
)
if dtype is not None:
data = data.astype(dtype)

Expand Down Expand Up @@ -2171,11 +2223,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(~np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it turns NaT into nulls, is that the intention?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think we still want to consider NaT values when creating the mask (as null values), but not necessarily cast the value to NA as tested in test_series_np_array_nat_nan_as_null_false. cc @galipremsagar

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, at one point in time we supported both NA and NaT for datetime & timedelta columns. Now we just treat NA as NaT and vice versa for datetime & timedelta columns. With the recent pandas accelerator mode work, we decided to just repr out NA as NaT. So yes, we need to mark the mask whenever we have NaTs.


data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":
Expand All @@ -2186,11 +2240,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question here.


data = cudf.core.column.timedelta.TimeDeltaColumn(
data=buffer,
Expand Down Expand Up @@ -2232,105 +2288,12 @@ def as_column(
if delayed_cast:
data = data.astype(cudf.dtype(dtype))

elif isinstance(arbitrary, pd.arrays.PandasArray):
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
if is_categorical_dtype(arbitrary.dtype):
arb_dtype = arbitrary.dtype
else:
if arbitrary.dtype == pd.StringDtype():
arb_dtype = cudf.dtype("O")
else:
arb_dtype = (
cudf.dtype("float32")
if arbitrary.dtype == "float16"
else cudf.dtype(arbitrary.dtype)
)
if arb_dtype != arbitrary.dtype.numpy_dtype:
arbitrary = arbitrary.astype(arb_dtype)
if (
arbitrary.size != 0
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and arb_dtype.kind in ("O")
):
# changing from pd array to series,possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arb_dtype
)
elif arb_dtype.kind in ("O", "U"):
pyarrow_array = pa.Array.from_pandas(arbitrary)
if not isinstance(
pyarrow_array,
(
pa.ListArray,
pa.StructArray,
pa.NullArray,
pa.Decimal128Array,
pa.StringArray,
pa.BooleanArray,
),
):
raise MixedTypeError("Cannot create column with mixed types")
data = as_column(pyarrow_array, dtype=arb_dtype)
else:
data = as_column(
pa.array(
arbitrary,
from_pandas=True if nan_as_null is None else nan_as_null,
),
nan_as_null=nan_as_null,
)
if dtype is not None:
data = data.astype(dtype)
elif isinstance(arbitrary, pd.arrays.SparseArray):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
elif isinstance(arbitrary, memoryview):
data = as_column(
np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype)
elif (
(
isinstance(arbitrary, pd.DatetimeIndex)
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary, pd.IntervalIndex)
and is_datetime64tz_dtype(arbitrary.dtype.subtype)
)
or (
isinstance(arbitrary, pd.CategoricalIndex)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif isinstance(
arbitrary, (pd.core.arrays.period.PeriodArray, pd.PeriodIndex)
):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
else:
try:
data = as_column(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6317,7 +6317,7 @@ def test_df_string_cat_types_mask_where(data, condition, other, has_cat):
(
pd.Series([random.random() for _ in range(10)], dtype="float128"),
None,
TypeError,
ValueError,
),
],
)
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2638,3 +2638,23 @@ def test_series_setitem_mixed_bool_dtype():
s = cudf.Series([True, False, True])
with pytest.raises(TypeError):
s[0] = 10


@pytest.mark.parametrize(
"nat, value",
[
[np.datetime64("nat"), np.datetime64("2020-01-01")],
[np.timedelta64("nat"), np.timedelta64(1)],
],
)
def test_series_np_array_nat_nan_as_null_false(nat, value, request):
expected = np.array([nat, value])
if expected.dtype.kind == "m":
request.applymarker(
pytest.mark.xfail(
raises=TypeError, reason="timedelta64 not supported by cupy"
)
)
ser = cudf.Series(expected, nan_as_null=False)
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
assert ser[0] is pd.NaT
assert ser[1] == value
Loading