-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Consolidate 1D pandas object handling in as_column #14394
Changes from 4 commits
e9c62bf
615c343
a8b5b1d
216f70a
8cb9004
c79b494
eb4df96
8517829
927b677
18be376
98b5a5c
ada7d6b
58822df
accee6a
69d59ec
c69f80e
4a45752
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2033,69 +2033,121 @@ def as_column( | |
|
||
return col | ||
|
||
elif isinstance(arbitrary, (pd.Series, pd.Categorical)): | ||
if isinstance(arbitrary, pd.Series): | ||
if isinstance( | ||
arbitrary.array, pd.core.arrays.masked.BaseMaskedArray | ||
): | ||
return as_column(arbitrary.array) | ||
elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype): | ||
if cudf.get_option("mode.pandas_compatible"): | ||
raise NotImplementedError("not supported") | ||
return as_column(pa.array(arbitrary.array, from_pandas=True)) | ||
elif isinstance(arbitrary.dtype, pd.SparseDtype): | ||
raise NotImplementedError( | ||
f"{arbitrary.dtype} is not supported. Convert first to " | ||
f"{arbitrary.dtype.subtype}." | ||
) | ||
if is_categorical_dtype(arbitrary.dtype): | ||
if isinstance( | ||
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype | ||
): | ||
raise NotImplementedError( | ||
"cuDF does not yet support timezone-aware datetimes" | ||
) | ||
data = as_column(pa.array(arbitrary, from_pandas=True)) | ||
elif is_interval_dtype(arbitrary.dtype): | ||
if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype): | ||
raise NotImplementedError( | ||
"cuDF does not yet support timezone-aware datetimes" | ||
elif isinstance( | ||
arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray) | ||
): | ||
if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)): | ||
raise NotImplementedError( | ||
f"cuDF does not yet support {type(arbitrary.dtype).__name__}" | ||
) | ||
elif ( | ||
cudf.get_option("mode.pandas_compatible") | ||
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex)) | ||
and arbitrary.freq is not None | ||
): | ||
raise NotImplementedError("freq is not implemented yet") | ||
elif ( | ||
isinstance(arbitrary.dtype, pd.DatetimeTZDtype) | ||
or ( | ||
isinstance(arbitrary.dtype, pd.IntervalDtype) | ||
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype) | ||
) | ||
or ( | ||
isinstance(arbitrary.dtype, pd.CategoricalDtype) | ||
and isinstance( | ||
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype | ||
) | ||
data = as_column(pa.array(arbitrary, from_pandas=True)) | ||
elif arbitrary.dtype == np.bool_: | ||
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) | ||
elif arbitrary.dtype.kind in ("f"): | ||
arb_dtype = np.dtype(arbitrary.dtype) | ||
) | ||
): | ||
raise NotImplementedError( | ||
"cuDF does not yet support timezone-aware datetimes" | ||
) | ||
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype): | ||
if cudf.get_option("mode.pandas_compatible"): | ||
raise NotImplementedError("not supported") | ||
if isinstance(arbitrary, (pd.Series, pd.Index)): | ||
arbitrary = arbitrary.array | ||
wence- marked this conversation as resolved.
Show resolved
Hide resolved
|
||
data = as_column( | ||
cupy.asarray(arbitrary, dtype=arb_dtype), | ||
pa.array(arbitrary, from_pandas=True), | ||
nan_as_null=nan_as_null, | ||
dtype=dtype, | ||
length=length, | ||
) | ||
elif arbitrary.dtype.kind in ("u", "i"): | ||
elif isinstance( | ||
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) | ||
): | ||
# TODO: Validate subtype? | ||
data = as_column( | ||
cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype | ||
pa.array(arbitrary, from_pandas=True), | ||
nan_as_null=nan_as_null, | ||
dtype=dtype, | ||
length=length, | ||
) | ||
elif isinstance(arbitrary.dtype, pd.PeriodDtype): | ||
elif isinstance( | ||
arbitrary.dtype, pd.api.extensions.ExtensionDtype | ||
) and not isinstance(arbitrary, pd.arrays.PandasArray): | ||
raise NotImplementedError( | ||
"cuDF does not yet support `PeriodDtype`" | ||
"Custom pandas ExtensionDtypes are not supported" | ||
) | ||
else: | ||
if cudf.get_option( | ||
"mode.pandas_compatible" | ||
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype): | ||
raise NotImplementedError("not supported") | ||
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) | ||
if arbitrary.dtype == cudf.dtype("object") and cudf.dtype( | ||
pyarrow_array.type.to_pandas_dtype() | ||
) != cudf.dtype(arbitrary.dtype): | ||
elif arbitrary.dtype.kind in "fiubmM": | ||
if isinstance(arbitrary, pd.arrays.PandasArray): | ||
arbitrary = np.array(arbitrary) | ||
arb_dtype = np.dtype(arbitrary.dtype) | ||
if arb_dtype.kind == "f" and arb_dtype.itemsize == 2: | ||
# float16 | ||
arbitrary = arbitrary.astype(np.dtype(np.float32)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question: Because cudf doesn't support float16 natively? Does this potentially cause problems in cudf-pandas mode since the dtype will not be preserved. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I believe so I was migrating this code below here arb_dtype = (
cudf.dtype("float32")
if arbitrary.dtype == "float16"
else cudf.dtype(arbitrary.dtype)
) pandas also does not support There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: Given that the pandas-2 behaviour raises, let us take this opportunity to add a deprecation warning here so that we can also raise once we're supporting pandas-2. (See https://docs.rapids.ai/api/cudf/nightly/developer_guide/contributing_guide/#deprecating-and-removing-code) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah so my generalization about float16 in pandas 2.0 isn't entirely correct. Only pandas Index objects will disallow float16 (IIRC there's no hashtable implementation for this type), but Series and DataFrame objects will continue to allow float16. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, but you can't do many things with float16 series (e.g. merge doesn't work). I think I would rather
Rather than silently upcasting. WDYT? I realise this would be a breaking change (since we currently do upcast in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah agreed raising here is better than silent upcasting, I'll make this raise and mark as a breaking change |
||
elif arb_dtype.kind in "mM": | ||
# not supported by cupy | ||
arbitrary = np.asarray(arbitrary) | ||
else: | ||
arbitrary = cupy.asarray(arbitrary) | ||
data = as_column( | ||
arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length | ||
) | ||
elif arbitrary.dtype.kind == "O": | ||
if isinstance(arbitrary, pd.arrays.PandasArray): | ||
# infer_dtype does not handle PandasArray | ||
arbitrary = np.array(arbitrary, dtype=object) | ||
inferred_dtype = infer_dtype(arbitrary) | ||
if inferred_dtype in ("mixed-integer", "mixed-integer-float"): | ||
raise MixedTypeError("Cannot create column with mixed types") | ||
elif inferred_dtype in ( | ||
"bytes", | ||
"floating", | ||
"integer", | ||
"boolean", | ||
"datetime", | ||
"timedelta", | ||
"datetime64", | ||
"timedelta64", | ||
): | ||
# Types that would be coerced but should be kept as "object" | ||
# e.g. pandas.Series([1], dtype=object) | ||
raise TypeError( | ||
f"Cannot convert a object type of {inferred_dtype}" | ||
wence- marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
# TODO: nan_as_na interaction here with from_pandas | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we open an issue documenting the various TODOs in data ingest so that we have an overview of what is still in-progress? It's unclear to me how much of this needs new implementation in cudf and how much needs thinking about the boundaries. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes definitely. Once I get all the existing tests passing I'll make an issue of the edge cases to handle There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! |
||
pyarrow_array = pa.array( | ||
arbitrary, | ||
from_pandas=True, | ||
) | ||
if isinstance(pyarrow_array.type, pa.Decimal128Type): | ||
pyarrow_type = cudf.Decimal128Dtype.from_arrow( | ||
pyarrow_array.type | ||
) | ||
else: | ||
pyarrow_type = arbitrary.dtype | ||
data = as_column(pyarrow_array, dtype=pyarrow_type) | ||
data = as_column( | ||
pyarrow_array, | ||
dtype=pyarrow_type, | ||
nan_as_null=nan_as_null, | ||
length=length, | ||
) | ||
else: | ||
raise NotImplementedError( | ||
f"{type(arbitrary).__name__} with " | ||
f"{type(arbitrary.dtype).__name__} is not supported." | ||
) | ||
if dtype is not None: | ||
data = data.astype(dtype) | ||
|
||
|
@@ -2171,11 +2223,13 @@ def as_column( | |
arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) | ||
|
||
buffer = as_buffer(arbitrary.view("|u1")) | ||
mask = None | ||
if nan_as_null is None or nan_as_null is True: | ||
data = build_column(buffer, dtype=arbitrary.dtype) | ||
data = _make_copy_replacing_NaT_with_null(data) | ||
mask = data.mask | ||
else: | ||
bool_mask = as_column(~np.isnat(arbitrary)) | ||
mask = as_buffer(bools_to_mask(bool_mask)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like it turns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I think we still want to consider There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, at one point of time we supported both |
||
|
||
data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) | ||
elif arb_dtype.kind == "m": | ||
|
@@ -2186,11 +2240,13 @@ def as_column( | |
arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) | ||
|
||
buffer = as_buffer(arbitrary.view("|u1")) | ||
mask = None | ||
if nan_as_null is None or nan_as_null is True: | ||
data = build_column(buffer, dtype=arbitrary.dtype) | ||
data = _make_copy_replacing_NaT_with_null(data) | ||
mask = data.mask | ||
else: | ||
bool_mask = as_column(np.isnat(arbitrary)) | ||
mask = as_buffer(bools_to_mask(bool_mask)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question here. |
||
|
||
data = cudf.core.column.timedelta.TimeDeltaColumn( | ||
data=buffer, | ||
|
@@ -2232,105 +2288,12 @@ def as_column( | |
if delayed_cast: | ||
data = data.astype(cudf.dtype(dtype)) | ||
|
||
elif isinstance(arbitrary, pd.arrays.PandasArray): | ||
if cudf.get_option( | ||
"mode.pandas_compatible" | ||
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype): | ||
raise NotImplementedError("not supported") | ||
if is_categorical_dtype(arbitrary.dtype): | ||
arb_dtype = arbitrary.dtype | ||
else: | ||
if arbitrary.dtype == pd.StringDtype(): | ||
arb_dtype = cudf.dtype("O") | ||
else: | ||
arb_dtype = ( | ||
cudf.dtype("float32") | ||
if arbitrary.dtype == "float16" | ||
else cudf.dtype(arbitrary.dtype) | ||
) | ||
if arb_dtype != arbitrary.dtype.numpy_dtype: | ||
arbitrary = arbitrary.astype(arb_dtype) | ||
if ( | ||
arbitrary.size != 0 | ||
and isinstance(arbitrary[0], pd._libs.interval.Interval) | ||
and arb_dtype.kind in ("O") | ||
): | ||
# changing from pd array to series,possible arrow bug | ||
interval_series = pd.Series(arbitrary) | ||
data = as_column( | ||
pa.Array.from_pandas(interval_series), dtype=arb_dtype | ||
) | ||
elif arb_dtype.kind in ("O", "U"): | ||
pyarrow_array = pa.Array.from_pandas(arbitrary) | ||
if not isinstance( | ||
pyarrow_array, | ||
( | ||
pa.ListArray, | ||
pa.StructArray, | ||
pa.NullArray, | ||
pa.Decimal128Array, | ||
pa.StringArray, | ||
pa.BooleanArray, | ||
), | ||
): | ||
raise MixedTypeError("Cannot create column with mixed types") | ||
data = as_column(pyarrow_array, dtype=arb_dtype) | ||
else: | ||
data = as_column( | ||
pa.array( | ||
arbitrary, | ||
from_pandas=True if nan_as_null is None else nan_as_null, | ||
), | ||
nan_as_null=nan_as_null, | ||
) | ||
if dtype is not None: | ||
data = data.astype(dtype) | ||
elif isinstance(arbitrary, pd.arrays.SparseArray): | ||
raise NotImplementedError( | ||
f"{arbitrary.dtype} is not supported. Convert first to " | ||
f"{arbitrary.dtype.subtype}." | ||
) | ||
elif isinstance(arbitrary, memoryview): | ||
data = as_column( | ||
np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null | ||
) | ||
elif isinstance(arbitrary, cudf.Scalar): | ||
data = ColumnBase.from_scalar(arbitrary, length if length else 1) | ||
elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): | ||
if cudf.get_option("mode.pandas_compatible"): | ||
raise NotImplementedError("not supported") | ||
data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype) | ||
elif ( | ||
( | ||
isinstance(arbitrary, pd.DatetimeIndex) | ||
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype) | ||
) | ||
or ( | ||
isinstance(arbitrary, pd.IntervalIndex) | ||
and is_datetime64tz_dtype(arbitrary.dtype.subtype) | ||
) | ||
or ( | ||
isinstance(arbitrary, pd.CategoricalIndex) | ||
and isinstance( | ||
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype | ||
) | ||
) | ||
): | ||
raise NotImplementedError( | ||
"cuDF does not yet support timezone-aware datetimes" | ||
) | ||
elif isinstance( | ||
arbitrary, (pd.core.arrays.period.PeriodArray, pd.PeriodIndex) | ||
): | ||
raise NotImplementedError( | ||
f"cuDF does not yet support {type(arbitrary).__name__}" | ||
) | ||
elif ( | ||
cudf.get_option("mode.pandas_compatible") | ||
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex)) | ||
and arbitrary.freq is not None | ||
): | ||
raise NotImplementedError("freq is not implemented yet") | ||
else: | ||
try: | ||
data = as_column( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question: What happens to this type of data if we're not in pandas-compat mode? And why is it not supported if we are?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We currently convert this correctly to a corresponding cudf type
We disallowed it for
cudf.pandas
since currently we cannot roundtrip back to pandas correctly since cudf doesn't keep track whether a e.g. numpy.int64 or pandas.Int64Dtype was passed inThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, makes sense, thanks.