Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up special casing in as_column for non-typed input #14636

Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b583163
Ensure cudf dtype is gotten first, fix bug in categorical interval logic
mroeschke Dec 13, 2023
43d5942
move more logic
mroeschke Dec 13, 2023
b8a2f5c
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 13, 2023
71377b7
Start running tests
mroeschke Dec 14, 2023
d5852b6
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 14, 2023
dfd0ac0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 14, 2023
109cbe1
Pass series tests
mroeschke Dec 14, 2023
897320a
Clean up special casing in as_column for non-typed input
mroeschke Dec 14, 2023
6c7a5e4
Remove not needed
mroeschke Dec 14, 2023
43abd0b
Add fallback from arrow parsing
mroeschke Dec 15, 2023
076cdf0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 15, 2023
01b6aee
temp refactor of inference case
mroeschke Dec 16, 2023
be536a0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 19, 2023
261acda
Undo merge
mroeschke Dec 19, 2023
20e8bc0
Code suggestion
mroeschke Dec 19, 2023
a9dcd67
Ensure arrow astypes
mroeschke Dec 29, 2023
4eeb6bb
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 29, 2023
5232653
Add carveouts at beginning
mroeschke Dec 29, 2023
7f96476
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 5, 2024
d592060
isinstance cupy.ndarray
mroeschke Jan 5, 2024
69d9f48
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Jan 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
341 changes: 70 additions & 271 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import builtins
import pickle
from collections import abc
from functools import cached_property
from itertools import chain
from types import SimpleNamespace
Expand Down Expand Up @@ -53,13 +52,13 @@
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal128_dtype,
is_decimal_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
is_interval_dtype,
is_list_dtype,
Expand Down Expand Up @@ -88,6 +87,7 @@
_maybe_convert_to_default_type,
cudf_dtype_from_pa_type,
get_time_unit,
is_column_like,
is_mixed_with_object_dtype,
min_scalar_type,
min_unsigned_type,
Expand Down Expand Up @@ -1911,6 +1911,14 @@ def _make_copy_replacing_NaT_with_null(column):
return out_col


def can_memoryview(arbitrary: Any) -> bool:
    """Return True if ``arbitrary`` supports the buffer protocol.

    Used to detect host objects (e.g. ``bytes``, ``bytearray``,
    ``array.array``) that can be wrapped in a :class:`memoryview`
    without copying, so ``as_column`` can route them through the
    ``memoryview`` handling path.

    Parameters
    ----------
    arbitrary : Any
        Object to probe for buffer-protocol support.

    Returns
    -------
    bool
        True if ``memoryview(arbitrary)`` succeeds, False otherwise.
    """
    try:
        # memoryview() raises TypeError for objects that do not
        # implement the buffer protocol; that is the only signal we need.
        memoryview(arbitrary)
    except TypeError:
        return False
    return True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion

Suggested change
def can_memoryview(arbitrary: Any) -> bool:
try:
memoryview(arbitrary)
return True
except TypeError:
return False
def as_memoryview(arbitrary: Any) -> Optional[memoryview]:
try:
return memoryview(arbitrary)
except TypeError:
return None



def as_column(
arbitrary: Any,
nan_as_null: Optional[bool] = None,
Expand Down Expand Up @@ -1951,6 +1959,13 @@ def as_column(
* pyarrow array
* pandas.Categorical objects
"""
if dtype is not None:
dtype = cudf.dtype(dtype)
if isinstance(dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"Use `tz_localize()` to construct timezone aware data."
)

if isinstance(arbitrary, ColumnBase):
if dtype is not None:
return arbitrary.astype(dtype)
Expand Down Expand Up @@ -2318,285 +2333,69 @@ def as_column(
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
elif can_memoryview(arbitrary):
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
elif can_memoryview(arbitrary):
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
elif (view := as_memoryview(arbitrary)) is not None:
data = as_column(view, dtype=dtype, nan_as_null=nan_as_null)

Then one could merge this handler into the isinstance(arbitrary, memoryview) case as well with:

if isinstance((view := as_memoryview(arbitrary)), memoryview):
    data = as_column(np.asarray(view), dtype=dtype, nan_as_null=nan_as_null)

Which brings me to a further suggestion for this function. One of the issues is that it's very difficult to read the control flow. AIUI, we basically have lots of different branches to check if we can produce a Column, assign that to data and then eventually return data. But it's unclear (for example here) when we do data = as_column(memoryview(arb), ...) whether there is any further processing to be done, or if we are now ready to return the final answer.

I think I would prefer if all of these branches return ... rather than assigning to a variable and going to fallthrough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would prefer if all of these branches return ... rather than assigning to a variable and going to fallthrough.

Good point. I'll try to have every branch do return ... for clarity. One aspect I'm not sure about is whether, when dtype= is specified and the input is already typed, an "astype" is performed in all branches.

else:
try:
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
except TypeError:
if dtype is not None:
# Arrow throws a type error if the input is of
# mixed-precision and cannot fit into the provided
# decimal type properly, see:
# https://github.com/apache/arrow/pull/9948
# Hence we should let the exception propagate to
# the user.
if isinstance(dtype, cudf.core.dtypes.Decimal128Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal128Column.from_arrow(data)
elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal64Column.from_arrow(data)
elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(data)

pa_type = None
np_type = None
try:
if dtype is not None:
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"Use `tz_localize()` to construct "
"timezone aware data."
)
elif is_datetime64_dtype(dtype):
# Error checking only, actual construction happens
# below.
pa_array = pa.array(arbitrary)
if (
isinstance(pa_array.type, pa.TimestampType)
and pa_array.type.tz is not None
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware "
"datetimes"
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
if type(data) not in (pa.ListArray, pa.NullArray):
raise ValueError(
"Cannot create list column from given data"
)
return as_column(data, nan_as_null=nan_as_null)
elif isinstance(
dtype, cudf.StructDtype
) and not isinstance(dtype, cudf.IntervalDtype):
data = pa.array(arbitrary, type=dtype.to_arrow())
return as_column(data, nan_as_null=nan_as_null)
elif isinstance(dtype, cudf.core.dtypes.Decimal128Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal128Column.from_arrow(
data
)
elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal64Column.from_arrow(
data
)
elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(
data
)
if is_bool_dtype(dtype):
# Need this special case handling for bool dtypes,
# since 'boolean' & 'pd.BooleanDtype' are not
# understood by np.dtype below.
dtype = "bool"
np_dtype = np.dtype(dtype)
if np_dtype.kind in {"m", "M"}:
unit = np.datetime_data(np_dtype)[0]
if unit not in {"ns", "us", "ms", "s", "D"}:
raise NotImplementedError(
f"{dtype=} is not supported."
)
np_type = np_dtype.type
pa_type = np_to_pa_dtype(np_dtype)
from_pandas = True if nan_as_null is None else nan_as_null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from_pandas = True if nan_as_null is None else nan_as_null
from_pandas = nan_as_null is None or nan_as_null

if dtype is not None:
if (
isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype))
or dtype == object
):
if dtype == object:
pd_dtype = "str"
else:
# By default cudf constructs a 64-bit column. Setting
# the `default_*_bitwidth` to 32 will result in a 32-bit
# column being created.
if (
cudf.get_option("default_integer_bitwidth")
and infer_dtype(arbitrary) == "integer"
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("int")
)
if cudf.get_option(
"default_float_bitwidth"
) and infer_dtype(arbitrary) in (
"floating",
"mixed-integer-float",
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("float")
)

if (
cudf.get_option("mode.pandas_compatible")
and isinstance(
arbitrary, (pd.Index, pd.api.extensions.ExtensionArray)
)
and _is_pandas_nullable_extension_dtype(arbitrary.dtype)
):
raise NotImplementedError("not supported")

pyarrow_array = pa.array(
arbitrary,
type=pa_type,
from_pandas=True if nan_as_null is None else nan_as_null,
)

if (
isinstance(pyarrow_array, pa.NullArray)
and pa_type is None
and dtype is None
and getattr(arbitrary, "dtype", None)
== cudf.dtype("object")
):
# pa.array constructor returns a NullArray
# for empty arrays, instead of a StringArray.
# This issue is only specific to this dtype,
# all other dtypes, result in their corresponding
# arrow array creation.
dtype = cudf.dtype("str")
pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype))

if (
isinstance(arbitrary, pd.Index)
and arbitrary.dtype == cudf.dtype("object")
and (
cudf.dtype(pyarrow_array.type.to_pandas_dtype())
!= cudf.dtype(arbitrary.dtype)
)
):
raise MixedTypeError(
"Cannot create column with mixed types"
)

if (
cudf.get_option("mode.pandas_compatible")
and pa.types.is_integer(pyarrow_array.type)
and pyarrow_array.null_count
):
pyarrow_array = pyarrow_array.cast("float64").fill_null(
np.nan
pd_dtype = dtype.to_pandas()
arbitrary = pd.Series(arbitrary, dtype=pd_dtype)
else:
if isinstance(dtype, np.dtype):
typ = np_to_pa_dtype(dtype)
else:
typ = dtype.to_arrow()
try:
arbitrary = pa.array(
arbitrary, type=typ, from_pandas=from_pandas
)

data = as_column(
pyarrow_array,
dtype=dtype,
nan_as_null=nan_as_null,
)
except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e:
if isinstance(e, MixedTypeError):
raise TypeError(str(e))
if is_categorical_dtype(dtype):
sr = pd.Series(arbitrary, dtype="category")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif np_type == np.str_:
sr = pd.Series(arbitrary, dtype="str")
data = as_column(sr, nan_as_null=nan_as_null)
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
except (pa.ArrowInvalid, pa.ArrowTypeError):
if not isinstance(dtype, np.dtype):
dtype = dtype.to_pandas()
arbitrary = pd.Series(arbitrary, dtype=dtype)
data = as_column(arbitrary, nan_as_null=nan_as_null)
else:
try:
arbitrary = pa.array(arbitrary, from_pandas=from_pandas)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to go via arrow first and only then try and fall back to pandas? Or should we just go via pandas? I note this PR does not fix #14627 because I think we still go down the same (different) arrow vs pandas codepath.

if cudf.get_option(
"default_integer_bitwidth"
) and pa.types.is_integer(arbitrary.type):
typ = np_to_pa_dtype(_maybe_convert_to_default_type("int"))
arbitrary = arbitrary.cast(typ)
elif cudf.get_option(
"default_float_bitwidth"
) and pa.types.is_floating(arbitrary.type):
typ = np_to_pa_dtype(
_maybe_convert_to_default_type("float")
)
):
arbitrary = arbitrary.cast(typ)
except (pa.ArrowInvalid, pa.ArrowTypeError):
if any(is_column_like(val) for val in arbitrary):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: are we guaranteed that arbitrary is iterable at this point?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe so since an initial pa.array call proxy-checks that, but this would fail here for generators for example so I'll make sure this branch casts the input to list or similar

return cudf.core.column.ListColumn.from_sequences(
arbitrary
)
elif isinstance(arbitrary, abc.Iterable) or isinstance(
arbitrary, abc.Sequence
):
data = as_column(
_construct_array(arbitrary, dtype),
dtype=dtype,
nan_as_null=nan_as_null,
)
else:
raise e
arbitrary = pd.Series(arbitrary)
if cudf.get_option(
"default_integer_bitwidth"
) and is_integer_dtype(arbitrary.dtype):
dtype = _maybe_convert_to_default_type("int")
elif cudf.get_option(
"default_float_bitwidth"
) and is_float_dtype(arbitrary.dtype):
dtype = _maybe_convert_to_default_type("float")
data = as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
return data


def _construct_array(
    arbitrary: Any, dtype: Optional[Dtype]
) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]:
    """
    Construct a CuPy/NumPy/Pandas array from `arbitrary`

    Tries a device (CuPy) array first; if that fails, falls back to
    host-side construction guided by pandas dtype inference.
    """
    try:
        # Normalize dtype through cudf before handing it to CuPy. Either
        # the normalization or cupy.asarray itself may raise
        # TypeError/ValueError (e.g. non-numeric input), in which case we
        # fall back to host construction below.
        dtype = dtype if dtype is None else cudf.dtype(dtype)
        arbitrary = cupy.asarray(arbitrary, dtype=dtype)
    except (TypeError, ValueError):
        native_dtype = dtype
        # skipna=False so nulls participate in the inference result
        inferred_dtype = infer_dtype(arbitrary, skipna=False)
        if (
            dtype is None
            and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
            and inferred_dtype
            in (
                "mixed",
                "mixed-integer",
            )
        ):
            # For mixed-type input with no requested dtype, force object
            # so np.asarray below keeps elements as-is — presumably to
            # avoid a lossy coercion it would otherwise pick; TODO confirm
            native_dtype = "object"
        if inferred_dtype == "interval":
            # Only way to construct an Interval column.
            return pd.array(arbitrary)
        elif (
            inferred_dtype == "string" and getattr(dtype, "kind", None) == "M"
        ):
            # We may have date-like strings with timezones
            try:
                pd_arbitrary = pd.to_datetime(arbitrary)
                if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
                    raise NotImplementedError(
                        "cuDF does not yet support timezone-aware datetimes"
                    )
                return pd_arbitrary.to_numpy()
            except pd.errors.OutOfBoundsDatetime:
                # Strings parsed but overflowed datetime64[ns]; fall
                # through to np.asarray below instead of failing here.
                # https://github.com/pandas-dev/pandas/issues/55096
                pass

        # Host fallback: np.dtype() normalizes the (possibly overridden)
        # requested dtype; None lets NumPy infer.
        arbitrary = np.asarray(
            arbitrary,
            dtype=native_dtype
            if native_dtype is None
            else np.dtype(native_dtype),
        )
    return arbitrary


def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]:
desc = obj.__cuda_array_interface__
mask = desc.get("mask", None)
Expand Down
Loading
Loading