Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up special casing in as_column for non-typed input #14636

Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b583163
Ensure cudf dtype is gotten first, fix bug in categorical interval logic
mroeschke Dec 13, 2023
43d5942
move more logic
mroeschke Dec 13, 2023
b8a2f5c
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 13, 2023
71377b7
Start running tests
mroeschke Dec 14, 2023
d5852b6
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 14, 2023
dfd0ac0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 14, 2023
109cbe1
Pass series tests
mroeschke Dec 14, 2023
897320a
Clean up special casing in as_column for non-typed input
mroeschke Dec 14, 2023
6c7a5e4
Remove not needed
mroeschke Dec 14, 2023
43abd0b
Add fallback from arrow parsing
mroeschke Dec 15, 2023
076cdf0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 15, 2023
01b6aee
temp refactor of inference case
mroeschke Dec 16, 2023
be536a0
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 19, 2023
261acda
Undo merge
mroeschke Dec 19, 2023
20e8bc0
Code suggestion
mroeschke Dec 19, 2023
a9dcd67
Ensure arrow astypes
mroeschke Dec 29, 2023
4eeb6bb
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Dec 29, 2023
5232653
Add carveouts at beginning
mroeschke Dec 29, 2023
7f96476
Merge remote-tracking branch 'upstream/branch-24.02' into ref/as_colu…
mroeschke Jan 5, 2024
d592060
isinstance cupy.ndarray
mroeschke Jan 5, 2024
69d9f48
Merge remote-tracking branch 'upstream/branch-24.04' into ref/as_colu…
mroeschke Jan 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
341 changes: 70 additions & 271 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import builtins
import pickle
from collections import abc
from functools import cached_property
from itertools import chain
from types import SimpleNamespace
Expand Down Expand Up @@ -53,13 +52,13 @@
infer_dtype,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_decimal32_dtype,
is_decimal64_dtype,
is_decimal128_dtype,
is_decimal_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
is_interval_dtype,
is_list_dtype,
Expand Down Expand Up @@ -88,6 +87,7 @@
_maybe_convert_to_default_type,
cudf_dtype_from_pa_type,
get_time_unit,
is_column_like,
is_mixed_with_object_dtype,
min_scalar_type,
min_unsigned_type,
Expand Down Expand Up @@ -1911,6 +1911,14 @@ def _make_copy_replacing_NaT_with_null(column):
return out_col


def can_memoryview(arbitrary: Any) -> bool:
    """Return True if ``arbitrary`` supports the buffer protocol.

    Used to detect host objects (e.g. ``bytes``, ``bytearray``,
    ``array.array``) that can be wrapped in a :class:`memoryview`
    without copying, so ``as_column`` can route them through the
    ``memoryview`` handling path.

    Parameters
    ----------
    arbitrary : Any
        Object to probe for buffer-protocol support.

    Returns
    -------
    bool
        True if ``memoryview(arbitrary)`` succeeds, False otherwise.
    """
    try:
        # memoryview() raises TypeError for objects that do not
        # implement the buffer protocol; that is the only signal we need.
        memoryview(arbitrary)
    except TypeError:
        return False
    return True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion

Suggested change
def can_memoryview(arbitrary: Any) -> bool:
try:
memoryview(arbitrary)
return True
except TypeError:
return False
def as_memoryview(arbitrary: Any) -> Optional[memoryview]:
try:
return memoryview(arbitrary)
except TypeError:
return None



def as_column(
arbitrary: Any,
nan_as_null: Optional[bool] = None,
Expand Down Expand Up @@ -1951,6 +1959,13 @@ def as_column(
* pyarrow array
* pandas.Categorical objects
"""
if dtype is not None:
dtype = cudf.dtype(dtype)
if isinstance(dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"Use `tz_localize()` to construct timezone aware data."
)

if isinstance(arbitrary, ColumnBase):
if dtype is not None:
return arbitrary.astype(dtype)
Expand Down Expand Up @@ -2318,285 +2333,69 @@ def as_column(
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
elif can_memoryview(arbitrary):
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
elif can_memoryview(arbitrary):
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
elif (view := as_memoryview(arbitrary)) is not None:
data = as_column(view, dtype=dtype, nan_as_null=nan_as_null)

Then one could merge this handler into the isinstance(arbitrary, memoryview) case as well with:

if isinstance((view := as_memoryview(arbitrary)), memoryview):
    data = as_column(np.asarray(view), dtype=dtype, nan_as_null=nan_as_null)

Which brings me to a further suggestion for this function. One of the issues is that it's very difficult to read the control flow. AIUI, we basically have lots of different branches to check if we can produce a Column, assign that to data and then eventually return data. But it's unclear (for example here) when we do data = as_column(memoryview(arb), ...) whether there is any further processing to be done, or if we are now ready to return the final answer.

I think I would prefer if all of these branches return ... rather than assigning to a variable and going to fallthrough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would prefer if all of these branches return ... rather than assigning to a variable and going to fallthrough.

Good point. I'll try to have every branch do return ... for clarity. One aspect I'm not sure about is whether, when dtype= is specified and the input is already typed, an "astype" is performed in all branches.

else:
try:
data = as_column(
memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
except TypeError:
if dtype is not None:
# Arrow throws a type error if the input is of
# mixed-precision and cannot fit into the provided
# decimal type properly, see:
# https://github.com/apache/arrow/pull/9948
# Hence we should let the exception propagate to
# the user.
if isinstance(dtype, cudf.core.dtypes.Decimal128Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal128Column.from_arrow(data)
elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal64Column.from_arrow(data)
elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(data)

pa_type = None
np_type = None
try:
if dtype is not None:
if is_categorical_dtype(dtype) or is_interval_dtype(dtype):
raise TypeError
if is_datetime64tz_dtype(dtype):
raise NotImplementedError(
"Use `tz_localize()` to construct "
"timezone aware data."
)
elif is_datetime64_dtype(dtype):
# Error checking only, actual construction happens
# below.
pa_array = pa.array(arbitrary)
if (
isinstance(pa_array.type, pa.TimestampType)
and pa_array.type.tz is not None
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware "
"datetimes"
)
if is_list_dtype(dtype):
data = pa.array(arbitrary)
if type(data) not in (pa.ListArray, pa.NullArray):
raise ValueError(
"Cannot create list column from given data"
)
return as_column(data, nan_as_null=nan_as_null)
elif isinstance(
dtype, cudf.StructDtype
) and not isinstance(dtype, cudf.IntervalDtype):
data = pa.array(arbitrary, type=dtype.to_arrow())
return as_column(data, nan_as_null=nan_as_null)
elif isinstance(dtype, cudf.core.dtypes.Decimal128Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal128Column.from_arrow(
data
)
elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal64Column.from_arrow(
data
)
elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
data = pa.array(
arbitrary,
type=pa.decimal128(
precision=dtype.precision, scale=dtype.scale
),
)
return cudf.core.column.Decimal32Column.from_arrow(
data
)
if is_bool_dtype(dtype):
# Need this special case handling for bool dtypes,
# since 'boolean' & 'pd.BooleanDtype' are not
# understood by np.dtype below.
dtype = "bool"
np_dtype = np.dtype(dtype)
if np_dtype.kind in {"m", "M"}:
unit = np.datetime_data(np_dtype)[0]
if unit not in {"ns", "us", "ms", "s", "D"}:
raise NotImplementedError(
f"{dtype=} is not supported."
)
np_type = np_dtype.type
pa_type = np_to_pa_dtype(np_dtype)
from_pandas = True if nan_as_null is None else nan_as_null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from_pandas = True if nan_as_null is None else nan_as_null
from_pandas = nan_as_null is None or nan_as_null

if dtype is not None:
if (
isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype))
or dtype == object
):
if dtype == object:
pd_dtype = "str"
else:
# By default cudf constructs a 64-bit column. Setting
# the `default_*_bitwidth` to 32 will result in a 32-bit
# column being created.
if (
cudf.get_option("default_integer_bitwidth")
and infer_dtype(arbitrary) == "integer"
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("int")
)
if cudf.get_option(
"default_float_bitwidth"
) and infer_dtype(arbitrary) in (
"floating",
"mixed-integer-float",
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("float")
)

if (
cudf.get_option("mode.pandas_compatible")
and isinstance(
arbitrary, (pd.Index, pd.api.extensions.ExtensionArray)
)
and _is_pandas_nullable_extension_dtype(arbitrary.dtype)
):
raise NotImplementedError("not supported")

pyarrow_array = pa.array(
arbitrary,
type=pa_type,
from_pandas=True if nan_as_null is None else nan_as_null,
)

if (
isinstance(pyarrow_array, pa.NullArray)
and pa_type is None
and dtype is None
and getattr(arbitrary, "dtype", None)
== cudf.dtype("object")
):
# pa.array constructor returns a NullArray
# for empty arrays, instead of a StringArray.
# This issue is only specific to this dtype,
# all other dtypes, result in their corresponding
# arrow array creation.
dtype = cudf.dtype("str")
pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype))

if (
isinstance(arbitrary, pd.Index)
and arbitrary.dtype == cudf.dtype("object")
and (
cudf.dtype(pyarrow_array.type.to_pandas_dtype())
!= cudf.dtype(arbitrary.dtype)
)
):
raise MixedTypeError(
"Cannot create column with mixed types"
)

if (
cudf.get_option("mode.pandas_compatible")
and pa.types.is_integer(pyarrow_array.type)
and pyarrow_array.null_count
):
pyarrow_array = pyarrow_array.cast("float64").fill_null(
np.nan
pd_dtype = dtype.to_pandas()
arbitrary = pd.Series(arbitrary, dtype=pd_dtype)
else:
if isinstance(dtype, np.dtype):
typ = np_to_pa_dtype(dtype)
else:
typ = dtype.to_arrow()
try:
arbitrary = pa.array(
arbitrary, type=typ, from_pandas=from_pandas
)

data = as_column(
pyarrow_array,
dtype=dtype,
nan_as_null=nan_as_null,
)
except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e:
if isinstance(e, MixedTypeError):
raise TypeError(str(e))
if is_categorical_dtype(dtype):
sr = pd.Series(arbitrary, dtype="category")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif np_type == np.str_:
sr = pd.Series(arbitrary, dtype="str")
data = as_column(sr, nan_as_null=nan_as_null)
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
except (pa.ArrowInvalid, pa.ArrowTypeError):
if not isinstance(dtype, np.dtype):
dtype = dtype.to_pandas()
arbitrary = pd.Series(arbitrary, dtype=dtype)
data = as_column(arbitrary, nan_as_null=nan_as_null)
else:
try:
arbitrary = pa.array(arbitrary, from_pandas=from_pandas)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to go via arrow first and only then try and fall back to pandas? Or should we just go via pandas? I note this PR does not fix #14627 because I think we still go down the same (different) arrow vs pandas codepath.

if cudf.get_option(
"default_integer_bitwidth"
) and pa.types.is_integer(arbitrary.type):
typ = np_to_pa_dtype(_maybe_convert_to_default_type("int"))
arbitrary = arbitrary.cast(typ)
elif cudf.get_option(
"default_float_bitwidth"
) and pa.types.is_floating(arbitrary.type):
typ = np_to_pa_dtype(
_maybe_convert_to_default_type("float")
)
):
arbitrary = arbitrary.cast(typ)
except (pa.ArrowInvalid, pa.ArrowTypeError):
if any(is_column_like(val) for val in arbitrary):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: are we guaranteed that arbitrary is iterable at this point?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe so since an initial pa.array call proxy-checks that, but this would fail here for generators for example so I'll make sure this branch casts the input to list or similar

return cudf.core.column.ListColumn.from_sequences(
arbitrary
)
elif isinstance(arbitrary, abc.Iterable) or isinstance(
arbitrary, abc.Sequence
):
data = as_column(
_construct_array(arbitrary, dtype),
dtype=dtype,
nan_as_null=nan_as_null,
)
else:
raise e
arbitrary = pd.Series(arbitrary)
if cudf.get_option(
"default_integer_bitwidth"
) and is_integer_dtype(arbitrary.dtype):
dtype = _maybe_convert_to_default_type("int")
elif cudf.get_option(
"default_float_bitwidth"
) and is_float_dtype(arbitrary.dtype):
dtype = _maybe_convert_to_default_type("float")
data = as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
return data


def _construct_array(
    arbitrary: Any, dtype: Optional[Dtype]
) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]:
    """
    Construct a CuPy/NumPy/Pandas array from `arbitrary`

    Tries a device (CuPy) array first; if that fails, falls back to
    host-side construction guided by pandas dtype inference.
    """
    try:
        # Normalize dtype through cudf before handing it to CuPy. Either
        # the normalization or cupy.asarray itself may raise
        # TypeError/ValueError (e.g. non-numeric input), in which case we
        # fall back to host construction below.
        dtype = dtype if dtype is None else cudf.dtype(dtype)
        arbitrary = cupy.asarray(arbitrary, dtype=dtype)
    except (TypeError, ValueError):
        native_dtype = dtype
        # skipna=False so nulls participate in the inference result
        inferred_dtype = infer_dtype(arbitrary, skipna=False)
        if (
            dtype is None
            and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
            and inferred_dtype
            in (
                "mixed",
                "mixed-integer",
            )
        ):
            # For mixed-type input with no requested dtype, force object
            # so np.asarray below keeps elements as-is — presumably to
            # avoid a lossy coercion it would otherwise pick; TODO confirm
            native_dtype = "object"
        if inferred_dtype == "interval":
            # Only way to construct an Interval column.
            return pd.array(arbitrary)
        elif (
            inferred_dtype == "string" and getattr(dtype, "kind", None) == "M"
        ):
            # We may have date-like strings with timezones
            try:
                pd_arbitrary = pd.to_datetime(arbitrary)
                if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
                    raise NotImplementedError(
                        "cuDF does not yet support timezone-aware datetimes"
                    )
                return pd_arbitrary.to_numpy()
            except pd.errors.OutOfBoundsDatetime:
                # Strings parsed but overflowed datetime64[ns]; fall
                # through to np.asarray below instead of failing here.
                # https://github.com/pandas-dev/pandas/issues/55096
                pass

        # Host fallback: np.dtype() normalizes the (possibly overridden)
        # requested dtype; None lets NumPy infer.
        arbitrary = np.asarray(
            arbitrary,
            dtype=native_dtype
            if native_dtype is None
            else np.dtype(native_dtype),
        )
    return arbitrary


def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]:
desc = obj.__cuda_array_interface__
mask = desc.get("mask", None)
Expand Down
Loading
Loading