diff --git a/CHANGELOG.md b/CHANGELOG.md index 35e21b91538..2c7cafc2d35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -174,6 +174,7 @@ - PR #4493 Skip legacy testing in CI - PR #4524 Updating `__setitem__` for DataFrame to use scalar scatter - PR #4534 Disable deprecation warnings as errors. +- PR #4506 Check for multi-dimensional data in column/Series creation ## Bug Fixes diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b3d822510e9..710545c2a05 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1160,37 +1160,9 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): col = col.set_mask(mask) elif np.issubdtype(col.dtype, np.datetime64): if nan_as_null or (mask is None and nan_as_null is None): - null = column_empty_like(col, masked=True, newsize=1) - col = libcudfxx.replace.replace( - col, - as_column( - Buffer( - np.array([np.datetime64("NaT")], dtype=col.dtype) - ), - dtype=col.dtype, - ), - null, - ) + col = utils.time_col_replace_nulls(col) return col - elif isinstance(arbitrary, np.ndarray): - # CUDF assumes values are always contiguous - if not arbitrary.flags["C_CONTIGUOUS"]: - arbitrary = np.ascontiguousarray(arbitrary) - - if dtype is not None: - arbitrary = arbitrary.astype(dtype) - - if arbitrary.dtype.kind == "M": - data = datetime.DatetimeColumn.from_numpy(arbitrary) - - elif arbitrary.dtype.kind in ("O", "U"): - data = as_column( - pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype - ) - else: - data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null) - elif isinstance(arbitrary, pa.Array): if isinstance(arbitrary, pa.StringArray): pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow( @@ -1357,6 +1329,49 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): elif np.issubdtype(data.dtype, np.datetime64): data = data.fillna(np.datetime64("NaT")) + elif hasattr(arbitrary, "__array_interface__"): + # 
CUDF assumes values are always contiguous + desc = arbitrary.__array_interface__ + shape = desc["shape"] + arb_dtype = np.dtype(desc["typestr"]) + # Reject multi-dimensional input: data must be 1-dimensional + if len(shape) > 1: + raise ValueError("Data must be 1-dimensional") + + arbitrary = np.asarray(arbitrary) + if not arbitrary.flags["C_CONTIGUOUS"]: + arbitrary = np.ascontiguousarray(arbitrary) + + if dtype is not None: + arbitrary = arbitrary.astype(dtype) + + if arb_dtype.kind == "M": + + time_unit, _ = np.datetime_data(arbitrary.dtype) + cast_dtype = time_unit in ("D", "W", "M", "Y") + + if cast_dtype: + arbitrary = arbitrary.astype(np.dtype("datetime64[s]")) + + buffer = Buffer(arbitrary) + mask = None + if nan_as_null: + data = as_column( + buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null + ) + data = utils.time_col_replace_nulls(data) + mask = data.mask + + data = datetime.DatetimeColumn( + data=buffer, mask=mask, dtype=arbitrary.dtype + ) + elif arb_dtype.kind in ("O", "U"): + data = as_column( + pa.Array.from_pandas(arbitrary), dtype=arbitrary.dtype + ) + else: + data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null) + elif isinstance(arbitrary, memoryview): data = as_column( np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 5639e1dabe4..b1278c0d766 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -4,11 +4,10 @@ import pandas as pd import pyarrow as pa -import cudf import cudf._lib as libcudf import cudf._libxx as libcudfxx from cudf.core.buffer import Buffer -from cudf.core.column import as_column, column +from cudf.core.column import column from cudf.utils import utils from cudf.utils.dtypes import is_scalar, np_to_pa_dtype @@ -54,47 +53,6 @@ def __contains__(self, item): return False return item.astype("int_") in self.as_numerical - @classmethod - def from_numpy(cls, array): - 
cast_dtype = array.dtype.type == np.int64 - if array.dtype.kind == "M": - time_unit, _ = np.datetime_data(array.dtype) - cast_dtype = time_unit in ("D", "W", "M", "Y") or ( - len(array) > 0 - and ( - isinstance(array[0], str) - or isinstance(array[0], dt.datetime) - ) - ) - elif not cast_dtype: - raise ValueError( - ("Cannot infer datetime dtype " + "from np.array dtype `%s`") - % (array.dtype) - ) - - if cast_dtype: - array = array.astype(np.dtype("datetime64[s]")) - assert array.dtype.itemsize == 8 - - mask = None - if np.any(np.isnat(array)): - null = cudf.core.column.column_empty_like( - array, masked=True, newsize=1 - ) - col = libcudfxx.replace.replace( - as_column(Buffer(array), dtype=array.dtype), - as_column( - Buffer( - np.array([np.datetime64("NaT")], dtype=array.dtype) - ), - dtype=array.dtype, - ), - null, - ) - mask = col.mask - - return cls(data=Buffer(array), mask=mask, dtype=array.dtype) - @property def time_unit(self): return self._time_unit diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0a9f2e84483..ca2b951478f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -852,13 +852,11 @@ def __init__(self, values, **kwargs): # and then just dispatch upstream kwargs = _setdefault_name(values, kwargs) if isinstance(values, np.ndarray) and values.dtype.kind == "M": - values = DatetimeColumn.from_numpy(values) + values = column.as_column(values) elif isinstance(values, pd.DatetimeIndex): - values = DatetimeColumn.from_numpy(values.values) + values = column.as_column(values.values) elif isinstance(values, (list, tuple)): - values = DatetimeColumn.from_numpy( - np.array(values, dtype="