diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ad56cabb48e..68e078a0c5f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,8 +4,6 @@ import builtins import pickle -import warnings -from collections import abc from functools import cached_property from itertools import chain from types import SimpleNamespace @@ -41,7 +39,6 @@ bitmask_allocation_size_bytes, create_null_mask, ) -from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( apply_boolean_mask, distinct_count as cpp_distinct_count, @@ -53,17 +50,14 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( _is_categorical_dtype, - _is_datetime64tz_dtype, - _is_interval_dtype, _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, is_bool_dtype, - is_datetime64_dtype, is_decimal_dtype, is_dtype_equal, + is_float_dtype, is_integer_dtype, - is_list_dtype, is_scalar, is_string_dtype, ) @@ -89,6 +83,7 @@ cudf_dtype_from_pa_type, find_common_type, get_time_unit, + is_column_like, is_mixed_with_object_dtype, min_scalar_type, min_unsigned_type, @@ -1848,25 +1843,14 @@ def as_column( * pandas.Categorical objects * range objects """ - if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): - column = libcudf.filling.sequence( - len(arbitrary), - as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), - as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), - ) - if cudf.get_option("default_integer_bitwidth") and dtype is None: - dtype = cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' + if dtype is not None: + dtype = cudf.dtype(dtype) + if isinstance(dtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "Use `tz_localize()` to construct timezone aware data." ) - if dtype is not None: - return column.astype(dtype) - return column - elif isinstance(arbitrary, (ColumnBase, cudf.Series, cudf.BaseIndex)): - # Ignoring nan_as_null per the docstring - if isinstance(arbitrary, cudf.Series): - arbitrary = arbitrary._column - elif isinstance(arbitrary, cudf.BaseIndex): - arbitrary = arbitrary._values + + if isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) return arbitrary @@ -1963,6 +1947,9 @@ def as_column( elif dtype is not None: col = col.astype(dtype) + elif dtype is not None: + col = col.astype(dtype) + return col elif isinstance( @@ -2227,267 +2214,76 @@ def as_column( elif isinstance(arbitrary, cudf.Scalar): data = ColumnBase.from_scalar(arbitrary, length if length else 1) else: - if dtype is not None: - # Arrow throws a type error if the input is of - # mixed-precision and cannot fit into the provided - # decimal type properly, see: - # https://github.com/apache/arrow/pull/9948 - # Hence we should let the exception propagate to - # the user. - if isinstance(dtype, cudf.core.dtypes.Decimal128Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal128Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal64Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal32Column.from_arrow(data) - - pa_type = None - np_type = None - try: - if dtype is not None: - if _is_categorical_dtype(dtype) or _is_interval_dtype(dtype): - raise TypeError - if _is_datetime64tz_dtype(dtype): - raise NotImplementedError( - "Use `tz_localize()` to construct " - "timezone aware data." - ) - elif is_datetime64_dtype(dtype): - # Error checking only, actual construction happens - # below. - pa_array = pa.array(arbitrary) - if ( - isinstance(pa_array.type, pa.TimestampType) - and pa_array.type.tz is not None - ): - raise NotImplementedError( - "cuDF does not yet support timezone-aware " - "datetimes" - ) - if is_list_dtype(dtype): - data = pa.array(arbitrary) - if type(data) not in (pa.ListArray, pa.NullArray): - raise ValueError( - "Cannot create list column from given data" - ) - return as_column(data, nan_as_null=nan_as_null) - elif isinstance(dtype, cudf.StructDtype) and not isinstance( - dtype, cudf.IntervalDtype - ): - data = pa.array(arbitrary, type=dtype.to_arrow()) - return as_column(data, nan_as_null=nan_as_null) - elif isinstance(dtype, cudf.core.dtypes.Decimal128Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal128Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal64Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - data = pa.array( - arbitrary, - type=pa.decimal128( - precision=dtype.precision, scale=dtype.scale - ), - ) - return cudf.core.column.Decimal32Column.from_arrow(data) - if is_bool_dtype(dtype): - # Need this special case handling for bool dtypes, - # since 'boolean' & 'pd.BooleanDtype' are not - # understood by np.dtype below. - dtype = "bool" - np_dtype = np.dtype(dtype) - if np_dtype.kind in {"m", "M"}: - unit = np.datetime_data(np_dtype)[0] - if unit not in {"ns", "us", "ms", "s", "D"}: - raise NotImplementedError( - f"{dtype=} is not supported." - ) - np_type = np_dtype.type - pa_type = np_to_pa_dtype(np_dtype) - else: - # By default cudf constructs a 64-bit column. Setting - # the `default_*_bitwidth` to 32 will result in a 32-bit - # column being created. - if ( - cudf.get_option("default_integer_bitwidth") - and infer_dtype(arbitrary) == "integer" - ): - pa_type = np_to_pa_dtype( - _maybe_convert_to_default_type("int") - ) - if cudf.get_option("default_float_bitwidth") and infer_dtype( - arbitrary - ) in ( - "floating", - "mixed-integer-float", - ): - pa_type = np_to_pa_dtype( - _maybe_convert_to_default_type("float") - ) - - if ( - cudf.get_option("mode.pandas_compatible") - and isinstance( - arbitrary, (pd.Index, pd.api.extensions.ExtensionArray) - ) - and _is_pandas_nullable_extension_dtype(arbitrary.dtype) - ): - raise NotImplementedError("not supported") - - pyarrow_array = pa.array( - arbitrary, - type=pa_type, - from_pandas=True if nan_as_null is None else nan_as_null, - ) - - if ( - isinstance(pyarrow_array, pa.NullArray) - and pa_type is None - and dtype is None - and getattr(arbitrary, "dtype", None) == cudf.dtype("object") - ): - # pa.array constructor returns a NullArray - # for empty arrays, instead of a StringArray. - # This issue is only specific to this dtype, - # all other dtypes, result in their corresponding - # arrow array creation. - dtype = cudf.dtype("str") - pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype)) - - if ( - isinstance(arbitrary, pd.Index) - and arbitrary.dtype == cudf.dtype("object") - and ( - cudf.dtype(pyarrow_array.type.to_pandas_dtype()) - != cudf.dtype(arbitrary.dtype) - ) - ): - raise MixedTypeError("Cannot create column with mixed types") - - if ( - cudf.get_option("mode.pandas_compatible") - and pa.types.is_integer(pyarrow_array.type) - and pyarrow_array.null_count - ): - pyarrow_array = pyarrow_array.cast("float64").fill_null(np.nan) - - data = as_column( - pyarrow_array, - dtype=dtype, - nan_as_null=nan_as_null, - ) - except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e: - if isinstance(e, MixedTypeError): - raise TypeError(str(e)) - if _is_categorical_dtype(dtype): - sr = pd.Series(arbitrary, dtype="category") - data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) - elif np_type == np.str_: - sr = pd.Series(arbitrary, dtype="str") - data = as_column(sr, nan_as_null=nan_as_null) - elif _is_interval_dtype(dtype): - sr = pd.Series(arbitrary, dtype="interval") - data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype) - elif ( - isinstance(arbitrary, Sequence) - and len(arbitrary) > 0 - and any( - cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary - ) - ): - return cudf.core.column.ListColumn.from_sequences(arbitrary) - elif isinstance(arbitrary, abc.Iterable) or isinstance( - arbitrary, abc.Sequence - ): - data = as_column( - _construct_array(arbitrary, dtype), + from_pandas = nan_as_null is None or nan_as_null + arbitrary = list(arbitrary) + for element in arbitrary: + # Carve-outs that cannot be parsed by pyarrow/pandas + if isinstance(element, cupy.ndarray): + # 0-D cupy.arrays ("scalar"): test_series_from_cupy_scalars + return as_column( + cupy.array(arbitrary), dtype=dtype, nan_as_null=nan_as_null, + length=length, ) + elif is_column_like(element): + # e.g. cudf.Series: test_nested_series_from_sequence_data + return cudf.core.column.ListColumn.from_sequences(arbitrary) + elif not any(element is na for na in (None, pd.NA, np.nan)): + # Might have NA + element like above, but short-circuit if + # an element pyarrow/pandas might be able to parse + break + if dtype is not None: + if ( + isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype)) + or dtype == object + ): + if dtype == object: + pd_dtype = "str" + else: + pd_dtype = dtype.to_pandas() + arbitrary = pd.Series(arbitrary, dtype=pd_dtype) else: - raise e - return data - - -def _construct_array( - arbitrary: Any, dtype: Optional[Dtype] -) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]: - """ - Construct a CuPy/NumPy/Pandas array from `arbitrary` - """ - try: - dtype = dtype if dtype is None else cudf.dtype(dtype) - arbitrary = cupy.asarray(arbitrary, dtype=dtype) - except (TypeError, ValueError): - native_dtype = dtype - inferred_dtype = infer_dtype(arbitrary, skipna=False) - if ( - dtype is None - and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and inferred_dtype - in ( - "mixed", - "mixed-integer", - ) - ): - native_dtype = "object" - if inferred_dtype == "interval": - # Only way to construct an Interval column. - return pd.array(arbitrary) - elif ( - inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" - ): - # We may have date-like strings with timezones + if isinstance(dtype, np.dtype): + typ = np_to_pa_dtype(dtype) + else: + typ = dtype.to_arrow() + try: + arbitrary = pa.array( + arbitrary, type=typ, from_pandas=from_pandas + ) + except (pa.ArrowInvalid, pa.ArrowTypeError): + if not isinstance(dtype, np.dtype): + dtype = dtype.to_pandas() + arbitrary = pd.Series(arbitrary, dtype=dtype) + data = as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) + else: try: - with warnings.catch_warnings(): - # Need to ignore userwarnings when - # datetime format cannot be inferred. - warnings.simplefilter("ignore", UserWarning) - pd_arbitrary = pd.to_datetime(arbitrary) - if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" + arbitrary = pa.array(arbitrary, from_pandas=from_pandas) + if cudf.get_option( + "default_integer_bitwidth" + ) and pa.types.is_integer(arbitrary.type): + typ = np_to_pa_dtype(_maybe_convert_to_default_type("int")) + arbitrary = arbitrary.cast(typ) + elif cudf.get_option( + "default_float_bitwidth" + ) and pa.types.is_floating(arbitrary.type): + typ = np_to_pa_dtype( + _maybe_convert_to_default_type("float") ) - return pd_arbitrary.to_numpy() - except pd.errors.OutOfBoundsDatetime: - # https://github.com/pandas-dev/pandas/issues/55096 - pass - - arbitrary = np.asarray( - arbitrary, - dtype=native_dtype - if native_dtype is None - else np.dtype(native_dtype), - ) - return arbitrary + arbitrary = arbitrary.cast(typ) + except (pa.ArrowInvalid, pa.ArrowTypeError): + arbitrary = pd.Series(arbitrary) + if cudf.get_option( + "default_integer_bitwidth" + ) and is_integer_dtype(arbitrary.dtype): + dtype = _maybe_convert_to_default_type("int") + elif cudf.get_option( + "default_float_bitwidth" + ) and is_float_dtype(arbitrary.dtype): + dtype = _maybe_convert_to_default_type("float") + data = as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) + return data def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 14006f90b45..fe446eac5b6 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2210,7 +2210,7 @@ def __getitem__(self, key): def test_series_constructor_error_mixed_type(): - with pytest.raises(pa.ArrowTypeError): + with pytest.raises(MixedTypeError): cudf.Series(["abc", np.nan, "123"], nan_as_null=False) @@ -2529,7 +2529,7 @@ def test_nan_as_null_from_arrow_objects(klass, data): @pytest.mark.parametrize("reso", ["M", "ps"]) @pytest.mark.parametrize("typ", ["M", "m"]) def test_series_invalid_reso_dtype(reso, typ): - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): cudf.Series([], dtype=f"{typ}8[{reso}]") @@ -2659,6 +2659,22 @@ def test_series_duplicate_index_reindex(): ) +def test_list_category_like_maintains_dtype(): + dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) + data = [1, 2, 3] + result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + expected = pd.Series(data, dtype=dtype.to_pandas()) + assert_eq(result, expected) + + +def test_list_interval_like_maintains_dtype(): + dtype = cudf.IntervalDtype(subtype=np.int8) + data = [pd.Interval(1, 2)] + result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype)) + expected = pd.Series(data, dtype=dtype.to_pandas()) + assert_eq(result, expected) + + @pytest.mark.parametrize( "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] )