From 30169059560e6e849695cbcb049b713ff976dca5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Dec 2023 15:50:41 -0800 Subject: [PATCH 1/9] Refactor scalar handling in as_column --- python/cudf/cudf/core/column/column.py | 28 ++++++++------------------ python/cudf/cudf/tests/test_column.py | 7 +++++++ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 430b71989d6..5db7a59a238 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2171,33 +2171,23 @@ def as_column( if dtype is not None: data = data.astype(dtype) - elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): - # This will always treat NaTs as nulls since it's not technically a - # discrete value like NaN - length = length or 1 - data = as_column( - pa.array(pd.Series([arbitrary] * length), from_pandas=True) - ) - if dtype is not None: - data = data.astype(dtype) - - elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview): + elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview): length = length or 1 if ( - (nan_as_null is True) + nan_as_null is True and isinstance(arbitrary, (np.floating, float)) and np.isnan(arbitrary) ): arbitrary = None if dtype is None: dtype = cudf.dtype("float64") + elif arbitrary is None and dtype is None: + dtype = cudf.dtype("object") + arbitrary = cudf.Scalar(arbitrary, dtype=dtype) + data = ColumnBase.from_scalar(arbitrary, length) - data = as_column(full(length, arbitrary, dtype=dtype)) - if not nan_as_null and not is_decimal_dtype(data.dtype): - if np.issubdtype(data.dtype, np.floating): - data = data.fillna(np.nan) - elif np.issubdtype(data.dtype, np.datetime64): - data = data.fillna(np.datetime64("NaT")) + if dtype is not None: + data = data.astype(dtype) elif hasattr(arbitrary, "__array_interface__"): # CUDF assumes values are always contiguous @@ -2315,8 +2305,6 @@ def as_column( data = as_column( np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) - elif isinstance(arbitrary, cudf.Scalar): - data = ColumnBase.from_scalar(arbitrary, length if length else 1) else: try: data = as_column( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 0546638f388..2b09054ea16 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -552,3 +552,10 @@ def test_astype_with_aliases(alias, expect_dtype, data): gd_data = cudf.Series.from_pandas(pd_data) assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) + + +def test_as_column_None_as_object_type(): + result = as_column(None, length=1) + assert isinstance(result, cudf.core.column.string.StringColumn) + assert len(result) == 1 + assert result.null_count == 1 From 6d2a3108b218f7c69e248b49c5d38a30b6c5835a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Dec 2023 13:49:48 -0800 Subject: [PATCH 2/9] Add exception for Interval --- python/cudf/cudf/core/column/column.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5db7a59a238..b4e0b70a47e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2173,6 +2173,14 @@ def as_column( elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview): length = length or 1 + if isinstance(arbitrary, pd.Interval): + # No cudf.Scalar support yet + return as_column( + pd.Series([arbitrary] * length), + nan_as_null=nan_as_null, + dtype=dtype, + length=length, + ) if ( nan_as_null is True and isinstance(arbitrary, (np.floating, float)) From 9b701bfc8cbd0ce5c96db56e401319954af4a757 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:33:03 -0800 Subject: [PATCH 3/9] lint --- python/cudf/cudf/core/column/column.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b6797f1e6f2..de9d08ff92d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -55,7 +55,6 @@ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_decimal_dtype, is_dtype_equal, is_integer_dtype, is_interval_dtype, From 5e7cb98a0930763787f49bb5b50fc441443d23b0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Dec 2023 16:57:51 -0800 Subject: [PATCH 4/9] Add return --- python/cudf/cudf/core/column/column.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index de9d08ff92d..f998ac82d46 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2184,6 +2184,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) + return data elif hasattr(arbitrary, "__array_interface__"): # CUDF assumes values are always contiguous From 3c87fa88e23fd0ef9af9ed83d399ba0e8451a201 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:59:29 -0800 Subject: [PATCH 5/9] Use as_column instead of full --- python/cudf/cudf/core/column/__init__.py | 3 +- python/cudf/cudf/core/column/categorical.py | 14 +++--- python/cudf/cudf/core/column/column.py | 56 ++++----------------- python/cudf/cudf/core/column/decimal.py | 6 +-- python/cudf/cudf/core/column/numerical.py | 5 +- python/cudf/cudf/core/column/string.py | 14 ++++-- python/cudf/cudf/core/dataframe.py | 24 +++++---- python/cudf/cudf/core/indexed_frame.py | 16 +++--- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/core/series.py | 7 ++- python/cudf/cudf/core/tools/datetimes.py | 6 +-- python/cudf/cudf/core/window/rolling.py | 6 +-- python/cudf/cudf/io/parquet.py | 16 +++--- python/cudf/cudf/tests/test_column.py | 2 +- python/cudf/cudf/tests/test_testing.py | 8 +-- python/cudf/cudf/utils/utils.py | 6 +-- python/dask_cudf/dask_cudf/backends.py | 8 +-- 17 files changed, 84 insertions(+), 117 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index aba4ded4f9d..c0bca2c3c95 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """ isort: skip_file @@ -19,7 +19,6 @@ column_empty_like_same_mask, concat_columns, deserialize_columns, - full, serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..569a6118abf 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -937,8 +937,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: ) return other - ary = column.full( - len(self), self._encode(other), dtype=self.codes.dtype + ary = column.as_column( + self._encode(other), length=len(self), dtype=self.codes.dtype ) return column.build_categorical_column( categories=self.dtype.categories._values, @@ -1618,11 +1618,9 @@ def _create_empty_categorical_column( return column.build_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( - column.full( - categorical_column.size, - _DEFAULT_CATEGORICAL_VALUE, - categorical_column.codes.dtype, - ) + _DEFAULT_CATEGORICAL_VALUE, + length=categorical_column.size, + dtype=categorical_column.codes.dtype, ), offset=categorical_column.offset, size=categorical_column.size, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f998ac82d46..b06ec30dd79 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -877,7 +877,7 @@ def isin(self, values: Sequence) -> ColumnBase: except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails - return full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") return lhs._obtain_isin_result(rhs) @@ -904,9 +904,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: if self.null_count and rhs.null_count: return self.isnull() else: - return cudf.core.column.full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") elif self.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") else: return None @@ -1388,9 +1388,7 @@ def _label_encoding( na_sentinel = cudf.Scalar(-1) def _return_sentinel_column(): - return cudf.core.column.full( - size=len(self), fill_value=na_sentinel, dtype=dtype - ) + return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: dtype = min_scalar_type(max(len(cats), na_sentinel), 8) @@ -1485,7 +1483,9 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1504,7 +1504,9 @@ def column_empty( elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = None children = ( - full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), build_column( data=as_buffer( rmm.DeviceBuffer( @@ -2694,42 +2696,6 @@ def arange( ) -def full( - size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None -) -> ColumnBase: - """ - Returns a column of given size and dtype, filled with a given value. - - Parameters - ---------- - size : int - size of the expected column. - fill_value : scalar - A scalar value to fill a new array. - dtype : default None - Data type specifier. It is inferred from other arguments by default. - - Returns - ------- - Column - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8') - >>> col - - >>> cudf.Series(col) - 0 7 - 1 7 - 2 7 - 3 7 - 4 7 - dtype: int8 - """ - return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) - - def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 02e03f92745..4c01c8a8d15 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import warnings from decimal import Decimal @@ -65,8 +65,8 @@ def as_string_column( def __pow__(self, other): if isinstance(other, int): if other == 0: - res = cudf.core.column.full( - size=len(self), fill_value=1, dtype=self.dtype + res = cudf.core.column.as_column( + 1, dtype=self.dtype, length=len(self) ) if self.nullable: res = res.set_mask(self.mask) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f40886bf153..9493368f625 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -41,7 +41,6 @@ as_column, build_column, column, - full, string, ) from cudf.core.dtypes import CategoricalDtype @@ -506,7 +505,7 @@ def find_and_replace( ) if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( - full(len(to_replace_col), replacement[0], self.dtype) + replacement[0], length=len(to_replace_col), dtype=self.dtype ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7bf81f3e2d3..c7e07a621e4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -5488,7 +5488,9 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: - offsets = column.full(size + 1, 0, dtype=size_type_dtype) + offsets = column.as_column( + 0, length=size + 1, dtype=size_type_dtype + ) chars = cudf.core.column.column_empty(0, dtype="int8") children = (offsets, chars) @@ -5885,8 +5887,8 @@ def _binaryop( "__eq__", "__ne__", }: - return column.full( - len(self), op == "__ne__", dtype="bool" + return column.as_column( + op == "__ne__", length=len(self), dtype="bool" ).set_mask(self.mask) else: return NotImplemented @@ -5895,7 +5897,9 @@ def _binaryop( if isinstance(other, cudf.Scalar): other = cast( StringColumn, - column.full(len(self), other, dtype="object"), + column.as_column( + other, length=len(self), dtype="object" + ), ) # Explicit types are necessary because mypy infers ColumnBase diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 01935fec8c3..e5251341680 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1388,7 +1388,7 @@ def __setitem__(self, arg, value): allow_non_unique=True, ) if is_scalar(value): - self._data[arg] = column.full(len(self), value) + self._data[arg] = as_column(value, length=len(self)) else: value = as_column(value) self._data[arg] = value @@ -1436,8 +1436,8 @@ def __setitem__(self, arg, value): else: for col in arg: if is_scalar(value): - self._data[col] = column.full( - size=len(self), fill_value=value + self._data[col] = as_column( + value, length=len(self) ) else: self._data[col] = column.as_column(value) @@ -3165,10 +3165,12 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): ) if _is_scalar_or_zero_d_array(value): - value = column.full( - len(self), + value = as_column( value, - "str" if libcudf.scalar._is_null_host_scalar(value) else None, + length=len(self), + dtype="str" + if libcudf.scalar._is_null_host_scalar(value) + else None, ) if len(self) == 0: @@ -5830,7 +5832,7 @@ def isin(self, values): fill_value = cudf.Scalar(False) def make_false_column_like_self(): - return column.full(len(self), fill_value, "bool") + return column.as_column(fill_value, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. @@ -5952,7 +5954,7 @@ def _prepare_for_rowwise_op(self, method, skipna): { name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable - else column.full(len(filtered._data[name]), True) + else as_column(True, length=len(filtered._data[name])) for name in filtered._data.names } ) @@ -7772,8 +7774,8 @@ def func(left, right, output): return output for name in uncommon_columns: - output._data[name] = column.full( - size=len(output), fill_value=value, dtype="bool" + output._data[name] = as_column( + value, length=len(output), dtype="bool" ) return output diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ab089ceb103..21c3d961a9e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -48,7 +48,7 @@ ) from cudf.core._base_index import BaseIndex from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, full +from cudf.core.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap from cudf.core.dtypes import ListDtype @@ -2096,7 +2096,7 @@ def duplicated(self, subset=None, keep="first"): (result,) = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, - [full(len(self), True, dtype=bool)], + [as_column(True, length=len(self), dtype=bool)], bounds_check=False, ) return cudf.Series(result, index=self.index) @@ -2357,9 +2357,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): # Mask and data column preallocated ans_col = _return_arr_from_dtype(retty, len(self)) - ans_mask = cudf.core.column.full( - size=len(self), fill_value=True, dtype="bool" - ) + ans_mask = as_column(size=len(self), fill_value=True, dtype="bool") output_args = [(ans_col, ans_mask), len(self)] input_args = _get_input_args_from_frame(self) launch_args = output_args + input_args + list(args) @@ -5252,10 +5250,10 @@ def _get_replacement_values_for_columns( values_columns = { col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) - else full( - len(to_replace), + else as_column( value, - cudf.dtype(type(value)), + length=len(to_replace), + dtype=cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 4f98a878792..ecc1dbcbb06 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -748,7 +748,7 @@ def isin(self, values, level=None): self_df = self.to_frame(index=False).reset_index() values_df = values_idx.to_frame(index=False) idx = self_df.merge(values_df, how="leftsemi")._data["index"] - res = cudf.core.column.full(size=len(self), fill_value=False) + res = column.as_column(False, length=len(self)) res[idx] = True result = res.values else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3562c83e797..aa41be9209b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -58,7 +58,6 @@ arange, as_column, column, - full, ) from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, @@ -1396,7 +1395,7 @@ def map(self, arg, na_action=None) -> "Series": { "x": arg.keys(), "s": arg.values(), - "bool": full(len(arg), True, dtype=self.dtype), + "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) res = lhs.merge(rhs, on="x", how="left").sort_values( @@ -1416,7 +1415,7 @@ def map(self, arg, na_action=None) -> "Series": { "x": arg.keys(), "s": arg, - "bool": full(len(arg), True, dtype=self.dtype), + "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) res = lhs.merge(rhs, on="x", how="left").sort_values( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 6ec9dcb5f44..c23e3de8115 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import math import re @@ -730,7 +730,7 @@ def _isin_datetimelike( rhs = cudf.core.column.as_column(values) if rhs.dtype.kind in {"f", "i", "u"}: - return cudf.core.column.full(len(lhs), False, dtype="bool") + return column.as_column(False, length=len(lhs), dtype="bool") rhs = rhs.astype(lhs.dtype) res = lhs._isin_earlystop(rhs) if res is not None: @@ -738,7 +738,7 @@ def _isin_datetimelike( except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails - return cudf.core.column.full(len(lhs), False, dtype="bool") + return column.as_column(False, length=len(lhs), dtype="bool") res = lhs._obtain_isin_result(rhs) return res diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8a92ea86d57..711a799aedf 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION +# Copyright (c) 2020-2024, NVIDIA CORPORATION import itertools @@ -245,8 +245,8 @@ def _apply_agg_column(self, source_column, agg_name): window = None else: preceding_window = as_column(self.window) - following_window = column.full( - self.window.size, 0, dtype=self.window.dtype + following_window = as_column( + 0, length=self.window.size, dtype=self.window.dtype ) window = None diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bcc24a85cf9..4ad1eb97270 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -20,7 +20,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like -from cudf.core.column import build_categorical_column, column_empty, full +from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @@ -762,9 +762,9 @@ def _parquet_to_frame( _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from `codes` - codes = full( - size=_len, - fill_value=partition_categories[name].index(value), + codes = as_column( + partition_categories[name].index(value), + length=_len, ) dfs[-1][name] = build_categorical_column( categories=partition_categories[name], @@ -788,10 +788,10 @@ def _parquet_to_frame( masked=True, ) else: - dfs[-1][name] = full( - size=_len, - fill_value=value, + dfs[-1][name] = as_column( + value, dtype=_dtype, + length=_len, ) # Concatenate dfs and return. diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 3c3e989fec6..cdf1b60aec6 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index e6658040663..bbb8ba687c4 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core.column.column import as_column, full +from cudf.core.column.column import as_column from cudf.testing import ( assert_frame_equal, assert_index_equal, @@ -173,8 +173,8 @@ def test_assert_column_equal_dtype_edge_cases(other): assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False) assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False) - base = full(len(base), fill_value=cudf.NA, dtype=base.dtype) - other = full(len(other), fill_value=cudf.NA, dtype=other.dtype) + base = as_column(cudf.NA, length=len(base), dtype=base.dtype) + other = as_column(cudf.NA, length=len(other), dtype=other.dtype) assert_column_equal(base, other, check_dtype=False) assert_column_equal(other, base, check_dtype=False) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index ec5693e14d2..95621cf9519 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import decimal import functools @@ -396,8 +396,8 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): else: result_mask = None - result_col = column.full( - size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_) + result_col = column.as_column( + bool_fill_value, dtype=cudf.dtype(np.bool_), length=len(lhs) ) if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 387643587d1..9bcc9fb7be7 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -106,8 +106,10 @@ def _get_non_empty_data(s): categories = ( s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] ) - codes = cudf.core.column.full( - size=2, fill_value=0, dtype=cudf._lib.types.size_type_dtype + codes = cudf.core.column.as_column( + 0, + dtype=cudf._lib.types.size_type_dtype, + length=2, ) ordered = s.ordered data = cudf.core.column.build_categorical_column( From 0811adc4d5e2379279c5c297fe5b9f74ac809b47 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jan 2024 17:02:33 -0800 Subject: [PATCH 6/9] Fix typo --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 21c3d961a9e..9e4458908c3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2357,7 +2357,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): # Mask and data column preallocated ans_col = _return_arr_from_dtype(retty, len(self)) - ans_mask = as_column(size=len(self), fill_value=True, dtype="bool") + ans_mask = as_column(True, length=len(self), dtype="bool") output_args = [(ans_col, ans_mask), len(self)] input_args = _get_input_args_from_frame(self) launch_args = output_args + input_args + list(args) From 30e77a2c042431b3af7f4a3f4bc4a1d98d468c0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 4 Jan 2024 18:08:04 -0800 Subject: [PATCH 7/9] Unpack 0D arrays --- python/cudf/cudf/core/dataframe.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e5251341680..37bb1093989 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3165,12 +3165,16 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): ) if _is_scalar_or_zero_d_array(value): + dtype = None + if hasattr(value, "ndim"): + dtype = value.dtype + value = value.item() + if libcudf.scalar._is_null_host_scalar(value): + dtype = "str" value = as_column( value, length=len(self), - dtype="str" - if libcudf.scalar._is_null_host_scalar(value) - else None, + dtype=dtype, ) if len(self) == 0: From 5ebd6f588fa1bafc6f482b1b84d5761f8d114193 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:03:23 -0800 Subject: [PATCH 8/9] For len 0 return empty of same dtype --- python/cudf/cudf/core/column/column.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index df09ffb6f57..e68756cbef8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2134,7 +2134,10 @@ def as_column( data = data.astype(dtype) elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview): - length = length or 1 + if length is None: + length = 1 + elif length < 0: + raise ValueError(f"{length=} must be >=0.") if isinstance(arbitrary, pd.Interval): # No cudf.Scalar support yet return as_column( @@ -2148,17 +2151,14 @@ def as_column( and isinstance(arbitrary, (np.floating, float)) and np.isnan(arbitrary) ): - arbitrary = None if dtype is None: - dtype = cudf.dtype("float64") - elif arbitrary is None and dtype is None: - dtype = cudf.dtype("object") + dtype = getattr(arbitrary, "dtype", cudf.dtype("float64")) + arbitrary = None arbitrary = cudf.Scalar(arbitrary, dtype=dtype) - data = ColumnBase.from_scalar(arbitrary, length) - - if dtype is not None: - data = data.astype(dtype) - return data + if length == 0: + return column_empty(length, dtype=arbitrary.dtype) + else: + return ColumnBase.from_scalar(arbitrary, length) elif hasattr(arbitrary, "__array_interface__"): # CUDF assumes values are always contiguous From 35e8aa5f3b5354f27d9c5e1007b60c4c1550ec92 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Feb 2024 15:53:53 -0800 Subject: [PATCH 9/9] Fix usages --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 6 +++--- python/cudf/cudf/core/multiindex.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3dd45cc808f..712b1bee821 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3212,7 +3212,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if _is_scalar_or_zero_d_array(value): dtype = None - if hasattr(value, "ndim"): + if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() if libcudf.scalar._is_null_host_scalar(value): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c8eedae200b..541199127b6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1227,9 +1227,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) needle = as_column(target) - result = cudf.core.column.full( - len(needle), - fill_value=-1, + result = as_column( + -1, + length=len(needle), dtype=libcudf.types.size_type_dtype, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 239307f824e..9a1dbac7e54 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1837,9 +1837,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): f"{method=} is not supported yet for MultiIndex." ) - result = cudf.core.column.full( - len(target), - fill_value=-1, + result = column.as_column( + -1, + length=len(target), dtype=libcudf.types.size_type_dtype, ) if not len(self):