diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b1335c7c076..6569184e90b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -569,17 +569,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - warnings.warn( - "The gpu_values property is deprecated and will be removed.", - FutureWarning, - ) - return self._values.data_array_view - def append(self, other): """ Append a collection of Index options together. @@ -1254,10 +1243,6 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) - # TODO: This method is deprecated and can be removed. - def to_array(self, fillna=None): - return self._values.to_array(fillna=fillna) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1536,14 +1521,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): "`allow_fill` and `fill_value` are unsupported." ) - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _apply_boolean_mask(self, boolean_mask): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de06e62cbb1..24f9dc83ca9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -46,6 +46,9 @@ ) +_DEFAULT_CATEGORICAL_VALUE = -1 + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. @@ -946,7 +949,11 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.codes.astype(signed_dtype).fillna(-1).to_array() + codes = ( + col.codes.astype(signed_dtype) + .fillna(_DEFAULT_CATEGORICAL_VALUE) + .values_host + ) if is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. @@ -1015,13 +1022,10 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self._default_na_value(): + if value == _DEFAULT_CATEGORICAL_VALUE: return None return self.categories.element_indexing(value) - def _default_na_value(self) -> ScalarLike: - return -1 - def find_and_replace( self, to_replace: ColumnLike, @@ -1178,7 +1182,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self._default_na_value(): + if fill_value == _DEFAULT_CATEGORICAL_VALUE: fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1582,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column._default_na_value(), + _DEFAULT_CATEGORICAL_VALUE, categorical_column.size, categorical_column.codes.dtype, ) @@ -1601,7 +1605,7 @@ def pandas_categorical_as_column( codes = categorical.codes if codes is None else codes codes = column.as_column(codes) - valid_codes = codes != codes.dtype.type(-1) + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) mask = None if not valid_codes.all(): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7999fa9039b..5d694dac255 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -314,51 +314,6 @@ def memory_usage(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def _default_na_value(self) -> Any: - raise NotImplementedError() - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - if fillna: - return self.fillna(self._default_na_value()).data_array_view - else: - return self.dropna(drop_nan=False).data_array_view - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_array(self, fillna=None) -> np.ndarray: - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - - return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill( self, fill_value: ScalarLike, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b763790986a..c72fb66addc 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -199,7 +199,7 @@ def to_pandas( # Pandas supports only `datetime64[ns]`, hence the cast. return pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), + self.astype("datetime64[ns]").fillna("NaT").values_host, copy=False, index=index, ) @@ -346,10 +346,6 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column""" - return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), @@ -488,15 +484,6 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False -def binop_offset(lhs, rhs, op): - if rhs._is_no_op: - return lhs - else: - rhs = rhs._generate_column(len(lhs), op) - out = libcudf.datetime.add_months(lhs, rhs) - return out - - def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a7481ce62a3..9b54c4d9acd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -355,20 +355,6 @@ def _process_for_reduction( skipna=skipna, min_count=min_count ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - dkind = self.dtype.kind - if dkind == "f": - return self.dtype.type(np.nan) - elif dkind == "i": - return np.iinfo(self.dtype).min - elif dkind == "u": - return np.iinfo(self.dtype).max - elif dkind == "b": - return self.dtype.type(False) - else: - raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b44b4e6831..6467fd39ddd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5218,26 +5218,6 @@ def values(self) -> cupy.ndarray: """ raise TypeError("String Arrays is not yet implemented in cudf") - # TODO: This method is deprecated and should be removed when the associated - # Frame methods are removed. - def to_array(self, fillna: bool = None) -> np.ndarray: - """Get a dense numpy array for the data. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Raises - ------ - ``NotImplementedError`` if there are nulls - """ - if fillna is not None: - warnings.warn("fillna parameter not supported for string arrays") - - return self.to_arrow().to_pandas().values - def to_pandas( self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": @@ -5402,9 +5382,6 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def _default_na_value(self) -> ScalarLike: - return None - def binary_operator( self, op: builtins.str, rhs, reflect: bool = False ) -> "column.ColumnBase": diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4b7a3bcc197..6c8c904e13c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,13 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - BinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) +from cudf._typing import BinaryOperand, DatetimeLikeScalar, Dtype, DtypeObj from cudf.api.types import is_scalar from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string @@ -123,7 +117,8 @@ def to_pandas( # Pandas supports only `timedelta64[ns]`, hence the cast. pd_series = pd.Series( - self.astype("timedelta64[ns]").to_array("NAT"), copy=False + self.astype("timedelta64[ns]").fillna("NaT").values_host, + copy=False, ) if index is not None: @@ -304,10 +299,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - return np.timedelta64("nat", self.time_unit) - @property def time_unit(self) -> str: return self._time_unit diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c246eb3b266..323a5ad088a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from nvtx import annotate from pandas._config import get_option from pandas.io.formats import console @@ -269,7 +268,9 @@ def _getitem_tuple_arg(self, arg): else: row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): - df.index = self._frame.index.take(row_selection) + df.index = self._frame.index._apply_boolean_mask( + row_selection + ) else: df.index = as_index(row_selection) # Step 4: Downcast @@ -3022,124 +3023,6 @@ def add_suffix(self, suffix): ] return out - def as_gpu_matrix(self, columns=None, order="F"): - warnings.warn( - "The as_gpu_matrix method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - if columns is None: - columns = self._data.names - - cols = [self._data[k] for k in columns] - ncol = len(cols) - nrow = len(self) - if ncol < 1: - # This is the case for empty dataframe - construct empty cupy array - matrix = cupy.empty( - shape=(0, 0), dtype=cudf.dtype("float64"), order=order - ) - return cuda.as_cuda_array(matrix) - - if any( - (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) - for c in cols - ): - raise TypeError("non-numeric data not yet supported") - - dtype = find_common_type([col.dtype for col in cols]) - for k, c in self._data.items(): - if c.has_nulls(): - raise ValueError( - f"column '{k}' has null values. " - f"hint: use .fillna() to replace null values" - ) - cupy_dtype = dtype - if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = cudf.dtype("int64") - - if order not in ("F", "C"): - raise ValueError( - "order parameter should be 'C' for row major or 'F' for" - "column major GPU matrix" - ) - - matrix = cupy.empty(shape=(nrow, ncol), dtype=cupy_dtype, order=order) - for colidx, inpcol in enumerate(cols): - dense = inpcol.astype(cupy_dtype) - matrix[:, colidx] = cupy.asarray(dense) - return cuda.as_cuda_array(matrix).view(dtype) - - def as_matrix(self, columns=None): - warnings.warn( - "The as_matrix method will be removed in a future cuDF " - "release. Consider using `to_numpy` instead.", - FutureWarning, - ) - return self.as_gpu_matrix(columns=columns).copy_to_host() - - def label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - """Encode labels in a column with label encoding. - - Parameters - ---------- - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; see Series.label_encoding - na_sentinel : number - Value to indicate missing category. - - Returns - ------- - A new DataFrame with a new column appended for the coded values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]}) - >>> df - a b - 0 1 10 - 1 2 10 - 2 3 20 - >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20]) - a b b_col_labels - 0 1 10 0 - 1 2 10 0 - 2 3 20 1 - """ - - warnings.warn( - "DataFrame.label_encoding is deprecated and will be removed in " - "the future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding( - column, prefix, cats, prefix_sep, dtype, na_sentinel - ) - - def _label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - # Private implementation of deprecated public label_encoding method - newname = prefix_sep.join([prefix, "labels"]) - newcol = self[column]._label_encoding( - cats=cats, dtype=dtype, na_sentinel=na_sentinel - ) - outdf = self.copy() - outdf.insert(len(outdf._data), newname, newcol) - return outdf - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -5535,7 +5418,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) .fillna(np.nan) ) - arr = cupy.asarray(prepared.as_gpu_matrix()) + arr = prepared.to_cupy() if skipna is not False and method in _cupy_nan_methods_map: method = _cupy_nan_methods_map[method] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 91c7a740699..fc59d15e264 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,6 +682,7 @@ def _intersection(self, other, sort=False): return new_index def _gather(self, gather_map, nullify=False, check_bounds=True): + gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @@ -771,23 +772,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - @classmethod - def deserialize(cls, header, frames): - if "index_column" in header: - warnings.warn( - "Index objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("index_column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - - return super().deserialize(header, frames) - def _binaryop( self, other: T, @@ -2508,7 +2492,7 @@ def to_pandas(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self._values.to_array()}," + f"{self.__class__.__name__}({self._values.values_host}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 59040e3ecbb..8ecab2c7c65 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1459,18 +1459,9 @@ def take(self, indices, axis=0): 0 1.0 a 2 3.0 c """ - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: + if self._get_axis_from_axis_arg(axis) != 0: raise NotImplementedError("Only axis=0 is supported.") - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _reset_index(self, level, drop, col_level=0, col_fill=""): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index fa84889adea..adce3c24a83 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -859,28 +859,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "names" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["column_names"] = header["names"] - column_names = pickle.loads(header["column_names"]) - if "source_data" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.08 or older will no longer be deserializable " - "after version 21.10. Please load and resave any " - "pickles before upgrading to version 21.12.", - FutureWarning, - ) - df = cudf.DataFrame.deserialize(header["source_data"], frames) - return cls.from_frame(df)._set_names(column_names) - # Spoof the column names to construct the frame, then set manually. + column_names = pickle.loads(header["column_names"]) header["column_names"] = pickle.dumps(range(0, len(column_names))) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 61975d47af2..66194f0f877 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -559,19 +559,6 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "column" in header: - warnings.warn( - "Series objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - index_nframes = header["index_frame_count"] obj = super().deserialize( header, frames[header["index_frame_count"] :] @@ -965,15 +952,6 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - def set_mask(self, mask, null_count=None): - warnings.warn( - "Series.set_mask is deprecated and will be removed in the future.", - FutureWarning, - ) - return self._from_data( - {self.name: self._column.set_mask(mask)}, self._index - ) - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. @@ -1623,25 +1601,6 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - def fill(self, fill_value, begin=0, end=-1, inplace=False): - warnings.warn( - "The fill method will be removed in a future cuDF release.", - FutureWarning, - ) - fill_values = [fill_value] - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1665,15 +1624,6 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - # TODO: When this method is removed we can also remove ColumnBase.to_array. - def to_array(self, fillna=None): - warnings.warn( - "The to_array method will be removed in a future cuDF " - "release. Consider using `to_numpy` instead.", - FutureWarning, - ) - return self._column.to_array(fillna=fillna) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1782,27 +1732,6 @@ def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def as_mask(self): - """Convert booleans to bitmask - - Returns - ------- - device array - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([True, False, True]) - >>> s.as_mask() - - """ - if not is_bool_dtype(self.dtype): - raise TypeError( - f"Series must of boolean dtype, found: {self.dtype}" - ) - - return self._column.as_mask() - def astype(self, dtype, copy=False, errors="raise"): """ Cast the Series to the given dtype @@ -2243,76 +2172,6 @@ def update(self, other): self.mask(mask, other, inplace=True) - def reverse(self): - warnings.warn( - "Series.reverse is deprecated and will be removed in the future.", - FutureWarning, - ) - rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - return self._from_data( - {self.name: self._column[rinds]}, self.index._values[rinds] - ) - - def label_encoding(self, cats, dtype=None, na_sentinel=-1): - """Perform label encoding. - - Parameters - ---------- - values : sequence of input values - dtype : numpy.dtype; optional - Specifies the output dtype. If `None` is given, the - smallest possible integer dtype (starting with np.int8) - is used. - na_sentinel : number, default -1 - Value to indicate missing category. - - Returns - ------- - A sequence of encoded labels with value between 0 and n-1 classes(cats) - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4, 10]) - >>> s.label_encoding([2, 3]) - 0 -1 - 1 0 - 2 1 - 3 -1 - 4 -1 - dtype: int8 - - `na_sentinel` parameter can be used to - control the value when there is no encoding. - - >>> s.label_encoding([2, 3], na_sentinel=10) - 0 10 - 1 0 - 2 1 - 3 10 - 4 10 - dtype: int8 - - When none of `cats` values exist in s, entire - Series will be `na_sentinel`. - - >>> s.label_encoding(['a', 'b', 'c']) - 0 -1 - 1 -1 - 2 -1 - 3 -1 - 4 -1 - dtype: int8 - """ - - warnings.warn( - "Series.label_encoding is deprecated and will be removed in the " - "future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding(cats, dtype, na_sentinel) - def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..2623569afac 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import warnings from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -143,16 +142,6 @@ def tolist(self): # noqa: D102 to_list = tolist - # TODO: When this method is removed we can also remove - # ColumnBase.to_gpu_array. - def to_gpu_array(self, fillna=None): # noqa: D102 - warnings.warn( - "The to_gpu_array method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - return self._column.to_gpu_array(fillna=fillna) - @classmethod def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index cc5aec36853..41dac26edf8 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -40,6 +40,17 @@ ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES)) +def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): + """Randomly nullify elements in series with the provided probability.""" + probs = [null_probability, 1 - null_probability] + rng = np.random.default_rng(seed=seed) + mask = rng.choice([False, True], size=len(series), p=probs) + series[mask] = None + + +# TODO: This function should be removed. Anywhere that it is being used should +# instead be generating a random boolean array (bytemask) and use the public +# APIs to set those elements to None. def random_bitmask(size): """ Parameters diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 925c9ef720c..ff6e79e7804 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -24,14 +24,10 @@ def test_applymap_round(nelem, masked): boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask), dtype=np.bool_ )[:nelem] - data[~boolmask] = np.nan + data[~boolmask] = None sr = Series(data) - if masked: - # Mask the Series - sr = sr.set_mask(bitmask) - # Call applymap out = sr.applymap( lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index e01b952be94..748cf958ac3 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -437,7 +437,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]] got_mask = mask_to_bools( gd_data["a"]._column.base_mask, 0, len(gd_data) - ).to_array() + ).values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -475,7 +475,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): expect_mask = [True if x is not pd.NA else False for x in pd_data] got_mask = mask_to_bools( gd_data._column.base_mask, 0, len(gd_data) - ).to_array() + ).values_host np.testing.assert_array_equal(expect_mask, got_mask) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 121cedb79da..889662c8a1c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -968,7 +968,7 @@ def test_dataframe_dir_and_getattr(): df.not_a_column -def test_empty_dataframe_to_array(): +def test_empty_dataframe_to_cupy(): df = cudf.DataFrame() # Check fully empty dataframe. @@ -1023,7 +1023,7 @@ def test_dataframe_to_cupy_null_values(): for k in "abcd": df[k] = data = np.random.random(nelem) bitmask = utils.random_bitmask(nelem) - df[k] = df[k].set_mask(bitmask) + df[k] = df[k]._column.set_mask(bitmask) boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ ) @@ -1194,7 +1194,7 @@ def test_dataframe_hash_partition_masked_value(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"].set_mask(bitmask) + gdf["val"] = gdf["val"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3) # Verify that the valid mask is correct for p in parted: @@ -1215,7 +1215,7 @@ def test_dataframe_hash_partition_masked_keys(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"].set_mask(bitmask) + gdf["key"] = gdf["key"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) # Verify that the valid mask is correct for p in parted: diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py deleted file mode 100644 index 224db2b39d1..00000000000 --- a/python/cudf/cudf/tests/test_fill.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -@pytest.mark.parametrize( - "fill_value,data", - [ - (7, [6, 3, 4]), - ("x", ["a", "b", "c", "d", "e", "f"]), - (7, [6, 3, 4, 2, 1, 7, 8, 5]), - (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), - ("b", pd.Categorical(["a", "b", "c"])), - (None, [0.0, 1.0, 2.0, 3.0]), - ], -) -@pytest.mark.parametrize( - "begin,end", - [ - (0, -1), - (0, 4), - (1, -1), - (1, 4), - (-2, 1), - (-2, -1), - (10, 12), - (8, 10), - (10, 8), - (-10, -8), - (-2, 6), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fill(data, fill_value, begin, end, inplace): - gs = cudf.Series(data) - ps = gs.to_pandas() - - if inplace: - actual = gs - gs[begin:end] = fill_value - else: - # private impl doesn't take care of rounding or bounds check - if begin < 0: - begin += len(gs) - - if end < 0: - end += len(gs) - - begin = max(0, min(len(gs), begin)) - end = max(0, min(len(gs), end)) - actual = gs.fill(fill_value, begin, end, False) - assert actual is not gs - - ps[begin:end] = fill_value - - assert_eq(ps, actual) - - -@pytest.mark.xfail(raises=ValueError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e452dc5d7f7..102e5b57e8e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -783,8 +783,8 @@ def test_dataframe_masked_slicing(nelem, slice_start, slice_end): gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) + gdf["a"] = gdf["a"]._column.set_mask(utils.random_bitmask(nelem)) + gdf["b"] = gdf["b"]._column.set_mask(utils.random_bitmask(nelem)) def do_slice(x): return x[slice_start:slice_end] diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py deleted file mode 100644 index e5c6bacf1d1..00000000000 --- a/python/cudf/cudf/tests/test_label_encode.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. - -import random -from itertools import product - -import numpy as np -import pytest - -import cudf -from cudf import DataFrame, Series - - -def _random_float(nelem, dtype): - return np.random.random(nelem).astype(dtype) - - -def _random_int(nelem, dtype): - return np.random.randint(low=0, high=nelem, size=nelem, dtype=dtype) - - -def _random(nelem, dtype): - dtype = cudf.dtype(dtype) - if dtype.kind in {"i", "u"}: - return _random_int(nelem, dtype) - elif dtype.kind == "f": - return _random_float(nelem, dtype) - - -_param_sizes = [1, 7, 10, 100, 1000] -_param_dtypes = [np.int32, np.float32] - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "nelem,dtype", list(product(_param_sizes, _param_dtypes)) -) -def test_label_encode(nelem, dtype): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = _random(nelem, dtype) - vals = df["cats"].unique() - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_numpy() - - # verify labels of new column - for i in range(arr.size): - np.testing.assert_equal(arr[i], lab.get(df.cats[i], None)) - - # label encode data frame - df2 = df.label_encoding(column="cats", prefix="cats", cats=vals) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -def test_label_encode_drop_one(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = np.random.randint(7, size=10, dtype=np.int32) - vals = df["cats"].unique() - # drop 1 randomly - vals = vals[vals.index != random.randrange(len(vals))].reset_index( - drop=True - ) - - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_numpy() - - # verify labels of new column - - for i in range(arr.size): - # assuming -1 is used for missing value - np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1)) - - # label encode data frame - df2 = df.label_encoding( - column="cats", prefix="cats", cats=vals, dtype="float32" - ) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -def test_label_encode_float_output(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = arr = np.random.randint(7, size=10, dtype=np.int32) - cats = [1, 2, 3, 4] - encoder = {c: i for i, c in enumerate(cats)} - df2 = df.label_encoding( - column="cats", - prefix="cats", - cats=cats, - dtype=np.float32, - na_sentinel=np.nan, - ) - - got = df2["cats_labels"].to_numpy(na_value=np.nan) - - handcoded = np.array([encoder.get(v, np.nan) for v in arr]) - np.testing.assert_equal(got, handcoded) - - -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "ncats,cat_dtype", [(10, np.int8), (127, np.int8), (128, np.int16)] -) -def test_label_encode_dtype(ncats, cat_dtype): - s = Series([str(i % ncats) for i in range(ncats + 1)]) - cats = s.unique().astype(s.dtype) - encoded_col = s.label_encoding(cats=cats) - np.testing.assert_equal(encoded_col.dtype, cat_dtype) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 21556aad1eb..80ab0671a0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -28,7 +28,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - random_bitmask, + set_random_null_mask_inplace, ) @@ -2124,7 +2124,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): gdf = cudf.from_pandas(pdf) if add_nulls: for col in gdf: - gdf[col] = gdf[col].set_mask(random_bitmask(len(gdf))) + set_random_null_mask_inplace(gdf[col]) gdf.to_parquet(file_path, index=False) # Read back from pyarrow diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 82020f30f7c..ca02ee55df0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -20,10 +20,8 @@ @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): size = 5 - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(1, 9, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view.copy_to_host(), @@ -62,10 +60,8 @@ def test_null_dataframe(ncols): size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(0, 128, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffdd53c58ac..1c80fe80f2d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -561,7 +561,9 @@ def test_series_value_counts(dropna, normalize): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series.from_masked_array( + arr, cudf.Series(mask)._column.as_mask() + ) sr.name = "col" expect = ( @@ -1517,3 +1519,48 @@ def test_series_transpose(data): assert_eq(pd_transposed, cudf_transposed) assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + + +@pytest.mark.parametrize( + "fill_value,data", + [ + (7, [6, 3, 4]), + ("x", ["a", "b", "c", "d", "e", "f"]), + (7, [6, 3, 4, 2, 1, 7, 8, 5]), + (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), + ("b", pd.Categorical(["a", "b", "c"])), + (None, [0.0, 1.0, 2.0, 3.0]), + ], +) +@pytest.mark.parametrize( + "begin,end", + [ + (0, -1), + (0, 4), + (1, -1), + (1, 4), + (-2, 1), + (-2, -1), + (10, 12), + (8, 10), + (10, 8), + (-10, -8), + (-2, 6), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fill(data, fill_value, begin, end, inplace): + gs = cudf.Series(data) + ps = gs.to_pandas() + + actual = gs + gs[begin:end] = fill_value + ps[begin:end] = fill_value + + assert_eq(ps, actual) + + +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 142ca6c6831..cb3a369d067 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -32,7 +32,8 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None psr = sr.to_pandas() psr[~mask] = np.nan @@ -83,7 +84,8 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -298,7 +300,8 @@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 935c3868a68..c5cd8f8b717 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -49,4 +49,4 @@ def generic_function(a, b): result = lhs_arr ** 3 + rhs_arr - np.testing.assert_almost_equal(result, out_col.to_array()) + np.testing.assert_almost_equal(result, out_col.values_host) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index fa5cde76524..3cbbc1e1ce7 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -173,7 +173,9 @@ def run(self, df, **launch_params): outputs[k], index=outdf.index, nan_as_null=False ) if out_mask is not None: - outdf[k] = outdf[k].set_mask(out_mask.data_array_view) + outdf._data[k] = outdf[k]._column.set_mask( + out_mask.data_array_view + ) return outdf diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index d23094ef3f9..add4ecd8f01 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -242,7 +242,6 @@ def _fillna_natwise(col): return column.build_column( data=result.base_data, dtype=result.dtype, - mask=col.base_mask, size=result.size, offset=result.offset, children=result.base_children, diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 1521ce41806..c7342818610 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -41,7 +41,7 @@ def test_series(data): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) - np.testing.assert_equal(np.array(pdsr), dsr.compute().to_array()) + np.testing.assert_equal(np.array(pdsr), dsr.compute().values_host) @pytest.mark.parametrize("data", [data_dt_1()]) @@ -114,7 +114,7 @@ def test_categorical_basic(data): sr = Series(cat) dsr = dgd.from_cudf(sr, npartitions=2) result = dsr.compute() - np.testing.assert_array_equal(cat.codes, result.to_array()) + np.testing.assert_array_equal(cat.codes, result.cat.codes.values_host) assert dsr.dtype.to_pandas() == pdsr.dtype # Test attributes @@ -122,7 +122,9 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal(pdsr.cat.codes.values, result.to_array()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, result.cat.codes.values_host + ) string = str(result) expect_str = """ @@ -207,12 +209,12 @@ def test_categorical_compare_ordered(data): # Test equality out = dsr1 == dsr1 assert out.dtype == np.bool_ - assert np.all(out.compute().to_array()) + assert np.all(out.compute().values_host) assert np.all(pdsr1 == pdsr1) # Test inequality out = dsr1 != dsr1 - assert not np.any(out.compute().to_array()) + assert not np.any(out.compute().values_host) assert not np.any(pdsr1 != pdsr1) assert dsr1.cat.ordered @@ -220,10 +222,10 @@ def test_categorical_compare_ordered(data): # Test ordered operators np.testing.assert_array_equal( - pdsr1 < pdsr2, (dsr1 < dsr2).compute().to_array() + pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host ) np.testing.assert_array_equal( - pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array() + pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ace9701b677..67fed62c582 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -284,7 +284,7 @@ def test_assign(): got = dgf.assign(z=newcol) dd.assert_eq(got.loc[:, ["x", "y"]], df) - np.testing.assert_array_equal(got["z"].compute().to_array(), pdcol) + np.testing.assert_array_equal(got["z"].compute().values_host, pdcol) @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"])