From 71253103d9cd655de3d2081f5021166387965846 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jan 2024 17:08:01 -0800 Subject: [PATCH 1/2] Replace as_numerical with as_numerical_column in datetimelike --- python/cudf/cudf/core/column/datetime.py | 41 ++++++++++------------- python/cudf/cudf/core/column/timedelta.py | 41 ++++++++++------------- 2 files changed, 36 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 466ea3220c8..794d0b869ce 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -267,7 +267,9 @@ def __contains__(self, item: ScalarLike) -> bool: # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item_as_dt64.astype("int64") in self.as_numerical + return item_as_dt64.astype("int64") in self.as_numerical_column( + "int64" + ) @property def time_unit(self) -> str: @@ -396,19 +398,6 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return NotImplemented - @property - def as_numerical(self) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", - column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), - ) - @property def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { @@ -448,9 +437,14 @@ def as_timedelta_column( def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + col = column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) + return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_string_column( self, dtype: Dtype, format=None, **kwargs @@ -483,7 +477,7 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical.mean( + self.as_numerical_column("int64").mean( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -497,7 +491,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std( + self.as_numerical_column("int64").std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -505,7 +499,8 @@ def std( def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical.median(skipna=skipna), unit=self.time_unit + self.as_numerical_column("int64").median(skipna=skipna), + unit=self.time_unit, ) def quantile( @@ -515,7 +510,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical.quantile( + result = self.as_numerical_column("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -616,12 +611,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical - return self.as_numerical.indices_of(value) + ).as_numerical_column("int64") + return self.as_numerical_column("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical.is_unique + return self.as_numerical_column("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 572b3b894dc..be366689ed0 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -114,7 +114,7 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.view("int64") in self.as_numerical + return item.view("int64") in self.as_numerical_column("int64") @property def values(self): @@ -133,7 +133,9 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical.data_array_view(mode="read").copy_to_host() + self.as_numerical_column("int64") + .data_array_view(mode="read") + .copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -260,19 +262,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: return cudf.Scalar(other) return NotImplemented - @property - def as_numerical(self) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", - column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), - ) - @property def time_unit(self) -> str: return self._time_unit @@ -303,9 +292,14 @@ def fillna( def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + col = column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) + return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_datetime_column( self, dtype: Dtype, **kwargs @@ -339,13 +333,14 @@ def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.mean(skipna=skipna, dtype=dtype), + self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.median(skipna=skipna), unit=self.time_unit + self.as_numerical_column("int64").median(skipna=skipna), + unit=self.time_unit, ) def isin(self, values: Sequence) -> ColumnBase: @@ -358,7 +353,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical.quantile( + result = self.as_numerical_column("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -378,7 +373,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. - self.as_numerical.sum( # type: ignore + self.as_numerical_column("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -392,7 +387,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std( + self.as_numerical_column("int64").std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, From a9e283a3d0498c19e6d8ec678952f00ea1a23853 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 10:29:05 -0800 Subject: [PATCH 2/2] Remove as_numerical for categorical --- python/cudf/cudf/core/column/categorical.py | 27 +++++++-------------- python/cudf/cudf/core/dataframe.py | 8 +++--- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..68bbfa6c375 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -754,7 +754,7 @@ def __contains__(self, item: ScalarLike) -> bool: self._encode(item) except ValueError: return False - return self._encode(item) in self.as_numerical + return self._encode(item) in self.codes def set_base_data(self, value): if value is not None: @@ -799,15 +799,6 @@ def children(self) -> Tuple[NumericalColumn]: self._children = (codes_column,) return self._children - @property - def as_numerical(self) -> NumericalColumn: - return cast( - cudf.core.column.NumericalColumn, - column.build_column( - data=self.codes.data, dtype=self.codes.dtype, mask=self.mask - ), - ) - @property def categories(self) -> ColumnBase: return self.dtype.categories._values @@ -925,7 +916,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: "The only binary operations supported by unordered " "categorical columns are equality and inequality." ) - return self.as_numerical._binaryop(other.as_numerical, op) + return self.codes._binaryop(other.codes, op) def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if isinstance(other, column.ColumnBase): @@ -950,7 +941,7 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: def sort_values( self, ascending: bool = True, na_position="last" ) -> CategoricalColumn: - codes = self.as_numerical.sort_values(ascending, na_position) + codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -961,7 +952,7 @@ def sort_values( return col def element_indexing(self, index: int) -> ScalarLike: - val = self.as_numerical.element_indexing(index) + val = self.codes.element_indexing(index) return self._decode(int(val)) if val is not None else val @property @@ -1053,7 +1044,7 @@ def data_array_view( return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: - codes = self.as_numerical.unique() + codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -1281,15 +1272,15 @@ def fillna( def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - return self.as_numerical.indices_of(self._encode(value)) + return self.codes.indices_of(self._encode(value)) @property def is_monotonic_increasing(self) -> bool: - return bool(self.ordered) and self.as_numerical.is_monotonic_increasing + return bool(self.ordered) and self.codes.is_monotonic_increasing @property def is_monotonic_decreasing(self) -> bool: - return bool(self.ordered) and self.as_numerical.is_monotonic_decreasing + return bool(self.ordered) and self.codes.is_monotonic_decreasing def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 01935fec8c3..8dd81c92994 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -2379,15 +2379,13 @@ def scatter_by_map( # Convert string or categorical to integer if isinstance(map_index, cudf.core.column.StringColumn): - map_index = map_index.as_categorical_column( - "category" - ).as_numerical + map_index = map_index.as_categorical_column("category").codes warnings.warn( "Using StringColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." ) elif isinstance(map_index, cudf.core.column.CategoricalColumn): - map_index = map_index.as_numerical + map_index = map_index.codes warnings.warn( "Using CategoricalColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance."