From c0a3cd14eabd18ba8cedd3b7dd87cba8b6706719 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:13:59 -1000 Subject: [PATCH] Clean up base column methods (#14725) * Removed the need for a `drop_nan` argument in `Column.dropna` * Removed the need for `Column.as_frame` * Removed the need for `Column.force_deep_copy` Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14725 --- .../cudf/benchmarks/internal/bench_column.py | 7 ++--- python/cudf/cudf/core/column/categorical.py | 11 ++++---- python/cudf/cudf/core/column/column.py | 28 ++++--------------- python/cudf/cudf/core/column/interval.py | 5 +--- python/cudf/cudf/core/column/numerical.py | 5 ---- python/cudf/cudf/io/dlpack.py | 4 +-- 6 files changed, 18 insertions(+), 42 deletions(-) diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index d4969b39f7f..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of Column methods.""" @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -@pytest.mark.parametrize("dropnan", [True, False]) -def bench_dropna(benchmark, column, dropnan): - benchmark(column.dropna, drop_nan=dropnan) +def bench_dropna(benchmark, column): + benchmark(column.dropna) @benchmark_with_object(cls="column", dtype="float") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71143fa7a95..eb4220c5895 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -987,15 +987,16 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if isinstance(col.categories.dtype, IntervalDtype): + cats = col.categories + if cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] + elif not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna - categories = col.categories.to_pandas() - else: - categories = col.categories.dropna(drop_nan=True).to_pandas() + cats = cats.dropna() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=col.ordered + codes, categories=cats.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81579b53bb7..3cf686da7b0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -109,16 +109,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } - def as_frame(self) -> "cudf.core.frame.Frame": - """ - Converts a Column to Frame - """ - return cudf.core.single_column_frame.SingleColumnFrame( - {None: self.copy(deep=False)} - ) - def data_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object @@ -155,7 +147,7 @@ def data_array_view( return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array @@ -291,8 +283,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self, drop_nan: bool = False) -> ColumnBase: - # The drop_nan argument is only used for numerical columns. + def dropna(self) -> ColumnBase: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -437,14 +428,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self) -> Self: - """ - A method to create deep copy irrespective of whether - `copy-on-write` is enabled. - """ - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -464,7 +447,8 @@ def copy(self, deep: bool = True) -> Self: them. """ if deep: - return self.force_deep_copy() + result = libcudf.copying.copy_column(self) + return result._with_type_metadata(self.dtype) else: return cast( Self, @@ -1069,7 +1053,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # columns include null index in factorization; remove: if self.has_nulls(): - cats = cats.dropna(drop_nan=False) + cats = cats.dropna() min_type = min_unsigned_type(len(cats), 8) if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 81059717b20..6a7e7729123 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -142,7 +142,4 @@ def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self._closed) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5461d1b13b5..0577e0f37ed 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.stream_compaction import drop_nulls from cudf._lib.types import size_type_dtype from cudf._typing import ( ColumnBinaryOperand, @@ -421,10 +420,6 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count - def dropna(self, drop_nan: bool = False) -> NumericalColumn: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] - def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e1950c9f250..bed376e4a79 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf_obj.as_frame() + gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted "