From f11c8f165fabe5f06460d0bd6cdfa6a59e1ff738 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jan 2024 18:03:31 -0800 Subject: [PATCH 1/5] Clean up base column methods --- .../cudf/benchmarks/internal/bench_column.py | 5 ++- python/cudf/cudf/core/column/categorical.py | 10 +++--- python/cudf/cudf/core/column/column.py | 31 +++++-------------- python/cudf/cudf/core/column/interval.py | 7 ++--- python/cudf/cudf/core/column/numerical.py | 7 +---- python/cudf/cudf/io/dlpack.py | 4 +-- 6 files changed, 21 insertions(+), 43 deletions(-) diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index d4969b39f7f..3ed42ed59dc 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of Column methods.""" @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -@pytest.mark.parametrize("dropnan", [True, False]) def bench_dropna(benchmark, column, dropnan): - benchmark(column.dropna, drop_nan=dropnan) + benchmark(column.dropna) @benchmark_with_object(cls="column", dtype="float") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..60895391ff4 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1000,11 +1000,13 @@ def to_pandas( # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna - categories = col.categories.to_pandas() + categories = col.categories + elif isinstance(col.categories, NumericalColumn): + categories = col.categories.nans_to_nulls().dropna() else: - categories = col.categories.dropna(drop_nan=True).to_pandas() + categories = col.categories.dropna() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=col.ordered + codes, categories=categories.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..7a1718ba593 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -12,6 +12,7 @@ Any, Dict, List, + Literal, MutableSequence, Optional, Sequence, @@ -107,16 +108,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } - def as_frame(self) -> "cudf.core.frame.Frame": - """ - Converts a Column to Frame - """ - return cudf.core.single_column_frame.SingleColumnFrame( - {None: self.copy(deep=False)} - ) - def data_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object @@ -153,7 +146,7 @@ def data_array_view( return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array @@ -289,8 +282,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self, drop_nan: bool = False) -> ColumnBase: - # The drop_nan argument is only used for numerical columns. + def dropna(self) -> ColumnBase: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -440,14 +432,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self) -> Self: - """ - A method to create deep copy irrespective of whether - `copy-on-write` is enabled. - """ - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -467,7 +451,8 @@ def copy(self, deep: bool = True) -> Self: them. """ if deep: - return self.force_deep_copy() + result = libcudf.copying.copy_column(self) + return result._with_type_metadata(self.dtype) else: return cast( Self, @@ -1067,7 +1052,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # columns include null index in factorization; remove: if self.has_nulls(): - cats = cats.dropna(drop_nan=False) + cats = cats.dropna() min_type = min_unsigned_type(len(cats), 8) if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index eed7bba3628..0d9222ae8d9 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from typing import Optional import pandas as pd @@ -142,7 +142,4 @@ def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self._closed) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f40886bf153..be65dd8766b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.stream_compaction import drop_nulls from cudf._lib.types import size_type_dtype from cudf._typing import ( ColumnBinaryOperand, @@ -420,10 +419,6 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count - def dropna(self, drop_nan: bool = False) -> NumericalColumn: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] - @property def contains_na_entries(self) -> bool: return (self.nan_count != 0) or (self.null_count != 0) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e1950c9f250..bed376e4a79 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf_obj.as_frame() + gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " From 122337d1604e455f9cd2bd5192a5382c3762db91 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:15:56 -0800 Subject: [PATCH 2/5] Fix import --- python/cudf/cudf/core/column/categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 60895391ff4..2ec124f8c85 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -18,7 +18,7 @@ from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer -from cudf.core.column import column +from cudf.core.column import NumericalColumn, column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( @@ -32,7 +32,6 @@ from cudf.core.column import ( ColumnBase, DatetimeColumn, - NumericalColumn, StringColumn, TimeDeltaColumn, ) From 9f764aad928c2b46b24ee07fed5c9dbecbab283f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:37:41 -0800 Subject: [PATCH 3/5] Check dtype --- python/cudf/cudf/core/column/categorical.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 2ec124f8c85..ea2e8b15bdc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -18,7 +18,7 @@ from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer -from cudf.core.column import NumericalColumn, column +from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( @@ -32,6 +32,7 @@ from cudf.core.column import ( ColumnBase, DatetimeColumn, + NumericalColumn, StringColumn, TimeDeltaColumn, ) @@ -995,17 +996,18 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if isinstance(col.categories.dtype, IntervalDtype): + cats = col.categories + if isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna - categories = col.categories - elif isinstance(col.categories, NumericalColumn): - categories = col.categories.nans_to_nulls().dropna() + pass + elif cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] else: - categories = col.categories.dropna() + cats = cats.dropna() data = pd.Categorical.from_codes( - codes, categories=categories.to_pandas(), ordered=col.ordered + codes, categories=cats.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) From d5fe27222b52903da16aea441176ec5aeaf33534 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 17:30:47 -0800 Subject: [PATCH 4/5] Fix bench_dropna --- python/cudf/benchmarks/internal/bench_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index 3ed42ed59dc..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -18,7 +18,7 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -def bench_dropna(benchmark, column, dropnan): +def bench_dropna(benchmark, column): benchmark(column.dropna) From 3c73efe81ce0790a24fe0d2b9d486888b0ea1304 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jan 2024 12:50:19 -0800 Subject: [PATCH 5/5] Remove extra branch --- python/cudf/cudf/core/column/categorical.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 78c0f353f5c..213f57aa198 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -988,14 +988,12 @@ def to_pandas( .values_host ) cats = col.categories - if isinstance(cats.dtype, IntervalDtype): + if cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] + elif not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna - pass - elif cats.dtype.kind in "biuf": - cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] - else: cats = cats.dropna() data = pd.Categorical.from_codes( codes, categories=cats.to_pandas(), ordered=col.ordered