Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up base column methods #14725

Merged
11 commits merged on
Jan 12, 2024
7 changes: 3 additions & 4 deletions python/cudf/benchmarks/internal/bench_column.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

"""Benchmarks of Column methods."""

Expand All @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column):


@benchmark_with_object(cls="column", dtype="float")
@pytest.mark.parametrize("dropnan", [True, False])
def bench_dropna(benchmark, column, dropnan):
benchmark(column.dropna, drop_nan=dropnan)
def bench_dropna(benchmark, column):
benchmark(column.dropna)


@benchmark_with_object(cls="column", dtype="float")
Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,15 +996,18 @@ def to_pandas(
.fillna(_DEFAULT_CATEGORICAL_VALUE)
.values_host
)
if isinstance(col.categories.dtype, IntervalDtype):
cats = col.categories
if isinstance(cats.dtype, IntervalDtype):
# leaving out dropna because it temporarily changes an interval
# index into a struct and throws off results.
# TODO: work on interval index dropna
categories = col.categories.to_pandas()
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch can be removed.

elif cats.dtype.kind in "biuf":
cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined]
else:
categories = col.categories.dropna(drop_nan=True).to_pandas()
cats = cats.dropna()
data = pd.Categorical.from_codes(
codes, categories=categories, ordered=col.ordered
codes, categories=cats.to_pandas(), ordered=col.ordered
)
return pd.Series(data, index=index)

Expand Down
29 changes: 7 additions & 22 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Any,
Dict,
List,
Literal,
MutableSequence,
Optional,
Sequence,
Expand Down Expand Up @@ -107,16 +108,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
"min",
}

def as_frame(self) -> "cudf.core.frame.Frame":
"""
Converts a Column to Frame
"""
return cudf.core.single_column_frame.SingleColumnFrame(
{None: self.copy(deep=False)}
)

def data_array_view(
self, *, mode="write"
self, *, mode: Literal["write", "read"] = "write"
) -> "cuda.devicearray.DeviceNDArray":
"""
View the data as a device array object
Expand Down Expand Up @@ -153,7 +146,7 @@ def data_array_view(
return cuda.as_cuda_array(obj).view(self.dtype)

def mask_array_view(
self, *, mode="write"
self, *, mode: Literal["write", "read"] = "write"
) -> "cuda.devicearray.DeviceNDArray":
"""
View the mask as a device array
Expand Down Expand Up @@ -289,8 +282,7 @@ def any(self, skipna: bool = True) -> bool:

return libcudf.reduce.reduce("any", self, dtype=np.bool_)

def dropna(self, drop_nan: bool = False) -> ColumnBase:
# The drop_nan argument is only used for numerical columns.
def dropna(self) -> ColumnBase:
return drop_nulls([self])[0]._with_type_metadata(self.dtype)

def to_arrow(self) -> pa.Array:
Expand Down Expand Up @@ -440,14 +432,6 @@ def nullmask(self) -> Buffer:
raise ValueError("Column has no null mask")
return self.mask_array_view(mode="read")

def force_deep_copy(self) -> Self:
"""
A method to create deep copy irrespective of whether
`copy-on-write` is enabled.
"""
result = libcudf.copying.copy_column(self)
return result._with_type_metadata(self.dtype)

def copy(self, deep: bool = True) -> Self:
"""
Makes a copy of the Column.
Expand All @@ -467,7 +451,8 @@ def copy(self, deep: bool = True) -> Self:
them.
"""
if deep:
return self.force_deep_copy()
result = libcudf.copying.copy_column(self)
return result._with_type_metadata(self.dtype)
else:
return cast(
Self,
Expand Down Expand Up @@ -1066,7 +1051,7 @@ def as_categorical_column(self, dtype) -> ColumnBase:
)
# columns include null index in factorization; remove:
if self.has_nulls():
cats = cats.dropna(drop_nan=False)
cats = cats.dropna()
min_type = min_unsigned_type(len(cats), 8)
if cudf.dtype(min_type).itemsize < labels.dtype.itemsize:
labels = labels.astype(min_type)
Expand Down
7 changes: 2 additions & 5 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
from typing import Optional

import pandas as pd
Expand Down Expand Up @@ -142,7 +142,4 @@ def element_indexing(self, index: int):
result = super().element_indexing(index)
if cudf.get_option("mode.pandas_compatible"):
return pd.Interval(**result, closed=self._closed)
return {
field: value
for field, value in zip(self.dtype.fields, result.values())
}
return result
5 changes: 0 additions & 5 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import cudf
from cudf import _lib as libcudf
from cudf._lib.stream_compaction import drop_nulls
from cudf._lib.types import size_type_dtype
from cudf._typing import (
ColumnBinaryOperand,
Expand Down Expand Up @@ -421,10 +420,6 @@ def nan_count(self) -> int:
self._nan_count = nan_col.sum()
return self._nan_count

def dropna(self, drop_nan: bool = False) -> NumericalColumn:
col = self.nans_to_nulls() if drop_nan else self
return drop_nulls([col])[0]

@property
def contains_na_entries(self) -> bool:
return (self.nan_count != 0) or (self.null_count != 0)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/io/dlpack.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.


import cudf
Expand Down Expand Up @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj):
if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)):
gdf = cudf_obj
elif isinstance(cudf_obj, ColumnBase):
gdf = cudf_obj.as_frame()
gdf = cudf.Series._from_data({None: cudf_obj})
else:
raise TypeError(
f"Input of type {type(cudf_obj)} cannot be converted "
Expand Down