From f11c8f165fabe5f06460d0bd6cdfa6a59e1ff738 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Jan 2024 18:03:31 -0800
Subject: [PATCH 1/5] Clean up base column methods

---
 .../cudf/benchmarks/internal/bench_column.py  |  5 ++-
 python/cudf/cudf/core/column/categorical.py   | 10 +++---
 python/cudf/cudf/core/column/column.py        | 31 +++++--------------
 python/cudf/cudf/core/column/interval.py      |  7 ++---
 python/cudf/cudf/core/column/numerical.py     |  7 +----
 python/cudf/cudf/io/dlpack.py                 |  4 +--
 6 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py
index d4969b39f7f..3ed42ed59dc 100644
--- a/python/cudf/benchmarks/internal/bench_column.py
+++ b/python/cudf/benchmarks/internal/bench_column.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 """Benchmarks of Column methods."""
 
@@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column):
 
 
 @benchmark_with_object(cls="column", dtype="float")
-@pytest.mark.parametrize("dropnan", [True, False])
 def bench_dropna(benchmark, column, dropnan):
-    benchmark(column.dropna, drop_nan=dropnan)
+    benchmark(column.dropna)
 
 
 @benchmark_with_object(cls="column", dtype="float")
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 7036a9ee870..60895391ff4 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -1000,11 +1000,13 @@ def to_pandas(
             # leaving out dropna because it temporarily changes an interval
             # index into a struct and throws off results.
             # TODO: work on interval index dropna
-            categories = col.categories.to_pandas()
+            categories = col.categories
+        elif isinstance(col.categories, NumericalColumn):
+            categories = col.categories.nans_to_nulls().dropna()
         else:
-            categories = col.categories.dropna(drop_nan=True).to_pandas()
+            categories = col.categories.dropna()
         data = pd.Categorical.from_codes(
-            codes, categories=categories, ordered=col.ordered
+            codes, categories=categories.to_pandas(), ordered=col.ordered
         )
         return pd.Series(data, index=index)
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 296fd6a41b0..7a1718ba593 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -12,6 +12,7 @@
     Any,
     Dict,
     List,
+    Literal,
     MutableSequence,
     Optional,
     Sequence,
@@ -107,16 +108,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
         "min",
     }
 
-    def as_frame(self) -> "cudf.core.frame.Frame":
-        """
-        Converts a Column to Frame
-        """
-        return cudf.core.single_column_frame.SingleColumnFrame(
-            {None: self.copy(deep=False)}
-        )
-
     def data_array_view(
-        self, *, mode="write"
+        self, *, mode: Literal["write", "read"] = "write"
     ) -> "cuda.devicearray.DeviceNDArray":
         """
         View the data as a device array object
@@ -153,7 +146,7 @@ def data_array_view(
         return cuda.as_cuda_array(obj).view(self.dtype)
 
     def mask_array_view(
-        self, *, mode="write"
+        self, *, mode: Literal["write", "read"] = "write"
     ) -> "cuda.devicearray.DeviceNDArray":
         """
         View the mask as a device array
@@ -289,8 +282,7 @@ def any(self, skipna: bool = True) -> bool:
 
         return libcudf.reduce.reduce("any", self, dtype=np.bool_)
 
-    def dropna(self, drop_nan: bool = False) -> ColumnBase:
-        # The drop_nan argument is only used for numerical columns.
+    def dropna(self) -> ColumnBase:
         return drop_nulls([self])[0]._with_type_metadata(self.dtype)
 
     def to_arrow(self) -> pa.Array:
@@ -440,14 +432,6 @@ def nullmask(self) -> Buffer:
             raise ValueError("Column has no null mask")
         return self.mask_array_view(mode="read")
 
-    def force_deep_copy(self) -> Self:
-        """
-        A method to create deep copy irrespective of whether
-        `copy-on-write` is enabled.
-        """
-        result = libcudf.copying.copy_column(self)
-        return result._with_type_metadata(self.dtype)
-
     def copy(self, deep: bool = True) -> Self:
         """
         Makes a copy of the Column.
@@ -467,7 +451,8 @@ def copy(self, deep: bool = True) -> Self:
             them.
         """
         if deep:
-            return self.force_deep_copy()
+            result = libcudf.copying.copy_column(self)
+            return result._with_type_metadata(self.dtype)
         else:
             return cast(
                 Self,
@@ -1067,7 +1052,7 @@ def as_categorical_column(self, dtype) -> ColumnBase:
         )
         # columns include null index in factorization; remove:
         if self.has_nulls():
-            cats = cats.dropna(drop_nan=False)
+            cats = cats.dropna()
             min_type = min_unsigned_type(len(cats), 8)
             if cudf.dtype(min_type).itemsize < labels.dtype.itemsize:
                 labels = labels.astype(min_type)
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index eed7bba3628..0d9222ae8d9 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 from typing import Optional
 
 import pandas as pd
@@ -142,7 +142,4 @@ def element_indexing(self, index: int):
         result = super().element_indexing(index)
         if cudf.get_option("mode.pandas_compatible"):
             return pd.Interval(**result, closed=self._closed)
-        return {
-            field: value
-            for field, value in zip(self.dtype.fields, result.values())
-        }
+        return result
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f40886bf153..be65dd8766b 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -19,7 +19,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.stream_compaction import drop_nulls
 from cudf._lib.types import size_type_dtype
 from cudf._typing import (
     ColumnBinaryOperand,
@@ -420,10 +419,6 @@ def nan_count(self) -> int:
             self._nan_count = nan_col.sum()
         return self._nan_count
 
-    def dropna(self, drop_nan: bool = False) -> NumericalColumn:
-        col = self.nans_to_nulls() if drop_nan else self
-        return drop_nulls([col])[0]
-
     @property
     def contains_na_entries(self) -> bool:
         return (self.nan_count != 0) or (self.null_count != 0)
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index e1950c9f250..bed376e4a79 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 
 import cudf
@@ -71,7 +71,7 @@ def to_dlpack(cudf_obj):
     if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)):
         gdf = cudf_obj
     elif isinstance(cudf_obj, ColumnBase):
-        gdf = cudf_obj.as_frame()
+        gdf = cudf.Series._from_data({None: cudf_obj})
     else:
         raise TypeError(
             f"Input of type {type(cudf_obj)} cannot be converted "

From 122337d1604e455f9cd2bd5192a5382c3762db91 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Jan 2024 10:15:56 -0800
Subject: [PATCH 2/5] Fix import

---
 python/cudf/cudf/core/column/categorical.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 60895391ff4..2ec124f8c85 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -18,7 +18,7 @@
 from cudf._lib.transform import bools_to_mask
 from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
-from cudf.core.column import column
+from cudf.core.column import NumericalColumn, column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
 from cudf.utils.dtypes import (
@@ -32,7 +32,6 @@
     from cudf.core.column import (
         ColumnBase,
         DatetimeColumn,
-        NumericalColumn,
         StringColumn,
         TimeDeltaColumn,
     )

From 9f764aad928c2b46b24ee07fed5c9dbecbab283f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Jan 2024 14:37:41 -0800
Subject: [PATCH 3/5] Check dtype

---
 python/cudf/cudf/core/column/categorical.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 2ec124f8c85..ea2e8b15bdc 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -18,7 +18,7 @@
 from cudf._lib.transform import bools_to_mask
 from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
 from cudf.core.buffer import Buffer
-from cudf.core.column import NumericalColumn, column
+from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
 from cudf.utils.dtypes import (
@@ -32,6 +32,7 @@
     from cudf.core.column import (
         ColumnBase,
         DatetimeColumn,
+        NumericalColumn,
         StringColumn,
         TimeDeltaColumn,
     )
@@ -995,17 +996,18 @@ def to_pandas(
             .fillna(_DEFAULT_CATEGORICAL_VALUE)
             .values_host
         )
-        if isinstance(col.categories.dtype, IntervalDtype):
+        cats = col.categories
+        if isinstance(cats.dtype, IntervalDtype):
             # leaving out dropna because it temporarily changes an interval
             # index into a struct and throws off results.
             # TODO: work on interval index dropna
-            categories = col.categories
-        elif isinstance(col.categories, NumericalColumn):
-            categories = col.categories.nans_to_nulls().dropna()
+            pass
+        elif cats.dtype.kind in "biuf":
+            cats = cats.nans_to_nulls().dropna()  # type: ignore[attr-defined]
         else:
-            categories = col.categories.dropna()
+            cats = cats.dropna()
         data = pd.Categorical.from_codes(
-            codes, categories=categories.to_pandas(), ordered=col.ordered
+            codes, categories=cats.to_pandas(), ordered=col.ordered
         )
         return pd.Series(data, index=index)
 

From d5fe27222b52903da16aea441176ec5aeaf33534 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Jan 2024 17:30:47 -0800
Subject: [PATCH 4/5] Fix bench_dropna

---
 python/cudf/benchmarks/internal/bench_column.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py
index 3ed42ed59dc..8da769b7858 100644
--- a/python/cudf/benchmarks/internal/bench_column.py
+++ b/python/cudf/benchmarks/internal/bench_column.py
@@ -18,7 +18,7 @@ def bench_apply_boolean_mask(benchmark, column):
 
 
 @benchmark_with_object(cls="column", dtype="float")
-def bench_dropna(benchmark, column, dropnan):
+def bench_dropna(benchmark, column):
     benchmark(column.dropna)
 
 

From 3c73efe81ce0790a24fe0d2b9d486888b0ea1304 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 10 Jan 2024 12:50:19 -0800
Subject: [PATCH 5/5] Remove extra branch

---
 python/cudf/cudf/core/column/categorical.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 78c0f353f5c..213f57aa198 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -988,14 +988,12 @@ def to_pandas(
             .values_host
         )
         cats = col.categories
-        if isinstance(cats.dtype, IntervalDtype):
+        if cats.dtype.kind in "biuf":
+            cats = cats.nans_to_nulls().dropna()  # type: ignore[attr-defined]
+        elif not isinstance(cats.dtype, IntervalDtype):
-            # leaving out dropna because it temporarily changes an interval
-            # index into a struct and throws off results.
+            # dropna is skipped for interval categories because it temporarily
+            # changes an interval index into a struct and throws off results.
             # TODO: work on interval index dropna
-            pass
-        elif cats.dtype.kind in "biuf":
-            cats = cats.nans_to_nulls().dropna()  # type: ignore[attr-defined]
-        else:
             cats = cats.dropna()
         data = pd.Categorical.from_codes(
             codes, categories=cats.to_pandas(), ordered=col.ordered