From a5923d7a0e01961dc65e09c038a94d0d26938f51 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:17:43 -0800
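Subject: [PATCH 1/2] Remove cudf._lib.interop in favor of inlining pylibcudf

Delete the Cython module python/cudf/cudf/_lib/interop.pyx and call
pylibcudf.interop directly from its former callers: to_arrow and
from_arrow in core/column/column.py, from_arrow in core/frame.py, and
from_dlpack and to_dlpack in io/dlpack.py.

Drop interop from the _lib CMake sources (including its DLPack include
directory and nanoarrow link) and from the _lib package imports. Add
return annotations to the to_arrow methods touched along the way.
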
---
 python/cudf/cudf/_lib/CMakeLists.txt     |  22 +----
 python/cudf/cudf/_lib/__init__.py        |   1 -
 python/cudf/cudf/_lib/interop.pyx        | 111 -----------------------
 python/cudf/cudf/core/column/column.py   |  47 ++++------
 python/cudf/cudf/core/column/datetime.py |   2 +-
 python/cudf/cudf/core/column/decimal.py  |  10 +-
 python/cudf/cudf/core/column/lists.py    |   4 +-
 python/cudf/cudf/core/frame.py           |  15 +--
 python/cudf/cudf/io/dlpack.py            |  27 +++---
 9 files changed, 53 insertions(+), 186 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/interop.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index e98cf283bbb..44275bdc56e 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -13,21 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    column.pyx
-    copying.pyx
-    csv.pyx
-    groupby.pyx
-    interop.pyx
-    parquet.pyx
-    reduce.pyx
-    scalar.pyx
-    sort.pyx
-    stream_compaction.pyx
-    string_casting.pyx
-    strings_udf.pyx
-    transform.pyx
-    types.pyx
-    utils.pyx
+    column.pyx copying.pyx csv.pyx groupby.pyx parquet.pyx reduce.pyx scalar.pyx sort.pyx
+    stream_compaction.pyx string_casting.pyx strings_udf.pyx transform.pyx types.pyx utils.pyx
 )
 set(linked_libraries cudf::cudf)
 
@@ -38,11 +25,6 @@ rapids_cython_create_modules(
 )
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
-target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
-
-include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
-target_link_libraries(interop PUBLIC nanoarrow)
 
 add_subdirectory(io)
 add_subdirectory(nvtext)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 4758a933898..7f36636a674 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -5,7 +5,6 @@
     copying,
     csv,
     groupby,
-    interop,
     nvtext,
     parquet,
     reduce,
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
deleted file mode 100644
index 1c9d3a01b80..00000000000
--- a/python/cudf/cudf/_lib/interop.pyx
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf
-
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import ListDtype, StructDtype
-
-
-def from_dlpack(object dlpack_capsule):
-    """
-    Converts a DLPack Tensor PyCapsule into a list of columns.
-
-    DLPack Tensor PyCapsule is expected to have the name "dltensor".
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_dlpack(dlpack_capsule)
-    )
-
-
-def to_dlpack(list source_columns):
-    """
-    Converts a list of columns into a DLPack Tensor PyCapsule.
-
-    DLPack Tensor PyCapsule will have the name "dltensor".
-    """
-    return pylibcudf.interop.to_dlpack(
-        pylibcudf.Table(
-            [col.to_pylibcudf(mode="read") for col in source_columns]
-        )
-    )
-
-
-def gather_metadata(object cols_dtypes):
-    """
-    Generates a ColumnMetadata vector for each column.
-
-    Parameters
-    ----------
-    cols_dtypes : iterable
-        An iterable of ``(column_name, dtype)`` pairs.
-    """
-    cpp_metadata = []
-    if cols_dtypes is not None:
-        for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
-            cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
-            if isinstance(col_dtype, (ListDtype, StructDtype)):
-                _set_col_children_metadata(col_dtype, cpp_metadata[idx])
-    else:
-        raise TypeError(
-            "An iterable of (column_name, dtype) pairs is required to "
-            "construct column_metadata"
-        )
-    return cpp_metadata
-
-
-def _set_col_children_metadata(dtype, col_meta):
-    if isinstance(dtype, StructDtype):
-        for name, value in dtype.fields.items():
-            element_metadata = pylibcudf.interop.ColumnMetadata(name)
-            _set_col_children_metadata(value, element_metadata)
-            col_meta.children_meta.append(element_metadata)
-    elif isinstance(dtype, ListDtype):
-        # Offsets - child 0
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-        # Element column - child 1
-        element_metadata = pylibcudf.interop.ColumnMetadata()
-        _set_col_children_metadata(dtype.element_type, element_metadata)
-        col_meta.children_meta.append(element_metadata)
-    else:
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-
-@acquire_spill_lock()
-def to_arrow(list source_columns, object column_dtypes):
-    """Convert a list of columns from
-    cudf Frame to a PyArrow Table.
-
-    Parameters
-    ----------
-    source_columns : a list of columns to convert
-    column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs
-
-    Returns
-    -------
-    pyarrow table
-    """
-    cpp_metadata = gather_metadata(column_dtypes)
-    return pylibcudf.interop.to_arrow(
-        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
-        cpp_metadata,
-    )
-
-
-@acquire_spill_lock()
-def from_arrow(object input_table):
-    """Convert from PyArrow Table to a list of columns.
-
-    Parameters
-    ----------
-    input_table : PyArrow table
-
-    Returns
-    -------
-    A list of columns to construct Frame object
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_arrow(input_table)
-    )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 1ddc79e8970..13b4871f6a6 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -283,6 +283,7 @@ def dropna(self) -> Self:
         else:
             return self.copy()
 
+    @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
@@ -299,9 +300,7 @@ def to_arrow(self) -> pa.Array:
           4
         ]
         """
-        return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
-            "None"
-        ].chunk(0)
+        return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0)
 
     @classmethod
     def from_arrow(cls, array: pa.Array) -> ColumnBase:
@@ -334,30 +333,20 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
 
-        data = pa.table([array], [None])
+        if isinstance(array, pa.ChunkedArray):
+            array = array.combine_chunks()
 
         if isinstance(array.type, pa.DictionaryType):
-            indices_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.indices for chunk in data["None"].chunks],
-                        type=array.type.index_type,
-                    )
-                }
-            )
-            dictionaries_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.dictionary for chunk in data["None"].chunks],
-                        type=array.type.value_type,
-                    )
-                }
-            )
-
-            codes = libcudf.interop.from_arrow(indices_table)[0]
-            categories = libcudf.interop.from_arrow(dictionaries_table)[0]
+            with acquire_spill_lock():
+                codes = cls.from_pylibcudf(
+                    plc.interop.from_arrow(array.indices)
+                )
+                categories = cls.from_pylibcudf(
+                    plc.interop.from_arrow(array.dictionary)
+                )
             codes = cudf.core.column.categorical.as_unsigned_codes(
-                len(categories), codes
+                len(categories),
+                codes,  # type: ignore[arg-type]
             )
             return cudf.core.column.CategoricalColumn(
                 data=None,
@@ -368,10 +357,12 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 mask=codes.base_mask,
                 children=(codes,),
             )
-
-        result = libcudf.interop.from_arrow(data)[0]
-
-        return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
+        else:
+            result = cls.from_pylibcudf(plc.interop.from_arrow(array))
+            # TODO: cudf_dtype_from_pa_type may be less necessary for some types
+            return result._with_type_metadata(
+                cudf_dtype_from_pa_type(array.type)
+            )
 
     def _get_mask_as_column(self) -> ColumnBase:
         return libcudf.transform.mask_to_bools(
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b526a6efa51..601d4a6c2f7 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -1016,7 +1016,7 @@ def to_pandas(
                 self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
             )
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return pa.compute.assume_timezone(
             self._local_time.to_arrow(), str(self.dtype.tz)
         )
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 2c22724d3d7..90737f229c6 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -264,8 +264,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")
+    def to_arrow(self) -> pa.Array:
+        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
 
         # use striding to set the first 32 bits of each 128-bit chunk:
@@ -332,7 +332,7 @@ def from_arrow(cls, data: pa.Array):
         result.dtype.precision = data.type.precision
         return result
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return super().to_arrow().cast(self.dtype.to_arrow())
 
     def _with_type_metadata(
@@ -391,8 +391,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")
+    def to_arrow(self) -> pa.Array:
+        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
 
         # use striding to set the first 64 bits of each 128-bit chunk:
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index ea384888388..704290639c0 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -151,7 +151,7 @@ def offsets(self) -> NumericalColumn:
         """
         return cast(NumericalColumn, self.children[0])
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         offsets = self.offsets.to_arrow()
         elements = (
             pa.nulls(len(self.elements))
@@ -161,7 +161,7 @@ def to_arrow(self):
         pa_type = pa.list_(elements.type)
 
         if self.nullable:
-            nbuf = pa.py_buffer(self.mask.memoryview())
+            nbuf = pa.py_buffer(self.mask.memoryview())  # type: ignore[union-attr]
             buffers = (nbuf, offsets.buffers()[1])
         else:
             buffers = offsets.buffers()
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 0a7e6fefe6e..91b49af7ee9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -902,16 +902,17 @@ def from_arrow(cls, data: pa.Table) -> Self:
         if len(dict_indices):
             dict_indices_table = pa.table(dict_indices)
             data = data.drop(dict_indices_table.column_names)
-            indices_columns = libcudf.interop.from_arrow(dict_indices_table)
+            plc_indices = plc.interop.from_arrow(dict_indices_table)
             # as dictionary size can vary, it can't be a single table
             cudf_dictionaries_columns = {
                 name: ColumnBase.from_arrow(dict_dictionaries[name])
                 for name in dict_dictionaries.keys()
             }
 
-            for name, codes in zip(
-                dict_indices_table.column_names, indices_columns
+            for name, plc_codes in zip(
+                dict_indices_table.column_names, plc_indices.columns()
             ):
+                codes = libcudf.column.Column.from_pylibcudf(plc_codes)
                 categories = cudf_dictionaries_columns[name]
                 codes = as_unsigned_codes(len(categories), codes)
                 cudf_category_frame[name] = CategoricalColumn(
@@ -927,9 +928,9 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         # Handle non-dict arrays
         cudf_non_category_frame = {
-            name: col
-            for name, col in zip(
-                data.column_names, libcudf.interop.from_arrow(data)
+            name: libcudf.column.Column.from_pylibcudf(plc_col)
+            for name, plc_col in zip(
+                data.column_names, plc.interop.from_arrow(data).columns()
             )
         }
 
@@ -988,7 +989,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
         return cls._from_data({name: result[name] for name in column_names})
 
     @_performance_tracking
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Table:
         """
         Convert to arrow Table
 
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index fe8e446f9c0..3b3fd5f7c56 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -1,13 +1,14 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import interop as libdlpack
 from cudf.core.column import ColumnBase
 from cudf.utils import ioutils
 
 
-def from_dlpack(pycapsule_obj):
+def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame:
     """Converts from a DLPack tensor to a cuDF object.
 
     DLPack is an open-source memory tensor structure:
@@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj):
     cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
     tensor is row-major, transpose it before passing it to this function.
     """
+    plc_table = plc.interop.from_dlpack(pycapsule_obj)
+    data = dict(
+        enumerate(
+            (ColumnBase.from_pylibcudf(col) for col in plc_table.columns())
+        )
+    )
 
-    columns = libdlpack.from_dlpack(pycapsule_obj)
-    data = dict(enumerate(columns))
-
-    if len(columns) == 1:
+    if len(data) == 1:
         return cudf.Series._from_data(data)
     else:
         return cudf.DataFrame._from_data(data)
 
 
 @ioutils.doc_to_dlpack()
-def to_dlpack(cudf_obj):
+def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex):
     """Converts a cuDF object to a DLPack tensor.
 
     DLPack is an open-source memory tensor structure:
@@ -80,13 +84,14 @@ def to_dlpack(cudf_obj):
 
     if any(
         not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
-        for _, dtype in gdf._dtypes
+        for _, dtype in gdf._dtypes  # type: ignore[union-attr]
     ):
         raise TypeError("non-numeric data not yet supported")
 
     dtype = cudf.utils.dtypes.find_common_type(
-        [dtype for _, dtype in gdf._dtypes]
+        [dtype for _, dtype in gdf._dtypes]  # type: ignore[union-attr]
     )
     gdf = gdf.astype(dtype)
-
-    return libdlpack.to_dlpack([*gdf._columns])
+    return plc.interop.to_dlpack(
+        plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns])
+    )

From f7062442304908809e63fdbec14362206efb10f1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 12 Dec 2024 19:43:56 -0800
Subject: [PATCH 2/2] Go back to using pyarrow table

---
 python/cudf/cudf/core/column/column.py | 29 +++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index ea69c0f798c..daffcb9fd7e 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -331,16 +331,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
 
-        if isinstance(array, pa.ChunkedArray):
-            array = array.combine_chunks()
+        data = pa.table([array], [None])
 
         if isinstance(array.type, pa.DictionaryType):
+            indices_table = pa.table(
+                [
+                    pa.chunked_array(
+                        [chunk.indices for chunk in data.column(0).chunks],
+                        type=array.type.index_type,
+                    )
+                ],
+                [None],
+            )
+            dictionaries_table = pa.table(
+                [
+                    pa.chunked_array(
+                        [chunk.dictionary for chunk in data.column(0).chunks],
+                        type=array.type.value_type,
+                    )
+                ],
+                [None],
+            )
             with acquire_spill_lock():
                 codes = cls.from_pylibcudf(
-                    plc.interop.from_arrow(array.indices)
+                    plc.interop.from_arrow(indices_table).columns()[0]
                 )
                 categories = cls.from_pylibcudf(
-                    plc.interop.from_arrow(array.dictionary)
+                    plc.interop.from_arrow(dictionaries_table).columns()[0]
                 )
             codes = cudf.core.column.categorical.as_unsigned_codes(
                 len(categories),
@@ -356,7 +373,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 children=(codes,),
             )
         else:
-            result = cls.from_pylibcudf(plc.interop.from_arrow(array))
+            result = cls.from_pylibcudf(
+                plc.interop.from_arrow(data).columns()[0]
+            )
             # TODO: cudf_dtype_from_pa_type may be less necessary for some types
             return result._with_type_metadata(
                 cudf_dtype_from_pa_type(array.type)