rapidsai · rapids-bot · Dec 17, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024
@@ -13,8 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx
-    sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
+    column.pyx copying.pyx csv.pyx groupby.pyx parquet.pyx reduce.pyx scalar.pyx sort.pyx
+    stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
 )
 set(linked_libraries cudf::cudf)
 
@@ -25,11 +25,6 @@ rapids_cython_create_modules(
 )
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
-target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
-
-include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
-target_link_libraries(interop PUBLIC nanoarrow)
 
 add_subdirectory(io)
 add_subdirectory(nvtext)
@@ -5,7 +5,6 @@
     copying,
     csv,
     groupby,
-    interop,
     nvtext,
     parquet,
     reduce,

@@ -282,6 +282,7 @@ def dropna(self) -> Self:
         else:
             return self.copy()
 
+    @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
@@ -298,9 +299,7 @@ def to_arrow(self) -> pa.Array:
           4
         ]
         """
-        return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
-            "None"
-        ].chunk(0)
+        return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0)
 
     @classmethod
     def from_arrow(cls, array: pa.Array) -> ColumnBase:
@@ -333,30 +332,20 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
 
-        data = pa.table([array], [None])
+        if isinstance(array, pa.ChunkedArray):
+            array = array.combine_chunks()
 
         if isinstance(array.type, pa.DictionaryType):
-            indices_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.indices for chunk in data["None"].chunks],
-                        type=array.type.index_type,
-                    )
-                }
-            )
-            dictionaries_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.dictionary for chunk in data["None"].chunks],
-                        type=array.type.value_type,
-                    )
-                }
-            )
-
-            codes = libcudf.interop.from_arrow(indices_table)[0]
-            categories = libcudf.interop.from_arrow(dictionaries_table)[0]
+            with acquire_spill_lock():
+                codes = cls.from_pylibcudf(
+                    plc.interop.from_arrow(array.indices)
+                )
+                categories = cls.from_pylibcudf(
+                    plc.interop.from_arrow(array.dictionary)
+                )
             codes = cudf.core.column.categorical.as_unsigned_codes(
-                len(categories), codes
+                len(categories),
+                codes,  # type: ignore[arg-type]
             )
             return cudf.core.column.CategoricalColumn(
                 data=None,
@@ -367,10 +356,12 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 mask=codes.base_mask,
                 children=(codes,),
             )
-
-        result = libcudf.interop.from_arrow(data)[0]
-
-        return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
+        else:
+            result = cls.from_pylibcudf(plc.interop.from_arrow(array))
+            # TODO: cudf_dtype_from_pa_type may be less necessary for some types
+            return result._with_type_metadata(
+                cudf_dtype_from_pa_type(array.type)
+            )
 
     @acquire_spill_lock()
     def _get_mask_as_column(self) -> ColumnBase:

@@ -1016,7 +1016,7 @@ def to_pandas(
                 self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
             )
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return pa.compute.assume_timezone(
             self._local_time.to_arrow(), str(self.dtype.tz)
         )

@@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")
+    def to_arrow(self) -> pa.Array:
+        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
 
         # use striding to set the first 32 bits of each 128-bit chunk:
@@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array):
         result.dtype.precision = data.type.precision
         return result
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return super().to_arrow().cast(self.dtype.to_arrow())
 
     def _with_type_metadata(
@@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")
+    def to_arrow(self) -> pa.Array:
+        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
 
         # use striding to set the first 64 bits of each 128-bit chunk:

@@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn:
         """
         return cast(NumericalColumn, self.children[0])
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         offsets = self.offsets.to_arrow()
         elements = (
             pa.nulls(len(self.elements))
@@ -160,7 +160,7 @@ def to_arrow(self):
         pa_type = pa.list_(elements.type)
 
         if self.nullable:
-            nbuf = pa.py_buffer(self.mask.memoryview())
+            nbuf = pa.py_buffer(self.mask.memoryview())  # type: ignore[union-attr]
             buffers = (nbuf, offsets.buffers()[1])
         else:
             buffers = offsets.buffers()

@@ -902,16 +902,17 @@ def from_arrow(cls, data: pa.Table) -> Self:
         if len(dict_indices):
             dict_indices_table = pa.table(dict_indices)
             data = data.drop(dict_indices_table.column_names)
-            indices_columns = libcudf.interop.from_arrow(dict_indices_table)
+            plc_indices = plc.interop.from_arrow(dict_indices_table)
             # as dictionary size can vary, it can't be a single table
             cudf_dictionaries_columns = {
                 name: ColumnBase.from_arrow(dict_dictionaries[name])
                 for name in dict_dictionaries.keys()
             }
 
-            for name, codes in zip(
-                dict_indices_table.column_names, indices_columns
+            for name, plc_codes in zip(
+                dict_indices_table.column_names, plc_indices.columns()
             ):
+                codes = libcudf.column.Column.from_pylibcudf(plc_codes)
                 categories = cudf_dictionaries_columns[name]
                 codes = as_unsigned_codes(len(categories), codes)
                 cudf_category_frame[name] = CategoricalColumn(
@@ -927,9 +928,9 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         # Handle non-dict arrays
         cudf_non_category_frame = {
-            name: col
-            for name, col in zip(
-                data.column_names, libcudf.interop.from_arrow(data)
+            name: libcudf.column.Column.from_pylibcudf(plc_col)
+            for name, plc_col in zip(
+                data.column_names, plc.interop.from_arrow(data).columns()
             )
         }
 
@@ -988,7 +989,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
         return cls._from_data({name: result[name] for name in column_names})
 
     @_performance_tracking
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Table:
         """
         Convert to arrow Table
 

@@ -1,13 +1,14 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import interop as libdlpack
 from cudf.core.column import ColumnBase
 from cudf.utils import ioutils
 
 
-def from_dlpack(pycapsule_obj):
+def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame:
     """Converts from a DLPack tensor to a cuDF object.
 
     DLPack is an open-source memory tensor structure:
@@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj):
     cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
     tensor is row-major, transpose it before passing it to this function.
     """
+    plc_table = plc.interop.from_dlpack(pycapsule_obj)
+    data = dict(
+        enumerate(
+            (ColumnBase.from_pylibcudf(col) for col in plc_table.columns())
+        )
+    )
 
-    columns = libdlpack.from_dlpack(pycapsule_obj)
-    data = dict(enumerate(columns))
-
-    if len(columns) == 1:
+    if len(data) == 1:
         return cudf.Series._from_data(data)
     else:
         return cudf.DataFrame._from_data(data)
 
 
 @ioutils.doc_to_dlpack()
-def to_dlpack(cudf_obj):
+def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex):
     """Converts a cuDF object to a DLPack tensor.
 
     DLPack is an open-source memory tensor structure:
@@ -80,13 +84,14 @@ def to_dlpack(cudf_obj):
 
     if any(
         not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
-        for _, dtype in gdf._dtypes
+        for _, dtype in gdf._dtypes  # type: ignore[union-attr]
     ):
         raise TypeError("non-numeric data not yet supported")
 
     dtype = cudf.utils.dtypes.find_common_type(
-        [dtype for _, dtype in gdf._dtypes]
+        [dtype for _, dtype in gdf._dtypes]  # type: ignore[union-attr]
     )
     gdf = gdf.astype(dtype)
-
-    return libdlpack.to_dlpack([*gdf._columns])
+    return plc.interop.to_dlpack(
+        plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns])
+    )