From a5923d7a0e01961dc65e09c038a94d0d26938f51 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:17:43 -0800 Subject: [PATCH 1/2] Remove cudf._lib.interop in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 22 +---- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/interop.pyx | 111 ----------------------- python/cudf/cudf/core/column/column.py | 47 ++++------ python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/decimal.py | 10 +- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/frame.py | 15 +-- python/cudf/cudf/io/dlpack.py | 27 +++--- 9 files changed, 53 insertions(+), 186 deletions(-) delete mode 100644 python/cudf/cudf/_lib/interop.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e98cf283bbb..44275bdc56e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,21 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx - copying.pyx - csv.pyx - groupby.pyx - interop.pyx - parquet.pyx - reduce.pyx - scalar.pyx - sort.pyx - stream_compaction.pyx - string_casting.pyx - strings_udf.pyx - transform.pyx - types.pyx - utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx parquet.pyx reduce.pyx scalar.pyx sort.pyx + stream_compaction.pyx string_casting.pyx strings_udf.pyx transform.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) @@ -38,11 +25,6 @@ rapids_cython_create_modules( ) target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -target_include_directories(interop PUBLIC "$") - -include(${rapids-cmake-dir}/export/find_package_root.cmake) -include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) -target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 4758a933898..7f36636a674 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, csv, groupby, - interop, nvtext, parquet, reduce, diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx deleted file mode 100644 index 1c9d3a01b80..00000000000 --- a/python/cudf/cudf/_lib/interop.pyx +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ListDtype, StructDtype - - -def from_dlpack(object dlpack_capsule): - """ - Converts a DLPack Tensor PyCapsule into a list of columns. - - DLPack Tensor PyCapsule is expected to have the name "dltensor". - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_dlpack(dlpack_capsule) - ) - - -def to_dlpack(list source_columns): - """ - Converts a list of columns into a DLPack Tensor PyCapsule. - - DLPack Tensor PyCapsule will have the name "dltensor". - """ - return pylibcudf.interop.to_dlpack( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - ) - - -def gather_metadata(object cols_dtypes): - """ - Generates a ColumnMetadata vector for each column. - - Parameters - ---------- - cols_dtypes : iterable - An iterable of ``(column_name, dtype)`` pairs. - """ - cpp_metadata = [] - if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) - if isinstance(col_dtype, (ListDtype, StructDtype)): - _set_col_children_metadata(col_dtype, cpp_metadata[idx]) - else: - raise TypeError( - "An iterable of (column_name, dtype) pairs is required to " - "construct column_metadata" - ) - return cpp_metadata - - -def _set_col_children_metadata(dtype, col_meta): - if isinstance(dtype, StructDtype): - for name, value in dtype.fields.items(): - element_metadata = pylibcudf.interop.ColumnMetadata(name) - _set_col_children_metadata(value, element_metadata) - col_meta.children_meta.append(element_metadata) - elif isinstance(dtype, ListDtype): - # Offsets - child 0 - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - # Element column - child 1 - element_metadata = pylibcudf.interop.ColumnMetadata() - _set_col_children_metadata(dtype.element_type, element_metadata) - col_meta.children_meta.append(element_metadata) - else: - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - -@acquire_spill_lock() -def to_arrow(list source_columns, object column_dtypes): - """Convert a list of columns from - cudf Frame to a PyArrow Table. - - Parameters - ---------- - source_columns : a list of columns to convert - column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs - - Returns - ------- - pyarrow table - """ - cpp_metadata = gather_metadata(column_dtypes) - return pylibcudf.interop.to_arrow( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - cpp_metadata, - ) - - -@acquire_spill_lock() -def from_arrow(object input_table): - """Convert from PyArrow Table to a list of columns. - - Parameters - ---------- - input_table : PyArrow table - - Returns - ------- - A list of columns to construct Frame object - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_arrow(input_table) - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddc79e8970..13b4871f6a6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -283,6 +283,7 @@ def dropna(self) -> Self: else: return self.copy() + @acquire_spill_lock() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -299,9 +300,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) + return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -334,30 +333,20 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) - data = pa.table([array], [None]) + if isinstance(array, pa.ChunkedArray): + array = array.combine_chunks() if isinstance(array.type, pa.DictionaryType): - indices_table = pa.table( - { - "None": pa.chunked_array( - [chunk.indices for chunk in data["None"].chunks], - type=array.type.index_type, - ) - } - ) - dictionaries_table = pa.table( - { - "None": pa.chunked_array( - [chunk.dictionary for chunk in data["None"].chunks], - type=array.type.value_type, - ) - } - ) - - codes = libcudf.interop.from_arrow(indices_table)[0] - categories = libcudf.interop.from_arrow(dictionaries_table)[0] + with acquire_spill_lock(): + codes = cls.from_pylibcudf( + plc.interop.from_arrow(array.indices) + ) + categories = cls.from_pylibcudf( + plc.interop.from_arrow(array.dictionary) + ) codes = cudf.core.column.categorical.as_unsigned_codes( - len(categories), codes + len(categories), + codes, # type: ignore[arg-type] ) return cudf.core.column.CategoricalColumn( data=None, @@ -368,10 +357,12 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: mask=codes.base_mask, children=(codes,), ) - - result = libcudf.interop.from_arrow(data)[0] - - return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + else: + result = cls.from_pylibcudf(plc.interop.from_arrow(array)) + # TODO: cudf_dtype_from_pa_type may be less necessary for some types + return result._with_type_metadata( + cudf_dtype_from_pa_type(array.type) + ) def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b526a6efa51..601d4a6c2f7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1016,7 +1016,7 @@ def to_pandas( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - def to_arrow(self): + def to_arrow(self) -> pa.Array: return pa.compute.assume_timezone( self._local_time.to_arrow(), str(self.dtype.tz) ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2c22724d3d7..90737f229c6 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -264,8 +264,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_32 = np.array(self.base_data.memoryview()).view("int32") + def to_arrow(self) -> pa.Array: + data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") # use striding to set the first 32 bits of each 128-bit chunk: @@ -332,7 +332,7 @@ def from_arrow(cls, data: pa.Array): result.dtype.precision = data.type.precision return result - def to_arrow(self): + def to_arrow(self) -> pa.Array: return super().to_arrow().cast(self.dtype.to_arrow()) def _with_type_metadata( @@ -391,8 +391,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_64 = np.array(self.base_data.memoryview()).view("int64") + def to_arrow(self) -> pa.Array: + data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") # use striding to set the first 64 bits of each 128-bit chunk: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..704290639c0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -151,7 +151,7 @@ def offsets(self) -> NumericalColumn: """ return cast(NumericalColumn, self.children[0]) - def to_arrow(self): + def to_arrow(self) -> pa.Array: offsets = self.offsets.to_arrow() elements = ( pa.nulls(len(self.elements)) @@ -161,7 +161,7 @@ def to_arrow(self): pa_type = pa.list_(elements.type) if self.nullable: - nbuf = pa.py_buffer(self.mask.memoryview()) + nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr] buffers = (nbuf, offsets.buffers()[1]) else: buffers = offsets.buffers() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..91b49af7ee9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -902,16 +902,17 @@ def from_arrow(cls, data: pa.Table) -> Self: if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - indices_columns = libcudf.interop.from_arrow(dict_indices_table) + plc_indices = plc.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in zip( - dict_indices_table.column_names, indices_columns + for name, plc_codes in zip( + dict_indices_table.column_names, plc_indices.columns() ): + codes = libcudf.column.Column.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] codes = as_unsigned_codes(len(categories), codes) cudf_category_frame[name] = CategoricalColumn( @@ -927,9 +928,9 @@ def from_arrow(cls, data: pa.Table) -> Self: # Handle non-dict arrays cudf_non_category_frame = { - name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) + name: libcudf.column.Column.from_pylibcudf(plc_col) + for name, plc_col in zip( + data.column_names, plc.interop.from_arrow(data).columns() ) } @@ -988,7 +989,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) @_performance_tracking - def to_arrow(self): + def to_arrow(self) -> pa.Table: """ Convert to arrow Table diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index fe8e446f9c0..3b3fd5f7c56 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,13 +1,14 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +from __future__ import annotations +import pylibcudf as plc import cudf -from cudf._lib import interop as libdlpack from cudf.core.column import ColumnBase from cudf.utils import ioutils -def from_dlpack(pycapsule_obj): +def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: """Converts from a DLPack tensor to a cuDF object. DLPack is an open-source memory tensor structure: @@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj): cuDF from_dlpack() assumes column-major (Fortran order) input. If the input tensor is row-major, transpose it before passing it to this function. """ + plc_table = plc.interop.from_dlpack(pycapsule_obj) + data = dict( + enumerate( + (ColumnBase.from_pylibcudf(col) for col in plc_table.columns()) + ) + ) - columns = libdlpack.from_dlpack(pycapsule_obj) - data = dict(enumerate(columns)) - - if len(columns) == 1: + if len(data) == 1: return cudf.Series._from_data(data) else: return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() -def to_dlpack(cudf_obj): +def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): """Converts a cuDF object to a DLPack tensor. DLPack is an open-source memory tensor structure: @@ -80,13 +84,14 @@ def to_dlpack(cudf_obj): if any( not cudf.api.types._is_non_decimal_numeric_dtype(dtype) - for _, dtype in gdf._dtypes + for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [dtype for _, dtype in gdf._dtypes] + [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) - - return libdlpack.to_dlpack([*gdf._columns]) + return plc.interop.to_dlpack( + plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns]) + ) From f7062442304908809e63fdbec14362206efb10f1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 19:43:56 -0800 Subject: [PATCH 2/2] Go back to using pyarrow table --- python/cudf/cudf/core/column/column.py | 29 +++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ea69c0f798c..daffcb9fd7e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -331,16 +331,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) - if isinstance(array, pa.ChunkedArray): - array = array.combine_chunks() + data = pa.table([array], [None]) if isinstance(array.type, pa.DictionaryType): + indices_table = pa.table( + [ + pa.chunked_array( + [chunk.indices for chunk in data.column(0).chunks], + type=array.type.index_type, + ) + ], + [None], + ) + dictionaries_table = pa.table( + [ + pa.chunked_array( + [chunk.dictionary for chunk in data.column(0).chunks], + type=array.type.value_type, + ) + ], + [None], + ) with acquire_spill_lock(): codes = cls.from_pylibcudf( - plc.interop.from_arrow(array.indices) + plc.interop.from_arrow(indices_table).columns()[0] ) categories = cls.from_pylibcudf( - plc.interop.from_arrow(array.dictionary) + plc.interop.from_arrow(dictionaries_table).columns()[0] ) codes = cudf.core.column.categorical.as_unsigned_codes( len(categories), @@ -356,7 +373,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: children=(codes,), ) else: - result = cls.from_pylibcudf(plc.interop.from_arrow(array)) + result = cls.from_pylibcudf( + plc.interop.from_arrow(data).columns()[0] + ) # TODO: cudf_dtype_from_pa_type may be less necessary for some types return result._with_type_metadata( cudf_dtype_from_pa_type(array.type)