Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure pylibcudf/arrow interop facilities #15325

Merged
merged 14 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -65,7 +65,8 @@ rapids_cython_create_modules(

target_link_libraries(strings_udf PUBLIC cudf_strings_udf)

link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
set(targets_using_arrow_headers interop avro csv orc json parquet)
link_to_pyarrow_headers("${targets_using_arrow_headers}")

add_subdirectory(cpp)
add_subdirectory(io)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def date_range(DeviceScalar start, size_type n, offset):
+ offset.kwds.get("months", 0)
)

cdef const scalar* c_start = start.c_value.get()
cdef const scalar* c_start = start.get_raw_ptr()
with nogil:
c_result = move(calendrical_month_sequence(
n,
Expand Down
77 changes: 28 additions & 49 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cpython cimport pycapsule
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table

from cudf._lib import pylibcudf

from cudf._lib.cpp.interop cimport (
DLManagedTensor,
column_metadata,
from_arrow as cpp_from_arrow,
from_dlpack as cpp_from_dlpack,
to_arrow as cpp_to_arrow,
to_dlpack as cpp_to_dlpack,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
from cudf._lib.utils cimport (
columns_from_pylibcudf_table,
columns_from_unique_ptr,
table_view_from_columns,
)

from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import ListDtype, StructDtype
Expand Down Expand Up @@ -83,21 +84,19 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
dlpack_tensor.deleter(dlpack_tensor)


cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
def gather_metadata(object cols_dtypes):
"""
Generates a column_metadata vector for each column.
Generates a list of ColumnMetadata objects, one per column.

Parameters
----------
cols_dtypes : iterable
An iterable of ``(column_name, dtype)`` pairs.
"""
cdef vector[column_metadata] cpp_metadata
cpp_metadata.reserve(len(cols_dtypes))

cpp_metadata = []
if cols_dtypes is not None:
for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
cpp_metadata.push_back(column_metadata(col_name.encode()))
cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
if isinstance(col_dtype, (ListDtype, StructDtype)):
_set_col_children_metadata(col_dtype, cpp_metadata[idx])
else:
Expand All @@ -108,31 +107,22 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
return cpp_metadata


cdef _set_col_children_metadata(dtype,
column_metadata& col_meta):

cdef column_metadata element_metadata

def _set_col_children_metadata(dtype, col_meta):
if isinstance(dtype, StructDtype):
for name, value in dtype.fields.items():
element_metadata = column_metadata(name.encode())
_set_col_children_metadata(
value, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata(name)
_set_col_children_metadata(value, element_metadata)
col_meta.children_meta.append(element_metadata)
elif isinstance(dtype, ListDtype):
col_meta.children_meta.reserve(2)
# Offsets - child 0
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())

# Element column - child 1
element_metadata = column_metadata()
_set_col_children_metadata(
dtype.element_type, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata()
_set_col_children_metadata(dtype.element_type, element_metadata)
col_meta.children_meta.append(element_metadata)
else:
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())


@acquire_spill_lock()
Expand All @@ -149,16 +139,11 @@ def to_arrow(list source_columns, object column_dtypes):
-------
pyarrow table
"""
cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
with nogil:
cpp_arrow_table = cpp_to_arrow(
input_table_view, cpp_metadata
)

return pyarrow_wrap_table(cpp_arrow_table)
cpp_metadata = gather_metadata(column_dtypes)
return pylibcudf.interop.to_arrow(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
cpp_metadata,
)


@acquire_spill_lock()
Expand All @@ -173,12 +158,6 @@ def from_arrow(object input_table):
-------
A list of columns to construct Frame object
"""
cdef shared_ptr[CTable] cpp_arrow_table = (
pyarrow_unwrap_table(input_table)
return columns_from_pylibcudf_table(
pylibcudf.interop.from_arrow(input_table)
)
cdef unique_ptr[table] c_result

with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return columns_from_unique_ptr(move(c_result))
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -22,4 +22,3 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
link_to_pyarrow_headers(pylibcudf_interop)
2 changes: 0 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ from . cimport (
copying,
filling,
groupby,
interop,
join,
lists,
merge,
Expand Down Expand Up @@ -41,7 +40,6 @@ __all__ = [
"filling",
"gpumemoryview",
"groupby",
"interop",
"join",
"lists",
"merge",
Expand Down
22 changes: 20 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,17 @@ from libcpp.utility cimport move
from rmm._lib.device_buffer cimport DeviceBuffer

from cudf._lib.cpp.column.column cimport column, column_contents
from cudf._lib.cpp.column.column_factories cimport make_column_from_scalar
from cudf._lib.cpp.column.column_factories cimport (
make_column_from_scalar,
make_numeric_column,
)
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.types cimport size_type
from cudf._lib.cpp.unary cimport cast as libcudf_cast

from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
from .types cimport DataType, type_id
from .types cimport DataType, mask_state, type_id
from .utils cimport int_to_bitmask_ptr, int_to_void_ptr


Expand Down Expand Up @@ -134,6 +138,20 @@ cdef class Column:
"""
cdef DataType dtype = DataType.from_libcudf(libcudf_col.get().type())
cdef size_type size = libcudf_col.get().size()

# TODO: This matches how legacy cuDF Python handled EMPTY and TIMESTAMP_DAYS
# columns returned by libcudf (TIMESTAMP_DAYS is cast to TIMESTAMP_SECONDS and
# EMPTY is replaced by an all-null INT8 column), but we should decide what
# pylibcudf itself ought to do here. Should a higher-level layer handle this?
if dtype.id() == type_id.TIMESTAMP_DAYS:
dtype = DataType(type_id.TIMESTAMP_SECONDS)
libcudf_col.swap(libcudf_cast(libcudf_col.get().view(), dtype.c_obj))
elif dtype.id() == type_id.EMPTY:
dtype = DataType(type_id.INT8)
libcudf_col.swap(
make_numeric_column(dtype.c_obj, size, mask_state.ALL_NULL)
)

cdef size_type null_count = libcudf_col.get().null_count()

cdef column_contents contents = move(libcudf_col.get().release())
Expand Down
9 changes: 0 additions & 9 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd

This file was deleted.

Loading
Loading