Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure cudf internals use pylibcudf in pure Python mode #13909

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/column.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ from libcpp.memory cimport unique_ptr

from rmm._lib.device_buffer cimport device_buffer

from cudf._lib cimport pylibcudf
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
from cudf._lib.cpp.types cimport size_type
Expand All @@ -30,7 +29,7 @@ cdef class Column:
cdef column_view _view(self, size_type null_count) except *
cdef column_view view(self) except *
cdef mutable_column_view mutable_view(self) except *
cpdef pylibcudf.Column to_pylibcudf(self, mode: Literal["read", "write"])
cpdef to_pylibcudf(self, mode: Literal["read", "write"])

@staticmethod
cdef Column from_unique_ptr(
Expand Down
31 changes: 15 additions & 16 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ import rmm
import cudf
import cudf._lib as libcudf
from cudf._lib import pylibcudf

from cudf._lib cimport pylibcudf

from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype
from cudf.core.buffer import (
Buffer,
Expand All @@ -40,6 +37,7 @@ from cudf._lib.types cimport (
)

from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf._lib.types import dtype_from_pylibcudf_column

cimport cudf._lib.cpp.types as libcudf_types
cimport cudf._lib.cpp.unary as libcudf_unary
Expand Down Expand Up @@ -446,7 +444,7 @@ cdef class Column:
# underlying buffers as exposed before this function can itself be exposed
# publicly. User requests to convert to pylibcudf must assume that the
# data may be modified afterwards.
cpdef pylibcudf.Column to_pylibcudf(self, mode: Literal["read", "write"]):
cpdef to_pylibcudf(self, mode: Literal["read", "write"]):
"""Convert this Column to a pylibcudf.Column.

This function will generate a pylibcudf Column pointing to the same
Expand Down Expand Up @@ -476,9 +474,9 @@ cdef class Column:
else:
col = self

cdef pylibcudf.DataType dtype = dtype_to_pylibcudf_type(col.dtype)
dtype = dtype_to_pylibcudf_type(col.dtype)

cdef pylibcudf.gpumemoryview data = None
data = None
if col.base_data is not None:
cai = cuda_array_interface_wrapper(
ptr=col.base_data.get_ptr(mode=mode),
Expand All @@ -487,7 +485,7 @@ cdef class Column:
)
data = pylibcudf.gpumemoryview(cai)

cdef pylibcudf.gpumemoryview mask = None
mask = None
if self.nullable:
# TODO: Are we intentionally using self's mask instead of col's?
# Where is the mask stored for categoricals?
Expand Down Expand Up @@ -586,7 +584,7 @@ cdef class Column:
# TODO: Actually support exposed data pointers.
@staticmethod
def from_pylibcudf(
pylibcudf.Column col, bint data_ptr_exposed=False
col, bint data_ptr_exposed=False
):
"""Create a Column from a pylibcudf.Column.

Expand All @@ -607,19 +605,20 @@ cdef class Column:
Column
A new cudf Column (built via cudf.core.column.build_column) referencing the same data.
"""
# TODO: Rewrite utility for dtype conversion to not need a column view.
dtype = dtype_from_column_view(col.view())
dtype = dtype_from_pylibcudf_column(col)

return cudf.core.column.build_column(
data=as_buffer(col.data.obj) if col.data is not None else None,
data=as_buffer(col.data().obj) if col.data() is not None else None,
dtype=dtype,
size=col.size,
mask=as_buffer(col.mask.obj) if col.mask is not None else None,
offset=col.offset,
null_count=col.null_count,
size=col.size(),
mask=as_buffer(
col.null_mask().obj
) if col.null_mask() is not None else None,
offset=col.offset(),
null_count=col.null_count(),
children=tuple([
Column.from_pylibcudf(child)
for child in col.children
for child in col.children()
])
)

Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ from libcpp.vector cimport vector
from rmm._lib.device_buffer cimport DeviceBuffer

import cudf
from cudf._lib import pylibcudf
from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer

from cudf._lib cimport pylibcudf
from cudf._lib.column cimport Column

from cudf._lib.scalar import as_device_scalar
Expand Down Expand Up @@ -174,7 +174,7 @@ def gather(
Column gather_map,
bool nullify=False
):
cdef pylibcudf.Table tbl = pylibcudf.copying.gather(
tbl = pylibcudf.copying.gather(
pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]),
gather_map.to_pylibcudf(mode="read"),
pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify
Expand Down
34 changes: 27 additions & 7 deletions python/cudf/cudf/_lib/pylibcudf/column.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,36 @@ cdef class Column:
# TODO: Should we document these attributes? Should we mark them readonly?
cdef:
# Core data
DataType data_type
size_type size
gpumemoryview data
gpumemoryview mask
size_type null_count
size_type offset
DataType _data_type
size_type _size
gpumemoryview _data
gpumemoryview _mask
size_type _null_count
size_type _offset
# children: List[Column]
list children
list _children
size_type _num_children

cdef column_view view(self) nogil

@staticmethod
cdef Column from_libcudf(unique_ptr[column] libcudf_col)

cpdef DataType type(self)
cpdef Column child(self, size_type index)
cpdef size_type num_children(self)
cpdef size_type size(self)
cpdef size_type null_count(self)
cpdef size_type offset(self)
cpdef gpumemoryview data(self)
cpdef gpumemoryview null_mask(self)
cpdef list children(self)

cpdef list_view(self)


cdef class ListColumnView:
    """Accessor for methods of a Column that are specific to lists."""
    # The Column being wrapped; the accessor methods below delegate to it.
    cdef Column _column
    # Accessors declared without a return type, so they return Python objects.
    cpdef child(self)
    cpdef offsets(self)
90 changes: 74 additions & 16 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column cimport column, column_contents
from cudf._lib.cpp.types cimport size_type

from .gpumemoryview cimport gpumemoryview
from .types cimport DataType
from .types cimport DataType, TypeId
from .utils cimport int_to_bitmask_ptr, int_to_void_ptr


Expand Down Expand Up @@ -45,13 +45,14 @@ cdef class Column:
gpumemoryview mask, size_type null_count, size_type offset,
list children
):
self.data_type = data_type
self.size = size
self.data = data
self.mask = mask
self.null_count = null_count
self.offset = offset
self.children = children
self._data_type = data_type
self._size = size
self._data = data
self._mask = mask
self._null_count = null_count
self._offset = offset
self._children = children
self._num_children = len(children)

cdef column_view view(self) nogil:
"""Generate a libcudf column_view to pass to libcudf algorithms.
Expand All @@ -63,17 +64,17 @@ cdef class Column:
cdef const void * data = NULL
cdef const bitmask_type * null_mask = NULL

if self.data is not None:
data = int_to_void_ptr(self.data.ptr)
if self.mask is not None:
null_mask = int_to_bitmask_ptr(self.mask.ptr)
if self._data is not None:
data = int_to_void_ptr(self._data.ptr)
if self._mask is not None:
null_mask = int_to_bitmask_ptr(self._mask.ptr)

# TODO: Check if children can ever change. If not, this could be
# computed once in the constructor and always be reused.
cdef vector[column_view] c_children
with gil:
if self.children is not None:
for child in self.children:
if self._children is not None:
for child in self._children:
# Need to cast to Column here so that Cython knows that
# `view` returns a typed object, not a Python object. We
# cannot use a typed variable for `child` because cdef
Expand All @@ -86,8 +87,8 @@ cdef class Column:
c_children.push_back((<Column> child).view())

return column_view(
self.data_type.c_obj, self.size, data, null_mask,
self.null_count, self.offset, c_children
self._data_type.c_obj, self._size, data, null_mask,
self._null_count, self._offset, c_children
)

@staticmethod
Expand Down Expand Up @@ -133,3 +134,60 @@ cdef class Column:
0,
children,
)

cpdef DataType type(self):
    """The type of data in the column.

    Returns
    -------
    DataType
        The data type stored on this column.
    """
    return self._data_type

cpdef Column child(self, size_type index):
    """Get a child column of this column.

    Parameters
    ----------
    index : size_type
        The index of the child column to get.

    Returns
    -------
    Column
        The child column.
    """
    # Plain subscript of the internal children list; no explicit bounds
    # check is performed here beyond what list indexing itself does.
    return self._children[index]

cpdef size_type num_children(self):
    """The number of children of this column.

    Returns
    -------
    size_type
        The cached child count stored on this column.
    """
    return self._num_children

cpdef list_view(self):
    """Wrap this column in a ListColumnView accessor.

    Returns
    -------
    ListColumnView
        An accessor constructed from this column.
    """
    return ListColumnView(self)

cpdef gpumemoryview data(self):
    """The gpumemoryview stored as this column's data.

    Returns
    -------
    gpumemoryview
        The stored data view (returned as-is, without copying).
    """
    return self._data

cpdef gpumemoryview null_mask(self):
    """The gpumemoryview stored as this column's null mask.

    Returns
    -------
    gpumemoryview
        The stored mask view (returned as-is, without copying).
    """
    return self._mask

cpdef size_type size(self):
    """The size stored on this column.

    Returns
    -------
    size_type
        The stored size value.
    """
    return self._size

cpdef size_type offset(self):
    """The offset stored on this column.

    Returns
    -------
    size_type
        The stored offset value.
    """
    return self._offset

cpdef size_type null_count(self):
    """The null count stored on this column.

    Returns
    -------
    size_type
        The stored null count (not recomputed here).
    """
    return self._null_count

cpdef list children(self):
    """The list of child columns of this column.

    Returns
    -------
    list
        The internal children list itself (not a copy), so mutating the
        returned list mutates this column's children.
    """
    return self._children

cdef class ListColumnView:
    """Accessor for methods of a Column that are specific to lists.

    Parameters
    ----------
    col : Column
        The column to wrap. Its type id must be ``TypeId.LIST``.

    Raises
    ------
    TypeError
        If ``col`` is not a list-typed column.
    """
    def __init__(self, Column col):
        if col.type().id() != TypeId.LIST:
            raise TypeError("Column is not a list type")
        self._column = col

    cpdef child(self):
        """The data (values) column of the underlying list column."""
        # libcudf's lists_column_view stores the values at child index 1.
        return self._column.child(1)

    cpdef offsets(self):
        """The offsets column of the underlying list column."""
        # libcudf's lists_column_view stores the offsets at child index 0;
        # returning child(1) here would hand back the values column instead.
        return self._column.child(0)
7 changes: 5 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@


cdef class gpumemoryview:
cdef Py_ssize_t ptr
cdef object obj
# TODO: Eventually probably want to make this opaque, but for now it's fine
# to treat this object as something like a POD struct
cdef readonly:
Py_ssize_t ptr
object obj
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/table.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ from cudf._lib.cpp.table.table_view cimport table_view

cdef class Table:
# List[pylibcudf.Column]
cdef object columns
cdef list _columns

cdef table_view view(self) nogil

@staticmethod
cdef Table from_libcudf(unique_ptr[table] libcudf_tbl)

cpdef list columns(self)
9 changes: 6 additions & 3 deletions python/cudf/cudf/_lib/pylibcudf/table.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ cdef class Table:
The columns in this table.
"""
def __init__(self, list columns):
self.columns = columns
self._columns = columns

cdef table_view view(self) nogil:
"""Generate a libcudf table_view to pass to libcudf algorithms.
Expand All @@ -31,11 +31,11 @@ cdef class Table:
(even direct pylibcudf Cython users).
"""
# TODO: Make c_columns a class attribute that is updated along with
# self.columns whenever new columns are added or columns are removed.
# self._columns whenever new columns are added or columns are removed.
cdef vector[column_view] c_columns

with gil:
for col in self.columns:
for col in self._columns:
c_columns.push_back((<Column> col).view())

return table_view(c_columns)
Expand All @@ -57,3 +57,6 @@ cdef class Table:
Column.from_libcudf(move(c_columns[i]))
for i in range(c_columns.size())
])

cpdef list columns(self):
    """The columns in this table.

    Returns
    -------
    list
        The internal list of columns itself (not a copy).
    """
    return self._columns
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ from libc.stdint cimport int32_t
from libcpp cimport bool

cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib cimport pylibcudf
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view

Expand All @@ -18,5 +17,5 @@ ctypedef bool underlying_type_t_null_policy
cdef dtype_from_column_view(column_view cv)

cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
cpdef pylibcudf.DataType dtype_to_pylibcudf_type(dtype)
cpdef dtype_to_pylibcudf_type(dtype)
cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *
Loading