From 9c16d895f509e1d4e9710651e57e4cd29defbcce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 15 Dec 2023 14:41:20 -0800 Subject: [PATCH] Implement remaining copying APIs in pylibcudf along with required helper functions (#14640) This PR implements the remaining parts of libcudf's copying APIs in pylibcudf, updating cuDF Python to use those as a backend. The cudf copying Cython module is now largely just a set of pylibcudf calls. This represents the general transition that we hope to make, and eventually, once all of cudf looks like this, we can refactor its internal data structures to be built directly around pylibcudf objects and remove these thin translation layers. This PR also implements a few core utility functions required for translating between pylibcudf and libcudf objects, namely for: - Generating pylibcudf Columns from libcudf column_views (instead of owning `column`s) - Creating libcudf mutable_column_view objects from pylibcudf.Columns - Producing cudf DeviceScalar objects from pylibcudf Scalars.
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14640 --- python/cudf/cudf/_lib/copying.pyx | 246 +++----------------- python/cudf/cudf/_lib/pylibcudf/column.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/column.pyx | 59 +++++ python/cudf/cudf/_lib/pylibcudf/copying.pxd | 18 ++ python/cudf/cudf/_lib/pylibcudf/copying.pyx | 108 +++++++++ python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 17 ++ python/cudf/cudf/_lib/scalar.pxd | 5 + python/cudf/cudf/_lib/scalar.pyx | 34 +-- 9 files changed, 274 insertions(+), 222 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index fbe2c8751dd..8eb0500617f 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -19,7 +19,7 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns, table_view_from_table +from cudf._lib.utils cimport table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable @@ -27,22 +27,15 @@ from cudf.core.abc import Serializable from libcpp.memory cimport make_unique cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split -cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.scalar.scalar cimport scalar -from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - 
columns_from_table_view, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -92,20 +85,13 @@ def _copy_range_in_place(Column input_column, size_type input_begin, size_type input_end, size_type target_begin): - - cdef column_view input_column_view = input_column.view() - cdef mutable_column_view target_column_view = target_column.mutable_view() - cdef size_type c_input_begin = input_begin - cdef size_type c_input_end = input_end - cdef size_type c_target_begin = target_begin - - with nogil: - cpp_copying.copy_range_in_place( - input_column_view, - target_column_view, - c_input_begin, - c_input_end, - c_target_begin) + pylibcudf.copying.copy_range( + input_column.to_pylibcudf(mode="write"), + target_column.to_pylibcudf(mode="write"), + input_begin, + input_end, + target_begin + ) def _copy_range(Column input_column, @@ -244,198 +230,46 @@ def columns_empty_like(list input_columns): @acquire_spill_lock() def column_slice(Column input_column, object indices): - - cdef column_view input_column_view = input_column.view() - cdef vector[size_type] c_indices - c_indices.reserve(len(indices)) - - cdef vector[column_view] c_result - - cdef int index - - for index in indices: - c_indices.push_back(index) - - with nogil: - c_result = move( - cpp_copying.slice( - input_column_view, - c_indices) + return [ + Column.from_pylibcudf(c) + for c in pylibcudf.copying.column_slice( + input_column.to_pylibcudf(mode="read"), + list(indices), ) - - num_of_result_cols = c_result.size() - result = [ - Column.from_column_view( - c_result[i], - input_column) for i in range(num_of_result_cols)] - - return result + ] @acquire_spill_lock() -def columns_slice(list input_columns, list indices): - """ - Given a list of input columns, return columns sliced by ``indices``. - - Returns a list of list of columns. 
The length of return is - `len(indices) / 2`. The `i`th item in return is a list of columns sliced - from ``input_columns`` with `slice(indices[i*2], indices[i*2 + 1])`. - """ - cdef table_view input_table_view = table_view_from_columns(input_columns) - cdef vector[size_type] c_indices = indices - cdef vector[table_view] c_result - - with nogil: - c_result = move( - cpp_copying.slice( - input_table_view, - c_indices) - ) - +def columns_slice(list input_columns, object indices): return [ - columns_from_table_view( - c_result[i], input_columns - ) for i in range(c_result.size()) + columns_from_pylibcudf_table(tbl) + for tbl in pylibcudf.copying.table_slice( + pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), + list(indices), + ) ] @acquire_spill_lock() def column_split(Column input_column, object splits): - - cdef column_view input_column_view = input_column.view() - cdef vector[size_type] c_splits - c_splits.reserve(len(splits)) - - cdef vector[column_view] c_result - - cdef int split - - for split in splits: - c_splits.push_back(split) - - with nogil: - c_result = move( - cpp_copying.split( - input_column_view, - c_splits) + return [ + Column.from_pylibcudf(c) + for c in pylibcudf.copying.column_split( + input_column.to_pylibcudf(mode="read"), + list(splits), ) - - num_of_result_cols = c_result.size() - result = [ - Column.from_column_view( - c_result[i], - input_column - ) for i in range(num_of_result_cols) ] - return result - @acquire_spill_lock() def columns_split(list input_columns, object splits): - - cdef table_view input_table_view = table_view_from_columns(input_columns) - cdef vector[size_type] c_splits = splits - cdef vector[table_view] c_result - - with nogil: - c_result = move( - cpp_copying.split( - input_table_view, - c_splits) - ) - return [ - columns_from_table_view( - c_result[i], input_columns - ) for i in range(c_result.size()) - ] - - -def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): - - cdef 
column_view lhs_view = lhs.view() - cdef column_view rhs_view = rhs.view() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_view, - rhs_view, - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_scalar_column(DeviceScalar lhs, - Column rhs, - Column boolean_mask): - - cdef const scalar* lhs_scalar = lhs.get_raw_ptr() - cdef column_view rhs_view = rhs.view() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_scalar[0], - rhs_view, - boolean_mask_view - ) + columns_from_pylibcudf_table(tbl) + for tbl in pylibcudf.copying.table_split( + pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), + list(splits), ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_column_scalar(Column lhs, - DeviceScalar rhs, - Column boolean_mask): - - cdef column_view lhs_view = lhs.view() - cdef const scalar* rhs_scalar = rhs.get_raw_ptr() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_view, - rhs_scalar[0], - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_scalar_scalar(DeviceScalar lhs, - DeviceScalar rhs, - Column boolean_mask): - - cdef const scalar* lhs_scalar = lhs.get_raw_ptr() - cdef const scalar* rhs_scalar = rhs.get_raw_ptr() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_scalar[0], - rhs_scalar[0], - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) + ] @acquire_spill_lock() @@ -504,16 +338,12 @@ def shift(Column input, int offset, object fill_value=None): 
@acquire_spill_lock() def get_element(Column input_column, size_type index): - cdef column_view col_view = input_column.view() - - cdef unique_ptr[scalar] c_output - with nogil: - c_output = move( - cpp_copying.get_element(col_view, index) - ) - - return DeviceScalar.from_unique_ptr( - move(c_output), dtype=input_column.dtype + return DeviceScalar.from_pylibcudf( + pylibcudf.copying.get_element( + input_column.to_pylibcudf(mode="read"), + index, + ), + dtype=input_column.dtype, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 2af87db5b03..27b77438c79 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview @@ -26,10 +26,14 @@ cdef class Column: size_type _num_children cdef column_view view(self) nogil + cdef mutable_column_view mutable_view(self) nogil @staticmethod cdef Column from_libcudf(unique_ptr[column] libcudf_col) + @staticmethod + cdef Column from_column_view(const column_view& libcudf_col, Column owner) + cpdef DataType type(self) cpdef Column child(self, size_type index) cpdef size_type num_children(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 40afc8aaa8a..389a1c82be5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -91,6 +91,33 @@ cdef class Column: self._null_count, self._offset, c_children ) + cdef mutable_column_view mutable_view(self) nogil: + """Generate a libcudf mutable_column_view to pass to libcudf algorithms. 
+ + This method is for pylibcudf's functions to use to generate inputs when + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef void * data = NULL + cdef bitmask_type * null_mask = NULL + + if self._data is not None: + data = int_to_void_ptr(self._data.ptr) + if self._mask is not None: + null_mask = int_to_bitmask_ptr(self._mask.ptr) + + cdef vector[mutable_column_view] c_children + with gil: + if self._children is not None: + for child in self._children: + # See the view method for why this needs to be cast. + c_children.push_back(( child).mutable_view()) + + return mutable_column_view( + self._data_type.c_obj, self._size, data, null_mask, + self._null_count, self._offset, c_children + ) + @staticmethod cdef Column from_libcudf(unique_ptr[column] libcudf_col): """Create a Column from a libcudf column. @@ -135,6 +162,38 @@ cdef class Column: children, ) + @staticmethod + cdef Column from_column_view(const column_view& cv, Column owner): + """Create a Column from a libcudf column_view. + + This method accepts shared ownership of the underlying data from the + owner and relies on the offset from the view. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). 
+ """ + cdef DataType dtype = DataType.from_libcudf(cv.type()) + cdef size_type size = cv.size() + cdef size_type null_count = cv.null_count() + + children = [] + if cv.num_children() != 0: + for i in range(cv.num_children()): + children.append( + Column.from_column_view(cv.child(i), owner.child(i)) + ) + + return Column( + dtype, + size, + owner._data, + owner._mask, + null_count, + cv.offset(), + children, + ) + cpdef DataType type(self): """The type of data in the column.""" return self._data_type diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index db0e42f5804..3567df9ac9c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -26,6 +26,14 @@ cpdef object empty_table_like(Table input) cpdef Column allocate_like(Column input_column, mask_allocation_policy policy, size=*) +cpdef Column copy_range_in_place( + Column input_column, + Column target_column, + size_type input_begin, + size_type input_end, + size_type target_begin, +) + cpdef Column copy_range( Column input_column, Column target_column, @@ -36,8 +44,18 @@ cpdef Column copy_range( cpdef Column shift(Column input, size_type offset, Scalar fill_values) +cpdef list column_split(Column input_column, list splits) + +cpdef list table_split(Table input_table, list splits) + +cpdef list column_slice(Column input_column, list indices) + +cpdef list table_slice(Table input_table, list indices) + cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask) cpdef Table boolean_mask_table_scatter(Table input, Table target, Column boolean_mask) cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolean_mask) + +cpdef Scalar get_element(Column input_column, size_type index) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 634aed3e6e5..c08b57c05d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ 
b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -13,9 +13,11 @@ from libcpp.vector cimport vector # cimport libcudf... libcudf.copying.algo(...) from cudf._lib.cpp cimport copying as cpp_copying from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.copying cimport mask_allocation_policy, out_of_bounds_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.copying import \ @@ -154,6 +156,27 @@ cpdef Column allocate_like( return Column.from_libcudf(move(c_result)) +cpdef Column copy_range_in_place( + Column input_column, + Column target_column, + size_type input_begin, + size_type input_end, + size_type target_begin, +): + # Need to initialize this outside the function call so that Cython doesn't + # try and pass a temporary that decays to an rvalue reference in where the + # function requires an lvalue reference. 
+ cdef mutable_column_view target_view = target_column.mutable_view() + with nogil: + cpp_copying.copy_range_in_place( + input_column.view(), + target_view, + input_begin, + input_end, + target_begin + ) + + cpdef Column copy_range( Column input_column, Column target_column, @@ -188,6 +211,82 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): return Column.from_libcudf(move(c_result)) +cpdef list column_split(Column input_column, list splits): + cdef vector[size_type] c_splits + cdef int split + for split in splits: + c_splits.push_back(split) + + cdef vector[column_view] c_result + with nogil: + c_result = move( + cpp_copying.split( + input_column.view(), + c_splits + ) + ) + + cdef int i + return [ + Column.from_column_view(c_result[i], input_column) + for i in range(c_result.size()) + ] + + +cpdef list table_split(Table input_table, list splits): + cdef vector[size_type] c_splits = splits + cdef vector[table_view] c_result + with nogil: + c_result = move( + cpp_copying.split( + input_table.view(), + c_splits + ) + ) + + cdef int i + return [ + Table.from_table_view(c_result[i], input_table) + for i in range(c_result.size()) + ] + + +cpdef list column_slice(Column input_column, list indices): + cdef vector[size_type] c_indices = indices + cdef vector[column_view] c_result + with nogil: + c_result = move( + cpp_copying.slice( + input_column.view(), + c_indices + ) + ) + + cdef int i + return [ + Column.from_column_view(c_result[i], input_column) + for i in range(c_result.size()) + ] + + +cpdef list table_slice(Table input_table, list indices): + cdef vector[size_type] c_indices = indices + cdef vector[table_view] c_result + with nogil: + c_result = move( + cpp_copying.slice( + input_table.view(), + c_indices + ) + ) + + cdef int i + return [ + Table.from_table_view(c_result[i], input_table) + for i in range(c_result.size()) + ] + + cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask): cdef unique_ptr[column] result @@ 
-263,3 +362,12 @@ cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolea ) return Table.from_libcudf(move(result)) + +cpdef Scalar get_element(Column input_column, size_type index): + cdef unique_ptr[scalar] c_output + with nogil: + c_output = move( + cpp_copying.get_element(input_column.view(), index) + ) + + return Scalar.from_libcudf(move(c_output)) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index a9e2874232a..6fe06f00491 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -16,6 +16,9 @@ cdef class Table: @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) + @staticmethod + cdef Table from_table_view(const table_view& tv, Table owner) + cpdef list columns(self) cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index c41eb82e4a1..6a6fad46d69 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -65,6 +65,23 @@ cdef class Table: for i in range(c_columns.size()) ]) + @staticmethod + cdef Table from_table_view(const table_view& tv, Table owner): + """Create a Table from a libcudf table. + + This method accepts shared ownership of the underlying data from the + owner and relies on the offset from the view. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). 
+ """ + cdef int i + return Table([ + Column.from_column_view(tv.column(i), owner.columns()[i]) + for i in range(tv.num_columns()) + ]) + cpdef list columns(self): return self._columns diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 77733f59c3d..b5c5a8a64a3 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -22,4 +22,9 @@ cdef class DeviceScalar: @staticmethod cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) + @staticmethod + cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar scalar, dtype=*) + + cdef void _set_dtype(self, dtype=*) + cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0b64c75f7b6..27fb9e994f0 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -217,13 +217,22 @@ cdef class DeviceScalar: Construct a Scalar object from a unique_ptr. """ cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - cdef libcudf_types.data_type cdtype - s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) - cdtype = s.get_raw_ptr()[0].type() + s._set_dtype(dtype) + return s + + @staticmethod + cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar pscalar, dtype=None): + cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) + s.c_value = pscalar + s._set_dtype(dtype) + return s + + cdef void _set_dtype(self, dtype=None): + cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() if dtype is not None: - s._dtype = dtype + self._dtype = dtype elif cdtype.id() in { libcudf_types.type_id.DECIMAL32, libcudf_types.type_id.DECIMAL64, @@ -233,32 +242,31 @@ cdef class DeviceScalar: "Must pass a dtype when constructing from a fixed-point scalar" ) elif cdtype.id() == libcudf_types.type_id.STRUCT: - struct_table_view = (s.get_raw_ptr())[0].view() - s._dtype = StructDtype({ + struct_table_view = (self.get_raw_ptr())[0].view() + self._dtype = StructDtype({ str(i): 
dtype_from_column_view(struct_table_view.column(i)) for i in range(struct_table_view.num_columns()) }) elif cdtype.id() == libcudf_types.type_id.LIST: if ( - s.get_raw_ptr() + self.get_raw_ptr() )[0].view().type().id() == libcudf_types.type_id.LIST: - s._dtype = dtype_from_column_view( - (s.get_raw_ptr())[0].view() + self._dtype = dtype_from_column_view( + (self.get_raw_ptr())[0].view() ) else: - s._dtype = ListDtype( + self._dtype = ListDtype( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ ( - (s.get_raw_ptr())[0] + (self.get_raw_ptr())[0] .view().type().id() ) ] ) else: - s._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (cdtype.id()) ] - return s # TODO: Currently the only uses of this function and the one below are in