diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index fbe2c8751dd..8eb0500617f 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -19,7 +19,7 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns, table_view_from_table +from cudf._lib.utils cimport table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable @@ -27,22 +27,15 @@ from cudf.core.abc import Serializable from libcpp.memory cimport make_unique cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split -cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.scalar.scalar cimport scalar -from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_table_view, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -92,20 +85,13 @@ def _copy_range_in_place(Column input_column, size_type input_begin, size_type input_end, size_type target_begin): - - cdef column_view input_column_view = input_column.view() - cdef mutable_column_view target_column_view = target_column.mutable_view() - cdef size_type c_input_begin = input_begin - cdef size_type c_input_end = input_end - cdef size_type c_target_begin = target_begin - - with nogil: - 
cpp_copying.copy_range_in_place( - input_column_view, - target_column_view, - c_input_begin, - c_input_end, - c_target_begin) + pylibcudf.copying.copy_range_in_place( + input_column.to_pylibcudf(mode="write"), + target_column.to_pylibcudf(mode="write"), + input_begin, + input_end, + target_begin + ) def _copy_range(Column input_column, @@ -244,198 +230,46 @@ def columns_empty_like(list input_columns): @acquire_spill_lock() def column_slice(Column input_column, object indices): - - cdef column_view input_column_view = input_column.view() - cdef vector[size_type] c_indices - c_indices.reserve(len(indices)) - - cdef vector[column_view] c_result - - cdef int index - - for index in indices: - c_indices.push_back(index) - - with nogil: - c_result = move( - cpp_copying.slice( - input_column_view, - c_indices) + return [ + Column.from_pylibcudf(c) + for c in pylibcudf.copying.column_slice( + input_column.to_pylibcudf(mode="read"), + list(indices), ) - - num_of_result_cols = c_result.size() - result = [ - Column.from_column_view( - c_result[i], - input_column) for i in range(num_of_result_cols)] - - return result + ] @acquire_spill_lock() -def columns_slice(list input_columns, list indices): - """ - Given a list of input columns, return columns sliced by ``indices``. - - Returns a list of list of columns. The length of return is - `len(indices) / 2`. The `i`th item in return is a list of columns sliced - from ``input_columns`` with `slice(indices[i*2], indices[i*2 + 1])`. 
- """ - cdef table_view input_table_view = table_view_from_columns(input_columns) - cdef vector[size_type] c_indices = indices - cdef vector[table_view] c_result - - with nogil: - c_result = move( - cpp_copying.slice( - input_table_view, - c_indices) - ) - +def columns_slice(list input_columns, object indices): return [ - columns_from_table_view( - c_result[i], input_columns - ) for i in range(c_result.size()) + columns_from_pylibcudf_table(tbl) + for tbl in pylibcudf.copying.table_slice( + pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), + list(indices), + ) ] @acquire_spill_lock() def column_split(Column input_column, object splits): - - cdef column_view input_column_view = input_column.view() - cdef vector[size_type] c_splits - c_splits.reserve(len(splits)) - - cdef vector[column_view] c_result - - cdef int split - - for split in splits: - c_splits.push_back(split) - - with nogil: - c_result = move( - cpp_copying.split( - input_column_view, - c_splits) + return [ + Column.from_pylibcudf(c) + for c in pylibcudf.copying.column_split( + input_column.to_pylibcudf(mode="read"), + list(splits), ) - - num_of_result_cols = c_result.size() - result = [ - Column.from_column_view( - c_result[i], - input_column - ) for i in range(num_of_result_cols) ] - return result - @acquire_spill_lock() def columns_split(list input_columns, object splits): - - cdef table_view input_table_view = table_view_from_columns(input_columns) - cdef vector[size_type] c_splits = splits - cdef vector[table_view] c_result - - with nogil: - c_result = move( - cpp_copying.split( - input_table_view, - c_splits) - ) - return [ - columns_from_table_view( - c_result[i], input_columns - ) for i in range(c_result.size()) - ] - - -def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): - - cdef column_view lhs_view = lhs.view() - cdef column_view rhs_view = rhs.view() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] 
c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_view, - rhs_view, - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_scalar_column(DeviceScalar lhs, - Column rhs, - Column boolean_mask): - - cdef const scalar* lhs_scalar = lhs.get_raw_ptr() - cdef column_view rhs_view = rhs.view() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_scalar[0], - rhs_view, - boolean_mask_view - ) + columns_from_pylibcudf_table(tbl) + for tbl in pylibcudf.copying.table_split( + pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), + list(splits), ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_column_scalar(Column lhs, - DeviceScalar rhs, - Column boolean_mask): - - cdef column_view lhs_view = lhs.view() - cdef const scalar* rhs_scalar = rhs.get_raw_ptr() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_view, - rhs_scalar[0], - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -def _copy_if_else_scalar_scalar(DeviceScalar lhs, - DeviceScalar rhs, - Column boolean_mask): - - cdef const scalar* lhs_scalar = lhs.get_raw_ptr() - cdef const scalar* rhs_scalar = rhs.get_raw_ptr() - cdef column_view boolean_mask_view = boolean_mask.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_copying.copy_if_else( - lhs_scalar[0], - rhs_scalar[0], - boolean_mask_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) + ] @acquire_spill_lock() @@ -504,16 +338,12 @@ def shift(Column input, int offset, object fill_value=None): @acquire_spill_lock() def get_element(Column input_column, size_type index): - cdef column_view col_view = input_column.view() - - cdef unique_ptr[scalar] 
c_output - with nogil: - c_output = move( - cpp_copying.get_element(col_view, index) - ) - - return DeviceScalar.from_unique_ptr( - move(c_output), dtype=input_column.dtype + return DeviceScalar.from_pylibcudf( + pylibcudf.copying.get_element( + input_column.to_pylibcudf(mode="read"), + index, + ), + dtype=input_column.dtype, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index 2af87db5b03..27b77438c79 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview @@ -26,10 +26,14 @@ cdef class Column: size_type _num_children cdef column_view view(self) nogil + cdef mutable_column_view mutable_view(self) nogil @staticmethod cdef Column from_libcudf(unique_ptr[column] libcudf_col) + @staticmethod + cdef Column from_column_view(const column_view& libcudf_col, Column owner) + cpdef DataType type(self) cpdef Column child(self, size_type index) cpdef size_type num_children(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 40afc8aaa8a..389a1c82be5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -91,6 +91,33 @@ cdef class Column: self._null_count, self._offset, c_children ) + cdef mutable_column_view mutable_view(self) nogil: + """Generate a libcudf mutable_column_view to pass to libcudf algorithms. + + This method is for pylibcudf's functions to use to generate inputs when + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). 
+ """ + cdef void * data = NULL + cdef bitmask_type * null_mask = NULL + + if self._data is not None: + data = int_to_void_ptr(self._data.ptr) + if self._mask is not None: + null_mask = int_to_bitmask_ptr(self._mask.ptr) + + cdef vector[mutable_column_view] c_children + with gil: + if self._children is not None: + for child in self._children: + # See the view method for why this needs to be cast. + c_children.push_back((<Column> child).mutable_view()) + + return mutable_column_view( + self._data_type.c_obj, self._size, data, null_mask, + self._null_count, self._offset, c_children + ) + @staticmethod cdef Column from_libcudf(unique_ptr[column] libcudf_col): """Create a Column from a libcudf column. @@ -135,6 +162,38 @@ cdef class Column: children, ) + @staticmethod + cdef Column from_column_view(const column_view& cv, Column owner): + """Create a Column from a libcudf column_view. + + This method accepts shared ownership of the underlying data from the + owner and relies on the offset from the view. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). 
+ """ + cdef DataType dtype = DataType.from_libcudf(cv.type()) + cdef size_type size = cv.size() + cdef size_type null_count = cv.null_count() + + children = [] + if cv.num_children() != 0: + for i in range(cv.num_children()): + children.append( + Column.from_column_view(cv.child(i), owner.child(i)) + ) + + return Column( + dtype, + size, + owner._data, + owner._mask, + null_count, + cv.offset(), + children, + ) + cpdef DataType type(self): """The type of data in the column.""" return self._data_type diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index db0e42f5804..3567df9ac9c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -26,6 +26,14 @@ cpdef object empty_table_like(Table input) cpdef Column allocate_like(Column input_column, mask_allocation_policy policy, size=*) +cpdef Column copy_range_in_place( + Column input_column, + Column target_column, + size_type input_begin, + size_type input_end, + size_type target_begin, +) + cpdef Column copy_range( Column input_column, Column target_column, @@ -36,8 +44,18 @@ cpdef Column copy_range( cpdef Column shift(Column input, size_type offset, Scalar fill_values) +cpdef list column_split(Column input_column, list splits) + +cpdef list table_split(Table input_table, list splits) + +cpdef list column_slice(Column input_column, list indices) + +cpdef list table_slice(Table input_table, list indices) + cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask) cpdef Table boolean_mask_table_scatter(Table input, Table target, Column boolean_mask) cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolean_mask) + +cpdef Scalar get_element(Column input_column, size_type index) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 634aed3e6e5..c08b57c05d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ 
b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -13,9 +13,11 @@ from libcpp.vector cimport vector # cimport libcudf... libcudf.copying.algo(...) from cudf._lib.cpp cimport copying as cpp_copying from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view from cudf._lib.cpp.copying cimport mask_allocation_policy, out_of_bounds_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.copying import \ @@ -154,6 +156,27 @@ cpdef Column allocate_like( return Column.from_libcudf(move(c_result)) +cpdef Column copy_range_in_place( + Column input_column, + Column target_column, + size_type input_begin, + size_type input_end, + size_type target_begin, +): + # Need to initialize this outside the function call so that Cython doesn't + # try and pass a temporary that decays to an rvalue reference in where the + # function requires an lvalue reference. 
+ cdef mutable_column_view target_view = target_column.mutable_view() + with nogil: + cpp_copying.copy_range_in_place( + input_column.view(), + target_view, + input_begin, + input_end, + target_begin + ) + + cpdef Column copy_range( Column input_column, Column target_column, @@ -188,6 +211,82 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_values): return Column.from_libcudf(move(c_result)) +cpdef list column_split(Column input_column, list splits): + cdef vector[size_type] c_splits + cdef int split + for split in splits: + c_splits.push_back(split) + + cdef vector[column_view] c_result + with nogil: + c_result = move( + cpp_copying.split( + input_column.view(), + c_splits + ) + ) + + cdef int i + return [ + Column.from_column_view(c_result[i], input_column) + for i in range(c_result.size()) + ] + + +cpdef list table_split(Table input_table, list splits): + cdef vector[size_type] c_splits = splits + cdef vector[table_view] c_result + with nogil: + c_result = move( + cpp_copying.split( + input_table.view(), + c_splits + ) + ) + + cdef int i + return [ + Table.from_table_view(c_result[i], input_table) + for i in range(c_result.size()) + ] + + +cpdef list column_slice(Column input_column, list indices): + cdef vector[size_type] c_indices = indices + cdef vector[column_view] c_result + with nogil: + c_result = move( + cpp_copying.slice( + input_column.view(), + c_indices + ) + ) + + cdef int i + return [ + Column.from_column_view(c_result[i], input_column) + for i in range(c_result.size()) + ] + + +cpdef list table_slice(Table input_table, list indices): + cdef vector[size_type] c_indices = indices + cdef vector[table_view] c_result + with nogil: + c_result = move( + cpp_copying.slice( + input_table.view(), + c_indices + ) + ) + + cdef int i + return [ + Table.from_table_view(c_result[i], input_table) + for i in range(c_result.size()) + ] + + cpdef Column copy_if_else(object lhs, object rhs, Column boolean_mask): cdef unique_ptr[column] result @@ 
-263,3 +362,12 @@ cpdef Table boolean_mask_scalars_scatter(list input, Table target, Column boolea ) return Table.from_libcudf(move(result)) + +cpdef Scalar get_element(Column input_column, size_type index): + cdef unique_ptr[scalar] c_output + with nogil: + c_output = move( + cpp_copying.get_element(input_column.view(), index) + ) + + return Scalar.from_libcudf(move(c_output)) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index a9e2874232a..6fe06f00491 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -16,6 +16,9 @@ cdef class Table: @staticmethod cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) + @staticmethod + cdef Table from_table_view(const table_view& tv, Table owner) + cpdef list columns(self) cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index c41eb82e4a1..6a6fad46d69 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -65,6 +65,23 @@ cdef class Table: for i in range(c_columns.size()) ]) + @staticmethod + cdef Table from_table_view(const table_view& tv, Table owner): + """Create a Table from a libcudf table_view. + + This method accepts shared ownership of the underlying data from the + owner and relies on the offset from the view. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). 
+ """ + cdef int i + return Table([ + Column.from_column_view(tv.column(i), owner.columns()[i]) + for i in range(tv.num_columns()) + ]) + cpdef list columns(self): return self._columns diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 77733f59c3d..b5c5a8a64a3 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -22,4 +22,9 @@ cdef class DeviceScalar: @staticmethod cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) + @staticmethod + cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar scalar, dtype=*) + + cdef void _set_dtype(self, dtype=*) + cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0b64c75f7b6..27fb9e994f0 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -217,13 +217,22 @@ cdef class DeviceScalar: Construct a Scalar object from a unique_ptr. """ cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - cdef libcudf_types.data_type cdtype - s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) - cdtype = s.get_raw_ptr()[0].type() + s._set_dtype(dtype) + return s + + @staticmethod + cdef DeviceScalar from_pylibcudf(pylibcudf.Scalar pscalar, dtype=None): + cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) + s.c_value = pscalar + s._set_dtype(dtype) + return s + + cdef void _set_dtype(self, dtype=None): + cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() if dtype is not None: - s._dtype = dtype + self._dtype = dtype elif cdtype.id() in { libcudf_types.type_id.DECIMAL32, libcudf_types.type_id.DECIMAL64, @@ -233,32 +242,31 @@ cdef class DeviceScalar: "Must pass a dtype when constructing from a fixed-point scalar" ) elif cdtype.id() == libcudf_types.type_id.STRUCT: - struct_table_view = (s.get_raw_ptr())[0].view() - s._dtype = StructDtype({ + struct_table_view = (self.get_raw_ptr())[0].view() + self._dtype = StructDtype({ str(i): 
dtype_from_column_view(struct_table_view.column(i)) for i in range(struct_table_view.num_columns()) }) elif cdtype.id() == libcudf_types.type_id.LIST: if ( - s.get_raw_ptr() + self.get_raw_ptr() )[0].view().type().id() == libcudf_types.type_id.LIST: - s._dtype = dtype_from_column_view( - (s.get_raw_ptr())[0].view() + self._dtype = dtype_from_column_view( + (self.get_raw_ptr())[0].view() ) else: - s._dtype = ListDtype( + self._dtype = ListDtype( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ ( - (s.get_raw_ptr())[0] + (self.get_raw_ptr())[0] .view().type().id() ) ] ) else: - s._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (cdtype.id()) ] - return s # TODO: Currently the only uses of this function and the one below are in