From 7710fc7592824d6faec8603f86a2753e31c6e01c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 27 Oct 2022 10:01:50 +0200 Subject: [PATCH 01/16] replaced `DeviceBufferLike` with `Buffer`. --- python/cudf/cudf/_lib/column.pyi | 28 ++-- python/cudf/cudf/_lib/column.pyx | 40 +++--- python/cudf/cudf/_lib/concat.pyx | 4 +- python/cudf/cudf/_lib/null_mask.pyx | 10 +- python/cudf/cudf/_lib/transform.pyx | 12 +- python/cudf/cudf/_lib/utils.pyx | 4 +- python/cudf/cudf/core/abc.py | 30 +--- python/cudf/cudf/core/buffer/__init__.py | 4 + python/cudf/cudf/core/{ => buffer}/buffer.py | 136 +++--------------- python/cudf/cudf/core/buffer/utils.py | 39 +++++ python/cudf/cudf/core/column/categorical.py | 8 +- python/cudf/cudf/core/column/column.py | 50 +++---- python/cudf/cudf/core/column/datetime.py | 14 +- python/cudf/cudf/core/column/decimal.py | 6 +- python/cudf/cudf/core/column/numerical.py | 18 ++- python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/column/timedelta.py | 14 +- python/cudf/cudf/core/df_protocol.py | 10 +- python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/tests/test_buffer.py | 18 +-- python/cudf/cudf/tests/test_column.py | 2 +- .../cudf/tests/test_cuda_array_interface.py | 4 +- python/cudf/cudf/tests/test_pickling.py | 4 +- python/cudf/cudf/tests/test_testing.py | 2 +- python/cudf/cudf/utils/utils.py | 6 +- 26 files changed, 194 insertions(+), 281 deletions(-) create mode 100644 python/cudf/cudf/core/buffer/__init__.py rename python/cudf/cudf/core/{ => buffer}/buffer.py (63%) create mode 100644 python/cudf/cudf/core/buffer/utils.py diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index fd9aab038d4..c38c560b982 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -5,16 +5,16 @@ from __future__ import annotations from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase T = TypeVar("T") class Column: - _data: Optional[DeviceBufferLike] - _mask: Optional[DeviceBufferLike] - _base_data: Optional[DeviceBufferLike] - _base_mask: Optional[DeviceBufferLike] + _data: Optional[Buffer] + _mask: Optional[Buffer] + _base_data: Optional[Buffer] + _base_mask: Optional[Buffer] _dtype: DtypeObj _size: int _offset: int @@ -25,10 +25,10 @@ class Column: def __init__( self, - data: Optional[DeviceBufferLike], + data: Optional[Buffer], size: int, dtype: Dtype, - mask: Optional[DeviceBufferLike] = None, + mask: Optional[Buffer] = None, offset: int = None, null_count: int = None, children: Tuple[ColumnBase, ...] = (), @@ -40,27 +40,27 @@ class Column: @property def size(self) -> int: ... @property - def base_data(self) -> Optional[DeviceBufferLike]: ... + def base_data(self) -> Optional[Buffer]: ... @property def base_data_ptr(self) -> int: ... @property - def data(self) -> Optional[DeviceBufferLike]: ... + def data(self) -> Optional[Buffer]: ... @property def data_ptr(self) -> int: ... - def set_base_data(self, value: DeviceBufferLike) -> None: ... + def set_base_data(self, value: Buffer) -> None: ... @property def nullable(self) -> bool: ... def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[DeviceBufferLike]: ... + def base_mask(self) -> Optional[Buffer]: ... @property def base_mask_ptr(self) -> int: ... @property - def mask(self) -> Optional[DeviceBufferLike]: ... + def mask(self) -> Optional[Buffer]: ... @property def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Optional[DeviceBufferLike]) -> None: ... - def set_mask(self: T, value: Optional[DeviceBufferLike]) -> T: ... + def set_base_mask(self, value: Optional[Buffer]) -> None: ... + def set_mask(self: T, value: Optional[Buffer]) -> T: ... @property def null_count(self) -> int: ... @property diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 78125c027dd..5b7f89f5881 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -9,7 +9,7 @@ import rmm import cudf import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t @@ -56,9 +56,9 @@ cdef class Column: A Column stores columnar data in device memory. A Column may be composed of: - * A *data* DeviceBufferLike + * A *data* Buffer * One or more (optional) *children* Columns - * An (optional) *mask* DeviceBufferLike representing the nullmask + * An (optional) *mask* Buffer representing the nullmask The *dtype* indicates the Column's element type. """ @@ -123,9 +123,9 @@ cdef class Column: return self.data.ptr def set_base_data(self, value): - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, Buffer): raise TypeError( - "Expected a DeviceBufferLike or None for data, " + "Expected a Buffer or None for data, " f"got {type(value).__name__}" ) @@ -172,9 +172,9 @@ cdef class Column: modify size or offset in any way, so the passed mask is expected to be compatible with the current offset. """ - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, Buffer): raise TypeError( - "Expected a DeviceBufferLike or None for mask, " + "Expected a Buffer or None for mask, " f"got {type(value).__name__}" ) @@ -182,7 +182,7 @@ cdef class Column: required_size = bitmask_allocation_size_bytes(self.base_size) if value.size < required_size: error_msg = ( - "The DeviceBufferLike for mask is smaller than expected, " + "The Buffer for mask is smaller than expected, " f"got {value.size} bytes, expected {required_size} bytes." ) if self.offset > 0 or self.size < self.base_size: @@ -227,30 +227,30 @@ cdef class Column: if isinstance(value, Column): value = value.data_array_view value = cp.asarray(value).view('|u1') - mask = as_device_buffer_like(value) + mask = as_buffer(value) if mask.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) if mask.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_device(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) elif hasattr(value, "__array_interface__"): value = np.asarray(value).view("u1")[:mask_size] if value.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) elif PyObject_CheckBuffer(value): value = np.asarray(value).view("u1")[:mask_size] if value.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) else: raise TypeError( - "Expected a DeviceBufferLike object or None for mask, " + "Expected a Buffer object or None for mask, " f"got {type(value).__name__}" ) @@ -449,11 +449,11 @@ cdef class Column: cdef column_contents contents = move(c_col.get()[0].release()) data = DeviceBuffer.c_from_unique_ptr(move(contents.data)) - data = as_device_buffer_like(data) + data = as_buffer(data) if null_count > 0: mask = DeviceBuffer.c_from_unique_ptr(move(contents.null_mask)) - mask = as_device_buffer_like(mask) + mask = as_buffer(mask) else: mask = None @@ -478,8 +478,8 @@ cdef class Column: Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it, along with referencing an ``owner`` Python object that owns the memory lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and - make the owner of each newly created ``DeviceBufferLike`` the - respective ``DeviceBufferLike`` from the ``owner`` ``cudf.Column``. + make the owner of each newly created ``Buffer`` the + respective ``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is ``None``, we allocate new memory for the resulting ``cudf.Column``. """ @@ -504,7 +504,7 @@ cdef class Column: if data_ptr: if data_owner is None: - data = as_device_buffer_like( + data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, size=(size+offset) * dtype.itemsize) ) @@ -515,7 +515,7 @@ cdef class Column: owner=data_owner ) else: - data = as_device_buffer_like( + data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, size=0) ) @@ -545,7 +545,7 @@ cdef class Column: # result: mask = None else: - mask = as_device_buffer_like( + mask = as_buffer( rmm.DeviceBuffer( ptr=mask_ptr, size=bitmask_allocation_size_bytes(size+offset) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index ed858034032..75e2d3bfbdc 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -19,7 +19,7 @@ from cudf._lib.utils cimport ( table_view_from_table, ) -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer @@ -31,7 +31,7 @@ cpdef concat_masks(object columns): with nogil: c_result = move(libcudf_concatenate_masks(c_views)) c_unique_result = make_unique[device_buffer](move(c_result)) - return as_device_buffer_like( + return as_buffer( DeviceBuffer.c_from_unique_ptr(move(c_unique_result)) ) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 976fe0e78fc..61988019c70 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -22,7 +22,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport mask_state, size_type from cudf._lib.utils cimport table_view_from_columns -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer class MaskState(Enum): @@ -52,7 +52,7 @@ def copy_bitmask(Column col): up_db = make_unique[device_buffer](move(db)) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -98,7 +98,7 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED): up_db = make_unique[device_buffer](move(db)) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -110,7 +110,7 @@ def bitmask_and(columns: list): c_result = move(cpp_bitmask_and(c_view)) up_db = make_unique[device_buffer](move(c_result.first)) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(dbuf) + buf = as_buffer(dbuf) return buf, c_result.second @@ -122,5 +122,5 @@ def bitmask_or(columns: list): c_result = move(cpp_bitmask_or(c_view)) up_db = make_unique[device_buffer](move(c_result.first)) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(dbuf) + buf = as_buffer(dbuf) return buf, c_result.second diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 5fa45f68357..5d124839e4d 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -6,7 +6,7 @@ from numba.np import numpy_support import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.utils import cudautils from cython.operator cimport dereference @@ -40,7 +40,7 @@ from cudf._lib.utils cimport ( def bools_to_mask(Column col): """ Given an int8 (boolean) column, compress the data from booleans to bits and - return a DeviceBufferLike + return a Buffer """ cdef column_view col_view = col.view() cdef pair[unique_ptr[device_buffer], size_type] cpp_out @@ -52,7 +52,7 @@ def bools_to_mask(Column col): up_db = move(cpp_out.first) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -61,9 +61,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.buffer.DeviceBufferLike): + if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.DeviceBufferLike") + "cudf.core.buffer.Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result @@ -88,7 +88,7 @@ def nans_to_nulls(Column input): return None buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) - buffer = as_device_buffer_like(buffer) + buffer = as_buffer(buffer) return buffer diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index e0bdc7d8f74..643a1adca9f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -341,8 +341,8 @@ cdef data_from_table_view( along with referencing an ``owner`` Python object that owns the memory lifetime. If ``owner`` is a Frame we reach inside of it and reach inside of each ``cudf.Column`` to make the owner of each newly - created ``DeviceBufferLike`` underneath the ``cudf.Column`` objects of the - created Frame the respective ``DeviceBufferLike`` from the relevant + created ``Buffer`` underneath the ``cudf.Column`` objects of the + created Frame the respective ``Buffer`` from the relevant ``cudf.Column`` of the ``owner`` Frame """ cdef size_type column_idx = 0 diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index dcbf96313a7..1c8874a2abd 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,20 +1,10 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import sys - -import rmm +import pickle import cudf -if sys.version_info < (3, 8): - try: - import pickle5 as pickle - except ImportError: - import pickle # type: ignore -else: - import pickle # type: ignore - class Serializable: """A serializable object composed of device memory buffers. @@ -90,14 +80,14 @@ def device_serialize(self): header : dict The metadata required to reconstruct the object. frames : list - The DeviceBufferLike or memoryview objects that the object + The Buffer or memoryview objects that the object should contain. :meta private: """ header, frames = self.serialize() assert all( - isinstance(f, (cudf.core.buffer.DeviceBufferLike, memoryview)) + isinstance(f, (cudf.core.buffer.Buffer, memoryview)) for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) @@ -132,18 +122,10 @@ def device_deserialize(cls, header, frames): """ typ = pickle.loads(header["type-serialized"]) frames = [ - cudf.core.buffer.as_device_buffer_like(f) if c else memoryview(f) + cudf.core.buffer.as_buffer(f) if c else memoryview(f) for c, f in zip(header["is-cuda"], frames) ] - assert all( - (type(f._owner) is rmm.DeviceBuffer) - if c - else (type(f) is memoryview) - for c, f in zip(header["is-cuda"], frames) - ) - obj = typ.deserialize(header, frames) - - return obj + return typ.deserialize(header, frames) def host_serialize(self): """Serialize data and metadata associated with host memory. @@ -186,7 +168,7 @@ def host_deserialize(cls, header, frames): :meta private: """ frames = [ - rmm.DeviceBuffer.to_device(f) if c else f + cudf.core.buffer.as_buffer(f) if c else f for c, f in zip(header["is-cuda"], map(memoryview, frames)) ] obj = cls.device_deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py new file mode 100644 index 00000000000..93551f084ba --- /dev/null +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from cudf.core.buffer.buffer import Buffer +from cudf.core.buffer.utils import as_buffer diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer/buffer.py similarity index 63% rename from python/cudf/cudf/core/buffer.py rename to python/cudf/cudf/core/buffer/buffer.py index 647e747e127..cc582ed041e 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -4,17 +4,7 @@ import math import pickle -from typing import ( - Any, - Dict, - List, - Mapping, - Protocol, - Sequence, - Tuple, - Union, - runtime_checkable, -) +from typing import Any, Dict, Mapping, Sequence, Tuple, Union import numpy as np @@ -24,113 +14,13 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes -# Frame type for serialization and deserialization of `DeviceBufferLike` -Frame = Union[memoryview, "DeviceBufferLike"] - - -@runtime_checkable -class DeviceBufferLike(Protocol): - def __getitem__(self, key: slice) -> DeviceBufferLike: - """Create a new view of the buffer.""" - - @property - def size(self) -> int: - """Size of the buffer in bytes.""" - - @property - def nbytes(self) -> int: - """Size of the buffer in bytes.""" - - @property - def ptr(self) -> int: - """Device pointer to the start of the buffer.""" - - @property - def owner(self) -> Any: - """Object owning the memory of the buffer.""" - - @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" - - def memoryview(self) -> memoryview: - """Read-only access to the buffer through host memory.""" - - def serialize(self) -> Tuple[dict, List[Frame]]: - """Serialize the buffer into header and frames. - - The frames can be a mixture of memoryview and device-buffer-like - objects. - - Returns - ------- - Tuple[Dict, List] - The first element of the returned tuple is a dict containing any - serializable metadata required to reconstruct the object. The - second element is a list containing the device buffers and - memoryviews of the object. - """ - - @classmethod - def deserialize( - cls, header: dict, frames: List[Frame] - ) -> DeviceBufferLike: - """Generate an buffer from a serialized representation. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The device-buffer-like and memoryview buffers that the object - should contain. - - Returns - ------- - DeviceBufferLike - A new object that implements DeviceBufferLike. - """ - - -def as_device_buffer_like(obj: Any) -> DeviceBufferLike: - """ - Factory function to wrap `obj` in a DeviceBufferLike object. - - If `obj` isn't device-buffer-like already, a new buffer that implements - DeviceBufferLike and points to the memory of `obj` is created. If `obj` - represents host memory, it is copied to a new `rmm.DeviceBuffer` device - allocation. Otherwise, the data of `obj` is **not** copied, instead the - new buffer keeps a reference to `obj` in order to retain the lifetime - of `obj`. - - Raises ValueError if the data of `obj` isn't C-contiguous. - - Parameters - ---------- - obj : buffer-like or array-like - An object that exposes either device or host memory through - `__array_interface__`, `__cuda_array_interface__`, or the - buffer protocol. If `obj` represents host memory, data will - be copied. - - Return - ------ - DeviceBufferLike - A device-buffer-like instance that represents the device memory - of `obj`. - """ - - if isinstance(obj, DeviceBufferLike): - return obj - return Buffer(obj) - class Buffer(Serializable): """ A Buffer represents device memory. - Usually Buffers will be created using `as_device_buffer_like(obj)`, - which will make sure that `obj` is device-buffer-like and not a `Buffer` + Usually Buffers will be created using `as_buffer(obj)`, + which will make sure that `obj` is buffer and not a `Buffer` necessarily. Parameters @@ -191,15 +81,25 @@ def __init__( self._size = buf.size self._owner = buf + def _getitem(self, offset: int, size: int) -> Buffer: + """ + Sub-classes can overwrite this to implement __getitem__ + without having to handle non-slice inputs. + """ + return self.__class__( + data=self.ptr + offset, size=size, owner=self.owner + ) + def __getitem__(self, key: slice) -> Buffer: if not isinstance(key, slice): - raise ValueError("index must be an slice") + raise TypeError( + "Argument 'key' has incorrect type " + f"(expected slice, got {key.__class__.__name__})" + ) start, stop, step = key.indices(self.size) if step != 1: - raise ValueError("slice must be contiguous") - return self.__class__( - data=self.ptr + start, size=stop - start, owner=self.owner - ) + raise ValueError("slice must be C-contiguous") + return self._getitem(offset=start, size=stop - start) @property def size(self) -> int: diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py new file mode 100644 index 00000000000..5d189632281 --- /dev/null +++ b/python/cudf/cudf/core/buffer/utils.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from __future__ import annotations + +from typing import Any + +from cudf.core.buffer.buffer import Buffer + + +def as_buffer(obj: Any) -> Buffer: + """ + Factory function to wrap `obj` in a Buffer object. + + If `obj` isn't buffer already, a new buffer that points to the memory of + `obj` is created. If `obj` represents host memory, it is copied to a new + `rmm.DeviceBuffer` device allocation. Otherwise, the data of `obj` is + **not** copied, instead the new buffer keeps a reference to `obj` in order + to retain the lifetime of `obj`. + + Raises ValueError if the data of `obj` isn't C-contiguous. + + Parameters + ---------- + obj : buffer-like or array-like + An object that exposes either device or host memory through + `__array_interface__`, `__cuda_array_interface__`, or the + buffer protocol. If `obj` represents host memory, data will + be copied. + + Return + ------ + Buffer + A buffer instance that represents the device memory + of `obj`. + """ + + if isinstance(obj, Buffer): + return obj + return Buffer(obj) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index af5d140a20a..322092a149c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -16,7 +16,7 @@ from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import is_categorical_dtype, is_interval_dtype -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype @@ -595,7 +595,7 @@ class CategoricalColumn(column.ColumnBase): Parameters ---------- dtype : CategoricalDtype - mask : DeviceBufferLike + mask : Buffer The validity mask offset : int Data offset @@ -619,7 +619,7 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -678,7 +678,7 @@ def _process_values_for_isin( rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Optional[DeviceBufferLike]): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7291b695312..31ada81b7fd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,7 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -357,7 +357,7 @@ def valid_count(self) -> int: return len(self) - self.null_count @property - def nullmask(self) -> DeviceBufferLike: + def nullmask(self) -> Buffer: """The gpu buffer for the null-mask""" if not self.nullable: raise ValueError("Column has no null mask") @@ -761,12 +761,12 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: res = res.drop_duplicates(subset="orig_order", ignore_index=True) return res._data["bool"].fillna(False) - def as_mask(self) -> DeviceBufferLike: + def as_mask(self) -> Buffer: """Convert booleans to bitmask Returns ------- - DeviceBufferLike + Buffer """ if self.has_nulls(): @@ -1281,7 +1281,7 @@ def column_empty( data = None children = ( build_column( - data=as_device_buffer_like( + data=as_buffer( rmm.DeviceBuffer( size=row_count * cudf.dtype("int32").itemsize ) @@ -1294,7 +1294,7 @@ def column_empty( children = ( full(row_count + 1, 0, dtype="int32"), build_column( - data=as_device_buffer_like( + data=as_buffer( rmm.DeviceBuffer( size=row_count * cudf.dtype("int8").itemsize ) @@ -1303,9 +1303,7 @@ def column_empty( ), ) else: - data = as_device_buffer_like( - rmm.DeviceBuffer(size=row_count * dtype.itemsize) - ) + data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) if masked: mask = create_null_mask(row_count, state=MaskState.ALL_NULL) @@ -1318,11 +1316,11 @@ def column_empty( def build_column( - data: Union[DeviceBufferLike, None], + data: Union[Buffer, None], dtype: Dtype, *, size: int = None, - mask: DeviceBufferLike = None, + mask: Buffer = None, offset: int = 0, null_count: int = None, children: Tuple[ColumnBase, ...] = (), @@ -1332,12 +1330,12 @@ def build_column( Parameters ---------- - data : DeviceBufferLike + data : Buffer The data buffer (can be None if constructing certain Column types like StringColumn, ListColumn, or CategoricalColumn) dtype The dtype associated with the Column to construct - mask : DeviceBufferLike, optional + mask : Buffer, optional The mask buffer size : int, optional offset : int, optional @@ -1482,7 +1480,7 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1498,7 +1496,7 @@ def build_categorical_column( codes : Column Column of codes, the size of the resulting Column will be the size of `codes` - mask : DeviceBufferLike + mask : Buffer Null mask size : int, optional offset : int, optional @@ -1542,7 +1540,7 @@ def build_interval_column( Column of values representing the left of the interval right_col : Column Column of representing the right of the interval - mask : DeviceBufferLike + mask : Buffer Null mask size : int, optional offset : int, optional @@ -1573,7 +1571,7 @@ def build_interval_column( def build_list_column( indices: ColumnBase, elements: ColumnBase, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1587,7 +1585,7 @@ def build_list_column( Column of list indices elements : ColumnBase Column of list elements - mask: DeviceBufferLike + mask: Buffer Null mask size: int, optional offset: int, optional @@ -1619,7 +1617,7 @@ def build_struct_column( names: Sequence[str], children: Tuple[ColumnBase, ...], dtype: Optional[Dtype] = None, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1633,7 +1631,7 @@ def build_struct_column( Field names to map to children dtypes, must be strings. children : tuple - mask: DeviceBufferLike + mask: Buffer Null mask size: int, optional offset: int, optional @@ -1669,9 +1667,7 @@ def _make_copy_replacing_NaT_with_null(column): out_col = cudf._lib.replace.replace( column, build_column( - as_device_buffer_like( - np.array([na_value], dtype=column.dtype).view("|u1") - ), + as_buffer(np.array([na_value], dtype=column.dtype).view("|u1")), dtype=column.dtype, ), null, @@ -1766,7 +1762,7 @@ def as_column( ): arbitrary = cupy.ascontiguousarray(arbitrary) - data = as_device_buffer_like(arbitrary) + data = as_buffer(arbitrary) col = build_column(data, dtype=current_dtype, mask=mask) if dtype is not None: @@ -1914,7 +1910,7 @@ def as_column( if cast_dtype: arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) - buffer = as_device_buffer_like(arbitrary.view("|u1")) + buffer = as_buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: data = build_column(buffer, dtype=arbitrary.dtype) @@ -1932,7 +1928,7 @@ def as_column( if cast_dtype: arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) - buffer = as_device_buffer_like(arbitrary.view("|u1")) + buffer = as_buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: data = build_column(buffer, dtype=arbitrary.dtype) @@ -2211,7 +2207,7 @@ def _construct_array( return arbitrary -def _mask_from_cuda_array_interface_desc(obj) -> Union[DeviceBufferLike, None]: +def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: desc = obj.__cuda_array_interface__ mask = desc.get("mask", None) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1419b14e8c6..375a19f5423 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -23,7 +23,7 @@ ) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_120 -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.utils import _fillna_natwise @@ -98,11 +98,11 @@ class DatetimeColumn(column.ColumnBase): Parameters ---------- - data : DeviceBufferLike + data : Buffer The datetime values dtype : np.dtype The data type - mask : DeviceBufferLike; optional + mask : Buffer; optional The validity mask """ @@ -121,9 +121,9 @@ class DatetimeColumn(column.ColumnBase): def __init__( self, - data: DeviceBufferLike, + data: Buffer, dtype: DtypeObj, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, # TODO: make non-optional offset: int = 0, null_count: int = None, @@ -131,9 +131,7 @@ def __init__( dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: - raise ValueError( - "DeviceBufferLike size must be divisible by element size" - ) + raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index e03802e6d8c..0beb07bb591 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -16,7 +16,7 @@ ) from cudf._typing import ColumnBinaryOperand, Dtype from cudf.api.types import is_integer_dtype, is_scalar -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase, as_column from cudf.core.dtypes import ( Decimal32Dtype, @@ -203,7 +203,7 @@ def from_arrow(cls, data: pa.Array): data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) data_32 = data_128[::4].copy() return cls( - data=as_device_buffer_like(data_32.view("uint8")), + data=as_buffer(data_32.view("uint8")), size=len(data), dtype=dtype, offset=data.offset, @@ -290,7 +290,7 @@ def from_arrow(cls, data: pa.Array): data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) data_64 = data_128[::2].copy() return cls( - data=as_device_buffer_like(data_64.view("uint8")), + data=as_buffer(data_64.view("uint8")), size=len(data), dtype=dtype, offset=data.offset, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a66c11c8bdc..32bda1e0b6c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -36,7 +36,7 @@ is_number, is_scalar, ) -from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer from cudf.core.column import ( ColumnBase, as_column, @@ -66,10 +66,10 @@ class NumericalColumn(NumericalBaseColumn): Parameters ---------- - data : DeviceBufferLike + data : Buffer dtype : np.dtype - The dtype associated with the data DeviceBufferLike - mask : DeviceBufferLike, optional + The dtype associated with the data Buffer + mask : Buffer, optional """ _nan_count: Optional[int] @@ -77,9 +77,9 @@ class NumericalColumn(NumericalBaseColumn): def __init__( self, - data: DeviceBufferLike, + data: Buffer, dtype: DtypeObj, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, # TODO: make this non-optional offset: int = 0, null_count: int = None, @@ -87,9 +87,7 @@ def __init__( dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: - raise ValueError( - "DeviceBufferLike size must be divisible by element size" - ) + raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset self._nan_count = None @@ -306,7 +304,7 @@ def normalize_binop_value( else: ary = full(len(self), other, dtype=other_dtype) return column.build_column( - data=as_device_buffer_like(ary), + data=as_buffer(ary), dtype=ary.dtype, mask=self.mask, ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c84e4ff4adb..1993d33e468 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -33,7 +33,7 @@ is_scalar, is_string_dtype, ) -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -5173,7 +5173,7 @@ class StringColumn(column.ColumnBase): Parameters ---------- - mask : DeviceBufferLike + mask : Buffer The validity mask offset : int Data offset @@ -5207,7 +5207,7 @@ class StringColumn(column.ColumnBase): def __init__( self, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, # TODO: make non-optional offset: int = 0, null_count: int = None, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e6d688014fa..3dc923e7ded 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -13,7 +13,7 @@ from cudf import _lib as libcudf from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _fillna_natwise @@ -40,13 +40,13 @@ class TimeDeltaColumn(ColumnBase): """ Parameters ---------- - data : DeviceBufferLike + data : Buffer The Timedelta values dtype : np.dtype The data type size : int Size of memory allocation. - mask : DeviceBufferLike; optional + mask : Buffer; optional The validity mask offset : int Data offset @@ -78,19 +78,17 @@ class TimeDeltaColumn(ColumnBase): def __init__( self, - data: DeviceBufferLike, + data: Buffer, dtype: Dtype, size: int = None, # TODO: make non-optional - mask: DeviceBufferLike = None, + mask: Buffer = None, offset: int = 0, null_count: int = None, ): dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: - raise ValueError( - "DeviceBufferLike size must be divisible by element size" - ) + raise ValueError("Buffer size must be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index d770f4f6130..cdd4da364c1 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -18,7 +18,7 @@ from numba.cuda import as_cuda_array import cudf -from cudf.core.buffer import Buffer, DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import as_column, build_categorical_column, build_column # Implementation of interchange protocol classes @@ -64,12 +64,12 @@ class _CuDFBuffer: def __init__( self, - buf: DeviceBufferLike, + buf: Buffer, dtype: np.dtype, allow_copy: bool = True, ) -> None: """ - Use DeviceBufferLike object. + Use Buffer object. """ # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -80,7 +80,7 @@ def __init__( @property def bufsize(self) -> int: """ - The DeviceBufferLike size in bytes. + The Buffer size in bytes. """ return self._buf.size @@ -627,7 +627,7 @@ def __dataframe__( Notes ----- -- Interpreting a raw pointer (as in ``DeviceBufferLike.ptr``) is annoying and +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to do in pure Python. It's more general but definitely less friendly than having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack ``__dlpack__`` (e.g., because the column dtype isn't supported by diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 84f528549e9..25b1b3895de 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -20,7 +20,7 @@ from cudf._typing import Dtype from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply if PANDAS_GE_150: @@ -592,7 +592,7 @@ def serialize(self) -> Tuple[dict, list]: header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - frames: List[DeviceBufferLike] = [] + frames: List[Buffer] = [] fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0628497fc29..f4f960f3274 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2974,7 +2974,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: Currently supported inputs are: * ``Column`` - * ``DeviceBufferLike`` + * ``Buffer`` * ``Series`` * ``Index`` * numba device array diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index eaa615a2839..30c75d07d07 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -4,7 +4,7 @@ import cupy as cp import pytest -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer arr_len = 10 @@ -23,10 +23,10 @@ def test_buffer_from_cuda_iface_contiguous(data): data, expect_success = data if expect_success: - as_device_buffer_like(data.view("|u1")) + as_buffer(data.view("|u1")) else: with pytest.raises(ValueError): - as_device_buffer_like(data.view("|u1")) + as_buffer(data.view("|u1")) @pytest.mark.parametrize( @@ -41,17 +41,17 @@ def test_buffer_from_cuda_iface_contiguous(data): @pytest.mark.parametrize("dtype", ["uint8", "int8", "float32", "int32"]) def test_buffer_from_cuda_iface_dtype(data, dtype): data = data.astype(dtype) - buf = as_device_buffer_like(data) + buf = as_buffer(data) got = cp.array(buf).reshape(-1).view("uint8") expect = data.reshape(-1).view("uint8") assert (expect == got).all() -@pytest.mark.parametrize("creator", [Buffer, as_device_buffer_like]) +@pytest.mark.parametrize("creator", [Buffer, as_buffer]) def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): ary = cp.arange(arr_len) b = creator(ary) - assert isinstance(b, DeviceBufferLike) + assert isinstance(b, Buffer) assert ary.__cuda_array_interface__["data"][0] == b.ptr assert ary.nbytes == b.size @@ -66,7 +66,7 @@ def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): ) def test_buffer_repr(size, expect): ary = cp.arange(size, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) assert f"size={expect}" in repr(buf) @@ -83,7 +83,7 @@ def test_buffer_repr(size, expect): ) def test_buffer_slice(idx): ary = cp.arange(arr_len, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) expect = ary[idx] got = cp.array(buf[idx]) assert (expect == got).all() @@ -101,7 +101,7 @@ def test_buffer_slice(idx): ) def test_buffer_slice_fail(idx, err_msg): ary = cp.arange(arr_len, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) with pytest.raises(ValueError, match=err_msg): buf[idx] diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 4e2a26d31bd..467c88b200f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -406,7 +406,7 @@ def test_column_view_string_slice(slc): ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( - cudf.core.buffer.as_device_buffer_like(data), dtype=data.dtype + cudf.core.buffer.as_buffer(data), dtype=data.dtype ) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 2a62a289747..9b9709b52c3 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -179,9 +179,7 @@ def test_cuda_array_interface_pytorch(): got = cudf.Series(tensor) assert_eq(got, series) - buffer = cudf.core.buffer.as_device_buffer_like( - cupy.ones(10, dtype=np.bool_) - ) + buffer = cudf.core.buffer.as_buffer(cupy.ones(10, dtype=np.bool_)) tensor = torch.tensor(buffer) got = cudf.Series(tensor, dtype=np.bool_) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 1427a214a72..21343f19d79 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -7,7 +7,7 @@ import pytest from cudf import DataFrame, GenericIndex, RangeIndex, Series -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.testing._utils import assert_eq if sys.version_info < (3, 8): @@ -97,7 +97,7 @@ def test_pickle_index(): def test_pickle_buffer(): arr = np.arange(10).view("|u1") - buf = as_device_buffer_like(arr) + buf = as_buffer(arr) assert buf.size == arr.nbytes pickled = pickle.dumps(buf) unpacked = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 60f01d567ef..c3dfeac9a3f 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -429,7 +429,7 @@ def test_assert_column_memory_slice(arrow_arrays): def test_assert_column_memory_basic_same(arrow_arrays): data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - buf = cudf.core.buffer.as_device_buffer_like(data.base_data) + buf = cudf.core.buffer.as_buffer(data.base_data) left = cudf.core.column.build_column(buf, dtype=np.int32) right = cudf.core.column.build_column(buf, dtype=np.int32) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 63bc6d59524..6c9ed61c7a5 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -15,7 +15,7 @@ import cudf import cudf.api.types from cudf.core import column -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer # The size of the mask in bytes mask_dtype = cudf.api.types.dtype(np.int32) @@ -283,8 +283,8 @@ def pa_mask_buffer_to_mask(mask_buf, size): if mask_buf.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) - return as_device_buffer_like(dbuf) - return as_device_buffer_like(mask_buf) + return as_buffer(dbuf) + return as_buffer(mask_buf) def _isnat(val): From 28f86bd8b0185d865aa9d5e890b0c11f19d82d88 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 27 Oct 2022 10:19:17 +0200 Subject: [PATCH 02/16] fix test_buffer_slice_fail --- python/cudf/cudf/tests/test_buffer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 30c75d07d07..96023903253 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -90,18 +90,18 @@ def test_buffer_slice(idx): @pytest.mark.parametrize( - "idx, err_msg", + "idx, err_type, err_msg", [ - (1, "index must be an slice"), - (slice(3, 2), "size cannot be negative"), - (slice(1, 2, 2), "slice must be contiguous"), - (slice(1, 2, -1), "slice must be contiguous"), - (slice(3, 2, -1), "slice must be contiguous"), + (1, TypeError, "Argument 'key' has incorrect type"), + (slice(3, 2), ValueError, "size cannot be negative"), + (slice(1, 2, 2), ValueError, "slice must be C-contiguous"), + (slice(1, 2, -1), ValueError, "slice must be C-contiguous"), + (slice(3, 2, -1), ValueError, "slice must be C-contiguous"), ], ) -def test_buffer_slice_fail(idx, err_msg): +def test_buffer_slice_fail(idx, err_type, err_msg): ary = cp.arange(arr_len, dtype="uint8") buf = as_buffer(ary) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(err_type, match=err_msg): buf[idx] From 3b5def9cd28a9734047a83b491c457c98188fa05 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 27 Oct 2022 11:08:19 +0200 Subject: [PATCH 03/16] generalize deserialize() --- python/cudf/cudf/core/buffer/buffer.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index cc582ed041e..2257aee8fb8 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -144,19 +144,12 @@ def serialize(self) -> Tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list) -> Buffer: - assert ( - header["frame_count"] == 1 - ), "Only expecting to deserialize Buffer with a single frame." - buf = cls(frames[0], **header["constructor-kwargs"]) - - if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: - raise ValueError( - f"Received a `Buffer` with the wrong size." - f" Expected {header['desc']['shape']}, " - f"but got {buf.__cuda_array_interface__['shape']}" - ) - - return buf + if header["frame_count"] != 1: + raise ValueError("Deserializing a Buffer expect a single frame") + frame = frames[0] + if isinstance(frame, cls): + return frame # The frame is already deserialized + return cls(frame) def __repr__(self) -> str: return ( From 6ec27ae1d4b5c4541c1d3f22697c3894106f0e7b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 27 Oct 2022 11:22:06 +0200 Subject: [PATCH 04/16] impl. _init_from_host_memory() --- python/cudf/cudf/core/buffer/buffer.py | 61 ++++++++++++++++---------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 2257aee8fb8..1352f3b1711 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle from typing import Any, Dict, Mapping, Sequence, Tuple, Union -import numpy as np +import numpy import rmm @@ -54,32 +54,45 @@ def __init__( self._ptr = data self._size = size self._owner = owner - else: - if size is not None or owner is not None: - raise ValueError( - "`size` and `owner` must be None when " - "`data` is a buffer-like object" - ) - - # `data` is a buffer-like object - buf: Any = data - if isinstance(buf, rmm.DeviceBuffer): - self._ptr = buf.ptr - self._size = buf.size - self._owner = buf - return - iface = getattr(buf, "__cuda_array_interface__", None) - if iface: - ptr, size = get_ptr_and_size(iface) - self._ptr = ptr - self._size = size - self._owner = buf - return - ptr, size = get_ptr_and_size(np.asarray(buf).__array_interface__) - buf = rmm.DeviceBuffer(ptr=ptr, size=size) + return + if size is not None or owner is not None: + raise ValueError( + "`size` and `owner` must be None when " + "`data` is a buffer-like object" + ) + # `data` is a buffer-like or array-like object + buf: Any = data + if isinstance(buf, rmm.DeviceBuffer): self._ptr = buf.ptr self._size = buf.size self._owner = buf + return + iface = getattr(buf, "__cuda_array_interface__", None) + if iface: + self._ptr, self._size = get_ptr_and_size(iface) + self._owner = buf + return + # At this point, `buf` must represents host memory, let's deligate + # to `._init_from_host_memory()` + buf = memoryview(buf) + if not buf.c_contiguous: + raise ValueError("`data` must be C-contiguous") + self._init_from_host_memory(buf) + + def _init_from_host_memory(self, data: memoryview) -> None: + """Initialize in the case where `data` represents host memory. + + Sub-classes can overwrite this and still use `super().__init__()` + to handle the (trivial) case where `data` represents device memory. + + This default implemention copies `data` to a newly allocated RMM + device buffer. + """ + ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) + buf = rmm.DeviceBuffer(ptr=ptr, size=size) + self._ptr = buf.ptr + self._size = buf.size + self._owner = buf def _getitem(self, offset: int, size: int) -> Buffer: """ From cb1b2243851c8c6ede9d9c92f3eec876bdeb94d0 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 27 Oct 2022 11:43:46 +0200 Subject: [PATCH 05/16] doc --- python/cudf/cudf/_lib/column.pyx | 4 ++-- python/cudf/cudf/core/buffer/utils.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 5b7f89f5881..7ad02404633 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -478,8 +478,8 @@ cdef class Column: Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it, along with referencing an ``owner`` Python object that owns the memory lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and - make the owner of each newly created ``Buffer`` the - respective ``Buffer`` from the ``owner`` ``cudf.Column``. + make the owner of each newly created ``Buffer`` the respective + ``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is ``None``, we allocate new memory for the resulting ``cudf.Column``. """ diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 5d189632281..9139509e0cb 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -30,8 +30,7 @@ def as_buffer(obj: Any) -> Buffer: Return ------ Buffer - A buffer instance that represents the device memory - of `obj`. + A buffer instance that represents the device memory of `obj`. """ if isinstance(obj, Buffer): From f5e49a9044f3237a124506972e9f95f8e27ba0fc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 10:12:11 +0200 Subject: [PATCH 06/16] deligate to as_buffer --- python/cudf/cudf/core/buffer/buffer.py | 29 ++++++++-- python/cudf/cudf/core/buffer/utils.py | 75 +++++++++++++++++++------- 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 1352f3b1711..df583e98edb 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -16,10 +16,9 @@ class Buffer(Serializable): - """ - A Buffer represents device memory. + """A Buffer represents device memory. - Usually Buffers will be created using `as_buffer(obj)`, + Usually buffers will be created using `as_buffer(obj)`, which will make sure that `obj` is buffer and not a `Buffer` necessarily. @@ -94,6 +93,30 @@ def _init_from_host_memory(self, data: memoryview) -> None: self._size = buf.size self._owner = buf + @classmethod + def from_device_memory(cls, data: Any) -> Buffer: + ret = cls.__new__(cls) + if isinstance(data, rmm.DeviceBuffer): + ret._ptr = data.ptr + ret._size = data.size + ret._owner = data + else: + ret._ptr, ret._size = get_ptr_and_size( + data.__cuda_array_interface__ + ) + ret._owner = data + return ret + + @classmethod + def from_host_memory(cls, data: Any) -> Buffer: + ret = cls.__new__(cls) + ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) + buf = rmm.DeviceBuffer(ptr=ptr, size=size) + ret._ptr = buf.ptr + ret._size = buf.size + ret._owner = buf + return ret + def _getitem(self, offset: int, size: int) -> Buffer: """ Sub-classes can overwrite this to implement __getitem__ diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 9139509e0cb..a21a21c506d 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -2,37 +2,72 @@ from __future__ import annotations -from typing import Any +from types import SimpleNamespace +from typing import Any, Union from cudf.core.buffer.buffer import Buffer -def as_buffer(obj: Any) -> Buffer: - """ - Factory function to wrap `obj` in a Buffer object. - - If `obj` isn't buffer already, a new buffer that points to the memory of - `obj` is created. If `obj` represents host memory, it is copied to a new - `rmm.DeviceBuffer` device allocation. Otherwise, the data of `obj` is - **not** copied, instead the new buffer keeps a reference to `obj` in order - to retain the lifetime of `obj`. +def as_buffer( + data: Union[int, Any], + *, + size: int = None, + owner: object = None, +) -> Buffer: + """Factory function to wrap `data` in a Buffer object. - Raises ValueError if the data of `obj` isn't C-contiguous. + If `data` isn't a buffer already, a new buffer that points to the memory of + `data` is created. If `data` represents host memory, it is copied to a new + `rmm.DeviceBuffer` device allocation. Otherwise, the memory of `data` is + **not** copied, instead the new buffer keeps a reference to `data` in order + to retain its lifetime. Parameters ---------- - obj : buffer-like or array-like - An object that exposes either device or host memory through - `__array_interface__`, `__cuda_array_interface__`, or the - buffer protocol. If `obj` represents host memory, data will - be copied. + data : int or buffer-like or array-like + An integer representing a pointer to device memory or a buffer-like + or array-like object. When not an integer, `size` and `owner` must + be None. + size : int, optional + Size of device memory in bytes. Must be specified if `data` is an + integer. + owner : object, optional + Python object to which the lifetime of the memory allocation is tied. + A reference to this object is kept in the returned Buffer. Return ------ Buffer - A buffer instance that represents the device memory of `obj`. + A buffer instance that represents the device memory of `data`. """ - if isinstance(obj, Buffer): - return obj - return Buffer(obj) + if isinstance(data, Buffer): + return data + + # We handle the integer argument in the factory function by wrapping + # the pointer in a `__cuda_array_interface__` exposing object so that + # the Buffer (and its sub-classes) do not have to. + if isinstance(data, int): + if size is None: + raise ValueError( + "size must be specified when `data` is an integer" + ) + data = SimpleNamespace( + __cuda_array_interface__={ + "data": (data, False), + "shape": (size,), + "strides": None, + "typestr": "|u1", + "version": 0, + }, + owner=owner, + ) + elif size is not None or owner is not None: + raise ValueError( + "`size` and `owner` must be None when " + "`data` is a buffer-like or array-like object" + ) + + if hasattr(data, "__cuda_array_interface__"): + return Buffer.from_device_memory(data) + return Buffer.from_host_memory(data) From 0286814fae5347f6241cc67f3ff648e5403b21a7 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 10:16:45 +0200 Subject: [PATCH 07/16] use as_buffer() everywhere --- python/cudf/cudf/_lib/column.pyx | 4 ++-- python/cudf/cudf/_lib/copying.pyx | 4 ++-- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/df_protocol.py | 14 ++++++-------- python/cudf/cudf/tests/test_buffer.py | 8 +++----- python/cudf/cudf/tests/test_df_protocol.py | 4 ++-- .../strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 2 +- 7 files changed, 17 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7ad02404633..1fb0d9e3169 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -509,7 +509,7 @@ cdef class Column: size=(size+offset) * dtype.itemsize) ) else: - data = Buffer( + data = as_buffer( data=data_ptr, size=(base_size) * dtype.itemsize, owner=data_owner @@ -552,7 +552,7 @@ cdef class Column: ) ) else: - mask = Buffer( + mask = as_buffer( data=mask_ptr, size=bitmask_allocation_size_bytes(base_size), owner=mask_owner diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index a9cfbbbe223..3186c5e2da5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -14,7 +14,7 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer import cudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, as_buffer from cudf._lib.column cimport Column @@ -724,7 +724,7 @@ cdef class _CPackedColumns: header = {} frames = [] - gpu_data = Buffer( + gpu_data = as_buffer( data=self.gpu_data_ptr, size=self.gpu_data_size, owner=self diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31ada81b7fd..22f8d27f9e8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2219,7 +2219,7 @@ def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: typecode = typestr[1] if typecode == "t": mask_size = bitmask_allocation_size_bytes(nelem) - mask = Buffer(data=ptr, size=mask_size, owner=obj) + mask = as_buffer(data=ptr, size=mask_size, owner=obj) elif typecode == "b": col = as_column(mask) mask = bools_to_mask(col) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index cdd4da364c1..b29fc41e5b4 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -18,7 +18,7 @@ from numba.cuda import as_cuda_array import cudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, as_buffer from cudf.core.column import as_column, build_categorical_column, build_column # Implementation of interchange protocol classes @@ -721,7 +721,7 @@ def _protocol_to_cudf_column_numeric( _dbuffer, _ddtype = buffers["data"] _check_buffer_is_on_gpu(_dbuffer) cudfcol_num = build_column( - Buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None), + as_buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None), protocol_dtype_to_cupy_dtype(_ddtype), ) return _set_missing_values(col, cudfcol_num), buffers @@ -751,9 +751,7 @@ def _set_missing_values( valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: bitmask = cp.asarray( - Buffer( - data=valid_mask[0].ptr, size=valid_mask[0].bufsize, owner=None - ), + as_buffer(data=valid_mask[0].ptr, size=valid_mask[0].bufsize), cp.bool8, ) cudf_col[~bitmask] = None @@ -792,7 +790,7 @@ def _protocol_to_cudf_column_categorical( _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column( - Buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize, owner=None), + as_buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize), cdtype, ) @@ -824,7 +822,7 @@ def _protocol_to_cudf_column_string( data_buffer, data_dtype = buffers["data"] _check_buffer_is_on_gpu(data_buffer) encoded_string = build_column( - Buffer(data=data_buffer.ptr, size=data_buffer.bufsize, owner=None), + as_buffer(data=data_buffer.ptr, size=data_buffer.bufsize), protocol_dtype_to_cupy_dtype(data_dtype), ) @@ -834,7 +832,7 @@ def _protocol_to_cudf_column_string( offset_buffer, offset_dtype = buffers["offsets"] _check_buffer_is_on_gpu(offset_buffer) offsets = build_column( - Buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize, owner=None), + as_buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize), protocol_dtype_to_cupy_dtype(offset_dtype), ) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 96023903253..5ed5750f29b 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -1,5 +1,4 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from typing import Callable import cupy as cp import pytest @@ -47,10 +46,9 @@ def test_buffer_from_cuda_iface_dtype(data, dtype): assert (expect == got).all() -@pytest.mark.parametrize("creator", [Buffer, as_buffer]) -def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): +def test_buffer_creation_from_any(): ary = cp.arange(arr_len) - b = creator(ary) + b = as_buffer(ary) assert isinstance(b, Buffer) assert ary.__cuda_array_interface__["data"][0] == b.ptr assert ary.nbytes == b.size @@ -58,7 +56,7 @@ def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): with pytest.raises( ValueError, match="size must be specified when `data` is an integer" ): - Buffer(42) + as_buffer(42) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7b83eec9b63..6f8305e6751 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import as_buffer from cudf.core.column import build_column from cudf.core.df_protocol import ( DataFrameObject, @@ -25,7 +25,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column( - Buffer(data=buf.ptr, size=buf.bufsize, owner=None), + as_buffer(data=buf.ptr, size=buf.bufsize), protocol_dtype_to_cupy_dtype(dtype), ) # check that non null values are the equals as nulls are represented diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index bb1892a4d26..31bff73b1f9 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -21,4 +21,4 @@ def to_string_view_array(Column strings_col): c_buffer = move(cpp_to_string_view_array(input_view)) device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) - return Buffer(device_buffer) + return as_buffer(device_buffer) From 8ebeb9af8826f4068e2f048d91f2a313ae11b35f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 11:12:40 +0200 Subject: [PATCH 08/16] doc --- python/cudf/cudf/core/buffer/buffer.py | 183 +++++++++++++------------ 1 file changed, 95 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index df583e98edb..1619267bdf8 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -4,7 +4,7 @@ import math import pickle -from typing import Any, Dict, Mapping, Sequence, Tuple, Union +from typing import Any, Dict, Mapping, Sequence, Tuple import numpy @@ -18,104 +18,79 @@ class Buffer(Serializable): """A Buffer represents device memory. - Usually buffers will be created using `as_buffer(obj)`, - which will make sure that `obj` is buffer and not a `Buffer` - necessarily. + Usually the factory function `as_buffer` should be used to + create a Buffer instance. Parameters ---------- - data : int or buffer-like or array-like - An integer representing a pointer to device memory or a buffer-like - or array-like object. When not an integer, `size` and `owner` must - be None. - size : int, optional - Size of device memory in bytes. Must be specified if `data` is an - integer. - owner : object, optional + ptr : int + An integer representing a pointer to device memory. + size : int + Size of device memory in bytes. + owner : object Python object to which the lifetime of the memory allocation is tied. - A reference to this object is kept in the returned Buffer. """ _ptr: int _size: int _owner: object - def __init__( - self, data: Union[int, Any], *, size: int = None, owner: object = None - ): - if isinstance(data, int): - if size is None: - raise ValueError( - "size must be specified when `data` is an integer" - ) - if size < 0: - raise ValueError("size cannot be negative") - self._ptr = data - self._size = size - self._owner = owner - return - if size is not None or owner is not None: - raise ValueError( - "`size` and `owner` must be None when " - "`data` is a buffer-like object" - ) - # `data` is a buffer-like or array-like object - buf: Any = data - if isinstance(buf, rmm.DeviceBuffer): - self._ptr = buf.ptr - self._size = buf.size - self._owner = buf - return - iface = getattr(buf, "__cuda_array_interface__", None) - if iface: - self._ptr, self._size = get_ptr_and_size(iface) - self._owner = buf - return - # At this point, `buf` must represents host memory, let's deligate - # to `._init_from_host_memory()` - buf = memoryview(buf) - if not buf.c_contiguous: - raise ValueError("`data` must be C-contiguous") - self._init_from_host_memory(buf) - - def _init_from_host_memory(self, data: memoryview) -> None: - """Initialize in the case where `data` represents host memory. - - Sub-classes can overwrite this and still use `super().__init__()` - to handle the (trivial) case where `data` represents device memory. - - This default implemention copies `data` to a newly allocated RMM - device buffer. - """ - ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) - buf = rmm.DeviceBuffer(ptr=ptr, size=size) - self._ptr = buf.ptr - self._size = buf.size - self._owner = buf + def __init__(self, ptr: int, size: int, owner: object): + if size < 0: + raise ValueError("size cannot be negative") + self._ptr = ptr + self._size = size + self._owner = owner @classmethod def from_device_memory(cls, data: Any) -> Buffer: - ret = cls.__new__(cls) + """Create a Buffer from an object exposing `__cuda_array_interface__`. + + No data is being copied. + + Parameters + ---------- + data : device-buffer-like + An object implementing the CUDA Array Interface. + + Returns + ------- + Buffer + Buffer representing the same device memory as `data` + """ + if isinstance(data, rmm.DeviceBuffer): - ret._ptr = data.ptr - ret._size = data.size - ret._owner = data - else: - ret._ptr, ret._size = get_ptr_and_size( - data.__cuda_array_interface__ - ) - ret._owner = data - return ret + return cls(data.ptr, data.size, owner=data) # Common case shortcut + + ptr, size = get_ptr_and_size(data.__cuda_array_interface__) + return cls(ptr, size, owner=data) @classmethod def from_host_memory(cls, data: Any) -> Buffer: - ret = cls.__new__(cls) + """Create a Buffer from a buffer or array like object + + Data must implement `__array_interface__`, the buffer protocol, and/or + be convertible to a buffer object using `numpy.asarray()` + + The host memory is copied to a new device allocation. + + Parameters + ---------- + data : array-like or buffer-like + An object that represens host memory. + + Returns + ------- + Buffer + Buffer representing a copy of `data`. + """ + + # Extract pointer and size ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) + # And copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) - ret._ptr = buf.ptr - ret._size = buf.size - ret._owner = buf - return ret + # Then we can crate from device memory + return cls.from_device_memory(buf) def _getitem(self, offset: int, size: int) -> Buffer: """ @@ -123,10 +98,11 @@ def _getitem(self, offset: int, size: int) -> Buffer: without having to handle non-slice inputs. """ return self.__class__( - data=self.ptr + offset, size=size, owner=self.owner + ptr=self.ptr + offset, size=size, owner=self.owner ) def __getitem__(self, key: slice) -> Buffer: + """Create a new slice of the buffer.""" if not isinstance(key, slice): raise TypeError( "Argument 'key' has incorrect type " @@ -139,22 +115,27 @@ def __getitem__(self, key: slice) -> Buffer: @property def size(self) -> int: + """Size of the buffer in bytes.""" return self._size @property def nbytes(self) -> int: + """Size of the buffer in bytes.""" return self._size @property def ptr(self) -> int: + """Device pointer to the start of the buffer.""" return self._ptr @property def owner(self) -> Any: + """Object owning the memory of the buffer.""" return self._owner @property - def __cuda_array_interface__(self) -> dict: + def __cuda_array_interface__(self) -> Mapping: + """Implementation of the CUDA Array Interface.""" return { "data": (self.ptr, False), "shape": (self.size,), @@ -164,12 +145,24 @@ def __cuda_array_interface__(self) -> dict: } def memoryview(self) -> memoryview: + """Read-only access to the buffer through host memory.""" host_buf = bytearray(self.size) rmm._lib.device_buffer.copy_ptr_to_host(self.ptr, host_buf) return memoryview(host_buf).toreadonly() def serialize(self) -> Tuple[dict, list]: - header = {} # type: Dict[Any, Any] + """Serialize the buffer into header and frames. + + The frames can be a mixture of memoryview and Buffer objects. + + Returns + ------- + Tuple[dict, List] + The first element of the returned tuple is a dict containing any + serializable metadata required to reconstruct the object. The + second element is a list containing Buffers and memoryviews. + """ + header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} header["desc"] = self.__cuda_array_interface__.copy() @@ -180,12 +173,28 @@ def serialize(self) -> Tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list) -> Buffer: + """Create an Buffer from a serialized representation. + + Parameters + ---------- + header : dict + The metadata required to reconstruct the object. + frames : list + The Buffer and memoryview that makes up the Buffer. + + Returns + ------- + Buffer + The deserialized Buffer. + """ + from cudf.core.buffer import as_buffer + if header["frame_count"] != 1: raise ValueError("Deserializing a Buffer expect a single frame") frame = frames[0] if isinstance(frame, cls): return frame # The frame is already deserialized - return cls(frame) + return as_buffer(frame) def __repr__(self) -> str: return ( @@ -197,8 +206,7 @@ def __repr__(self) -> str: def is_c_contiguous( shape: Sequence[int], strides: Sequence[int], itemsize: int ) -> bool: - """ - Determine if shape and strides are C-contiguous + """Determine if shape and strides are C-contiguous Parameters ---------- @@ -226,8 +234,7 @@ def is_c_contiguous( def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: - """ - Retrieve the pointer and size from an array interface. + """Retrieve the pointer and size from an array interface. Raises ValueError if array isn't C-contiguous. From b69a9ef053261c44add9967c36b24859389483f4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 12:10:33 +0200 Subject: [PATCH 09/16] clean up type hints and serialization --- python/cudf/cudf/core/buffer/buffer.py | 31 +++++++++++++++++--------- python/cudf/cudf/tests/test_buffer.py | 21 +++++++++++++++++ 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 1619267bdf8..61b023ffbbf 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -4,7 +4,7 @@ import math import pickle -from typing import Any, Dict, Mapping, Sequence, Tuple +from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar import numpy @@ -14,6 +14,8 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +T = TypeVar("T", bound="Buffer") + class Buffer(Serializable): """A Buffer represents device memory. @@ -43,7 +45,7 @@ def __init__(self, ptr: int, size: int, owner: object): self._owner = owner @classmethod - def from_device_memory(cls, data: Any) -> Buffer: + def from_device_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from an object exposing `__cuda_array_interface__`. No data is being copied. @@ -66,7 +68,7 @@ def from_device_memory(cls, data: Any) -> Buffer: return cls(ptr, size, owner=data) @classmethod - def from_host_memory(cls, data: Any) -> Buffer: + def from_host_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or @@ -165,14 +167,12 @@ def serialize(self) -> Tuple[dict, list]: header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} - header["desc"] = self.__cuda_array_interface__.copy() - header["desc"]["strides"] = (1,) header["frame_count"] = 1 frames = [self] return header, frames @classmethod - def deserialize(cls, header: dict, frames: list) -> Buffer: + def deserialize(cls: Type[T], header: dict, frames: list) -> T: """Create an Buffer from a serialized representation. Parameters @@ -187,19 +187,28 @@ def deserialize(cls, header: dict, frames: list) -> Buffer: Buffer The deserialized Buffer. """ - from cudf.core.buffer import as_buffer - if header["frame_count"] != 1: raise ValueError("Deserializing a Buffer expect a single frame") frame = frames[0] if isinstance(frame, cls): return frame # The frame is already deserialized - return as_buffer(frame) + + # TODO: remove handling of "constructor-kwargs" used by cuML's + # `CumlArray`, which will require `CumlArray` to implement + # its own deserialize. + if header["constructor-kwargs"]: + return cls(frame, **header["constructor-kwargs"]) + + if hasattr(frame, "__cuda_array_interface__"): + return cls.from_device_memory(frame) + return cls.from_host_memory(frame) def __repr__(self) -> str: + klass = self.__class__ + name = f"{klass.__module__}.{klass.__qualname__}" return ( - f"" ) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 5ed5750f29b..4ec88591834 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -103,3 +103,24 @@ def test_buffer_slice_fail(idx, err_type, err_msg): with pytest.raises(err_type, match=err_msg): buf[idx] + + +class SerializeTestBuffer(Buffer): + def __init__(self, data, extra_arg): + super().__init__(data.ptr, data.size, data.owner) + self.extra_arg = extra_arg + + def serialize(self): + header, frames = super().serialize() + header["constructor-kwargs"] = {"extra_arg", self.extra_arg} + return header, frames + + +def test_serialize_constructor_kwargs(): + ary = cp.arange(arr_len, dtype="uint8") + buf = SerializeTestBuffer(as_buffer(ary), extra_arg="my-extra-argument") + out = SerializeTestBuffer.deserialize(*buf.serialize()) + assert out.ptr == buf.ptr + assert out.size == buf.size + assert out.owner is buf.owner + assert out.extra_arg == buf.extra_arg From b0b185ea3d29d5007bef897bf8863bfb24d737dc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 12:51:24 +0200 Subject: [PATCH 10/16] fix doc --- python/cudf/cudf/core/series.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 07e1782d788..b632ddd714b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1831,8 +1831,6 @@ def data(self): 2 3 3 4 dtype: int64 - >>> series.data - >>> np.array(series.data.memoryview()) array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) From 0267bca0d21785f7f00e8d89bc562d276e726063 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 16:08:16 +0200 Subject: [PATCH 11/16] strings_udf: fix missing import of as_buffer --- python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index 31bff73b1f9..6644d72b812 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -3,7 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf.core.buffer import Buffer +from cudf.core.buffer import as_buffer from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column, column_view From eb721f5abc64a1ff8e4b212622da513f99fe0270 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Nov 2022 10:10:33 +0100 Subject: [PATCH 12/16] Buffer.__init__ now raise exception --- python/cudf/cudf/core/buffer/buffer.py | 73 +++++++++++++++++++++----- python/cudf/cudf/core/buffer/utils.py | 14 +---- python/cudf/cudf/tests/test_buffer.py | 2 +- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 61b023ffbbf..ce183ec180e 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -4,6 +4,7 @@ import math import pickle +from types import SimpleNamespace from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar import numpy @@ -17,6 +18,41 @@ T = TypeVar("T", bound="Buffer") +def cuda_array_interface_wrapper(ptr: int, size: int, owner: object = None): + """Wrap device pointer in an object that exposes `__cuda_array_interface__` + + Parameters + ---------- + ptr : int + An integer representing a pointer to device memory. + size : int, optional + Size of device memory in bytes. + owner : object, optional + Python object to which the lifetime of the memory allocation is tied. + A reference to this object is kept in the returned wrapper object. + + Return + ------ + SimpleNamespace + An object that exposes `__cuda_array_interface__` and keeps a reference + to `owner`. + """ + + if size < 0: + raise ValueError("size cannot be negative") + + return SimpleNamespace( + __cuda_array_interface__={ + "data": (ptr, False), + "shape": (size,), + "strides": None, + "typestr": "|u1", + "version": 0, + }, + owner=owner, + ) + + class Buffer(Serializable): """A Buffer represents device memory. @@ -38,11 +74,10 @@ class Buffer(Serializable): _owner: object def __init__(self, ptr: int, size: int, owner: object): - if size < 0: - raise ValueError("size cannot be negative") - self._ptr = ptr - self._size = size - self._owner = owner + raise ValueError( + f"do not create a {self.__class__} directly, please " + "use the factory function `cudf.core.buffer.as_buffer`" + ) @classmethod def from_device_memory(cls: Type[T], data: Any) -> T: @@ -61,11 +96,19 @@ def from_device_memory(cls: Type[T], data: Any) -> T: Buffer representing the same device memory as `data` """ - if isinstance(data, rmm.DeviceBuffer): - return cls(data.ptr, data.size, owner=data) # Common case shortcut - - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - return cls(ptr, size, owner=data) + # Bypass `__init__` and initialize attributes manually + ret = cls.__new__(cls) + ret._owner = data + if isinstance(data, rmm.DeviceBuffer): # Common case shortcut + ret._ptr = data.ptr + ret._size = data.size + else: + ret._ptr, ret._size = get_ptr_and_size( + data.__cuda_array_interface__ + ) + if ret.size < 0: + raise ValueError("size cannot be negative") + return ret @classmethod def from_host_memory(cls: Type[T], data: Any) -> T: @@ -89,9 +132,9 @@ def from_host_memory(cls: Type[T], data: Any) -> T: # Extract pointer and size ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) - # And copy to device memory + # Copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) - # Then we can crate from device memory + # Create from device memory return cls.from_device_memory(buf) def _getitem(self, offset: int, size: int) -> Buffer: @@ -99,8 +142,10 @@ def _getitem(self, offset: int, size: int) -> Buffer: Sub-classes can overwrite this to implement __getitem__ without having to handle non-slice inputs. """ - return self.__class__( - ptr=self.ptr + offset, size=size, owner=self.owner + return self.from_device_memory( + cuda_array_interface_wrapper( + ptr=self.ptr + offset, size=size, owner=self.owner + ) ) def __getitem__(self, key: slice) -> Buffer: diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index a21a21c506d..f8a25340c44 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -2,10 +2,9 @@ from __future__ import annotations -from types import SimpleNamespace from typing import Any, Union -from cudf.core.buffer.buffer import Buffer +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper def as_buffer( @@ -52,16 +51,7 @@ def as_buffer( raise ValueError( "size must be specified when `data` is an integer" ) - data = SimpleNamespace( - __cuda_array_interface__={ - "data": (data, False), - "shape": (size,), - "strides": None, - "typestr": "|u1", - "version": 0, - }, - owner=owner, - ) + data = cuda_array_interface_wrapper(ptr=data, size=size, owner=owner) elif size is not None or owner is not None: raise ValueError( "`size` and `owner` must be None when " diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 4ec88591834..310688e52e4 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -107,7 +107,7 @@ def test_buffer_slice_fail(idx, err_type, err_msg): class SerializeTestBuffer(Buffer): def __init__(self, data, extra_arg): - super().__init__(data.ptr, data.size, data.owner) + self._ptr, self._size, self._owner = data.ptr, data.size, data.owner self.extra_arg = extra_arg def serialize(self): From 6fd341e88d9675080a44611768555b2df49303f2 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Nov 2022 10:14:12 +0100 Subject: [PATCH 13/16] clean up --- python/cudf/cudf/core/buffer/buffer.py | 26 ++++++++------------------ python/cudf/cudf/core/buffer/utils.py | 4 ++-- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ce183ec180e..dfc1bae53eb 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -56,31 +56,21 @@ def cuda_array_interface_wrapper(ptr: int, size: int, owner: object = None): class Buffer(Serializable): """A Buffer represents device memory. - Usually the factory function `as_buffer` should be used to - create a Buffer instance. - - Parameters - ---------- - ptr : int - An integer representing a pointer to device memory. - size : int - Size of device memory in bytes. - owner : object - Python object to which the lifetime of the memory allocation is tied. + Use the factory function `as_buffer` to create a Buffer instance. """ _ptr: int _size: int _owner: object - def __init__(self, ptr: int, size: int, owner: object): + def __init__(self): raise ValueError( f"do not create a {self.__class__} directly, please " "use the factory function `cudf.core.buffer.as_buffer`" ) @classmethod - def from_device_memory(cls: Type[T], data: Any) -> T: + def _from_device_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from an object exposing `__cuda_array_interface__`. No data is being copied. @@ -111,7 +101,7 @@ def from_device_memory(cls: Type[T], data: Any) -> T: return ret @classmethod - def from_host_memory(cls: Type[T], data: Any) -> T: + def _from_host_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or @@ -135,14 +125,14 @@ def from_host_memory(cls: Type[T], data: Any) -> T: # Copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) # Create from device memory - return cls.from_device_memory(buf) + return cls._from_device_memory(buf) def _getitem(self, offset: int, size: int) -> Buffer: """ Sub-classes can overwrite this to implement __getitem__ without having to handle non-slice inputs. """ - return self.from_device_memory( + return self._from_device_memory( cuda_array_interface_wrapper( ptr=self.ptr + offset, size=size, owner=self.owner ) @@ -245,8 +235,8 @@ def deserialize(cls: Type[T], header: dict, frames: list) -> T: return cls(frame, **header["constructor-kwargs"]) if hasattr(frame, "__cuda_array_interface__"): - return cls.from_device_memory(frame) - return cls.from_host_memory(frame) + return cls._from_device_memory(frame) + return cls._from_host_memory(frame) def __repr__(self) -> str: klass = self.__class__ diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index f8a25340c44..a2b2d2526bc 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -59,5 +59,5 @@ def as_buffer( ) if hasattr(data, "__cuda_array_interface__"): - return Buffer.from_device_memory(data) - return Buffer.from_host_memory(data) + return Buffer._from_device_memory(data) + return Buffer._from_host_memory(data) From 062626a5fe8a6ebbac646633a83725908651908d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 1 Nov 2022 10:15:48 +0100 Subject: [PATCH 14/16] remove "constructor-kwargs" Rollback https://github.com/rapidsai/cudf/commit/61fe635282e0a709af24fb78f9f1b01831be1671 --- python/cudf/cudf/core/buffer/buffer.py | 7 ------- python/cudf/cudf/tests/test_buffer.py | 21 --------------------- 2 files changed, 28 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index dfc1bae53eb..150014ce4db 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -201,7 +201,6 @@ def serialize(self) -> Tuple[dict, list]: """ header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - header["constructor-kwargs"] = {} header["frame_count"] = 1 frames = [self] return header, frames @@ -228,12 +227,6 @@ def deserialize(cls: Type[T], header: dict, frames: list) -> T: if isinstance(frame, cls): return frame # The frame is already deserialized - # TODO: remove handling of "constructor-kwargs" used by cuML's - # `CumlArray`, which will require `CumlArray` to implement - # its own deserialize. - if header["constructor-kwargs"]: - return cls(frame, **header["constructor-kwargs"]) - if hasattr(frame, "__cuda_array_interface__"): return cls._from_device_memory(frame) return cls._from_host_memory(frame) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 310688e52e4..5ed5750f29b 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -103,24 +103,3 @@ def test_buffer_slice_fail(idx, err_type, err_msg): with pytest.raises(err_type, match=err_msg): buf[idx] - - -class SerializeTestBuffer(Buffer): - def __init__(self, data, extra_arg): - self._ptr, self._size, self._owner = data.ptr, data.size, data.owner - self.extra_arg = extra_arg - - def serialize(self): - header, frames = super().serialize() - header["constructor-kwargs"] = {"extra_arg", self.extra_arg} - return header, frames - - -def test_serialize_constructor_kwargs(): - ary = cp.arange(arr_len, dtype="uint8") - buf = SerializeTestBuffer(as_buffer(ary), extra_arg="my-extra-argument") - out = SerializeTestBuffer.deserialize(*buf.serialize()) - assert out.ptr == buf.ptr - assert out.size == buf.size - assert out.owner is buf.owner - assert out.extra_arg == buf.extra_arg From 93f41649ea1bfb3d43f5aae0079641c2c96e963f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 2 Nov 2022 08:39:13 +0100 Subject: [PATCH 15/16] Extend cuda_array_interface_wrapper --- python/cudf/cudf/core/buffer/__init__.py | 2 +- python/cudf/cudf/core/buffer/buffer.py | 25 +++++++++++++++++++---- python/cudf/cudf/core/column/datetime.py | 19 +++++++---------- python/cudf/cudf/core/column/numerical.py | 18 +++++++--------- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index 93551f084ba..a73bc69ffb5 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from cudf.core.buffer.buffer import Buffer +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.buffer.utils import as_buffer diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 150014ce4db..c6cac634c03 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -18,9 +18,18 @@ T = TypeVar("T", bound="Buffer") -def cuda_array_interface_wrapper(ptr: int, size: int, owner: object = None): +def cuda_array_interface_wrapper( + ptr: int, + size: int, + owner: object = None, + readonly=False, + typestr="|u1", + version=0, +): """Wrap device pointer in an object that exposes `__cuda_array_interface__` + See + Parameters ---------- ptr : int @@ -30,6 +39,14 @@ def cuda_array_interface_wrapper(ptr: int, size: int, owner: object = None): owner : object, optional Python object to which the lifetime of the memory allocation is tied. A reference to this object is kept in the returned wrapper object. + readonly: bool, optional + Mark the interface read-only. + typestr: str, optional + The type string of the interface. By default this is "|u1", which + means "an unsigned integer with a not relevant byteorder". See: + + version : bool, optional + The version of the interface. Return ------ @@ -43,11 +60,11 @@ def cuda_array_interface_wrapper(ptr: int, size: int, owner: object = None): return SimpleNamespace( __cuda_array_interface__={ - "data": (ptr, False), + "data": (ptr, readonly), "shape": (size,), "strides": None, - "typestr": "|u1", - "version": 0, + "typestr": typestr, + "version": version, }, owner=owner, ) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 375a19f5423..56436ac141d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -6,7 +6,6 @@ import locale import re from locale import nl_langinfo -from types import SimpleNamespace from typing import Any, Mapping, Sequence, cast import numpy as np @@ -23,7 +22,7 @@ ) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_120 -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.utils import _fillna_natwise @@ -289,20 +288,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: } if self.nullable and self.has_nulls(): - # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " DatetimeColumn: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 32bda1e0b6c..f126f47c3c2 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -2,7 +2,6 @@ from __future__ import annotations -from types import SimpleNamespace from typing import ( Any, Callable, @@ -36,7 +35,7 @@ is_number, is_scalar, ) -from cudf.core.buffer import Buffer, as_buffer +from cudf.core.buffer import Buffer, as_buffer, cuda_array_interface_wrapper from cudf.core.column import ( ColumnBase, as_column, @@ -175,19 +174,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: } if self.nullable and self.has_nulls(): - # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " Date: Wed, 2 Nov 2022 09:19:25 +0100 Subject: [PATCH 16/16] _from_host_memory(): use numpy.array() --- python/cudf/cudf/core/buffer/buffer.py | 10 +++++++--- python/cudf/cudf/core/buffer/utils.py | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index c6cac634c03..73e589ebb8e 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -122,13 +122,15 @@ def _from_host_memory(cls: Type[T], data: Any) -> T: """Create a Buffer from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or - be convertible to a buffer object using `numpy.asarray()` + be convertible to a buffer object using `numpy.array()` The host memory is copied to a new device allocation. + Raises ValueError if array isn't C-contiguous. + Parameters ---------- - data : array-like or buffer-like + data : Any An object that represens host memory. Returns @@ -137,8 +139,10 @@ def _from_host_memory(cls: Type[T], data: Any) -> T: Buffer representing a copy of `data`. """ + # Convert to numpy array, this will not copy data in most cases. + ary = numpy.array(data, copy=False, subok=True) # Extract pointer and size - ptr, size = get_ptr_and_size(numpy.asarray(data).__array_interface__) + ptr, size = get_ptr_and_size(ary.__array_interface__) # Copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) # Create from device memory diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index a2b2d2526bc..5e017c4bc92 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -21,6 +21,10 @@ def as_buffer( **not** copied, instead the new buffer keeps a reference to `data` in order to retain its lifetime. + If `data` is an integer, it is assumed to point to device memory. + + Raises ValueError if data isn't C-contiguous. + Parameters ---------- data : int or buffer-like or array-like