diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 8cbadfa19a5..1f9b4c1596a 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -107,17 +107,21 @@ cdef class Column:
 
     @property
     def data(self):
+        if self.base_data is None:
+            return None
         if self._data is None:
-            if self.base_data is None:
+            itemsize = self.dtype.itemsize
+            size = self.size * itemsize
+            offset = self.offset * itemsize if self.size else 0
+            if offset == 0 and self.base_data.size == size:
+                # `data` spans all of `base_data`
                 self._data = self.base_data
             else:
-                buf = Buffer(self.base_data)
-                if self.size == 0:
-                    buf.ptr = 0
-                else:
-                    buf.ptr = buf.ptr + (self.offset * self.dtype.itemsize)
-                buf.size = self.size * self.dtype.itemsize
-                self._data = buf
+                self._data = Buffer.from_buffer(
+                    buffer=self.base_data,
+                    size=size,
+                    offset=offset
+                )
         return self._data
 
     @property
@@ -133,7 +137,6 @@ cdef class Column:
                             type(value).__name__)
 
         self._data = None
-
         self._base_data = value
 
     @property
diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py
index 63e99f34803..374738c241e 100644
--- a/python/cudf/cudf/core/buffer.py
+++ b/python/cudf/cudf/core/buffer.py
@@ -4,12 +4,11 @@
 import functools
 import operator
 import pickle
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Tuple
 
 import numpy as np
 
 import rmm
-from rmm import DeviceBuffer
 
 import cudf
 from cudf.core.abc import Serializable
@@ -33,21 +32,20 @@ class Buffer(Serializable):
         object is kept in this Buffer.
     """
 
-    ptr: int
-    size: int
-    _owner: Any
+    _ptr: int
+    _size: int
+    _owner: object
 
     def __init__(
-        self, data: Any = None, size: Optional[int] = None, owner: Any = None
+        self, data: Any = None, size: int = None, owner: object = None
     ):
-
         if isinstance(data, Buffer):
-            self.ptr = data.ptr
-            self.size = data.size
+            self._ptr = data._ptr
+            self._size = data.size
             self._owner = owner or data._owner
         elif isinstance(data, rmm.DeviceBuffer):
-            self.ptr = data.ptr
-            self.size = data.size
+            self._ptr = data.ptr
+            self._size = data.size
             self._owner = data
         elif hasattr(data, "__array_interface__") or hasattr(
             data, "__cuda_array_interface__"
@@ -58,12 +56,12 @@ def __init__(
         elif isinstance(data, int):
             if not isinstance(size, int):
                 raise TypeError("size must be integer")
-            self.ptr = data
-            self.size = size
+            self._ptr = data
+            self._size = size
             self._owner = owner
         elif data is None:
-            self.ptr = 0
-            self.size = 0
+            self._ptr = 0
+            self._size = 0
             self._owner = None
         else:
             try:
@@ -72,23 +70,52 @@ def __init__(
                 raise TypeError("data must be Buffer, array-like or integer")
             self._init_from_array_like(np.asarray(data), owner)
 
+    @classmethod
+    def from_buffer(cls, buffer: Buffer, size: int = None, offset: int = 0):
+        """
+        Create a buffer from another buffer
+
+        Parameters
+        ----------
+        buffer : Buffer
+            The base buffer, which will also be set as the owner of
+            the memory allocation.
+        size : int, optional
+            Size of the memory allocation (default: `buffer.size`).
+        offset : int, optional
+            Start offset relative to `buffer.ptr`.
+        """
+
+        ret = cls()
+        ret._ptr = buffer._ptr + offset
+        ret._size = buffer.size if size is None else size
+        ret._owner = buffer
+        return ret
+
     def __len__(self) -> int:
-        return self.size
+        return self._size
+
+    @property
+    def ptr(self) -> int:
+        return self._ptr
+
+    @property
+    def size(self) -> int:
+        return self._size
 
     @property
     def nbytes(self) -> int:
-        return self.size
+        return self._size
 
     @property
     def __cuda_array_interface__(self) -> dict:
-        intf = {
+        return {
             "data": (self.ptr, False),
             "shape": (self.size,),
             "strides": None,
             "typestr": "|u1",
             "version": 0,
         }
-        return intf
 
     def to_host_array(self):
         data = np.empty((self.size,), "u1")
@@ -102,15 +129,15 @@ def _init_from_array_like(self, data, owner):
             ptr, size = _buffer_data_from_array_interface(
                 data.__cuda_array_interface__
             )
-            self.ptr = ptr
-            self.size = size
+            self._ptr = ptr
+            self._size = size
             self._owner = owner or data
         elif hasattr(data, "__array_interface__"):
             confirm_1d_contiguous(data.__array_interface__)
             ptr, size = _buffer_data_from_array_interface(
                 data.__array_interface__
             )
-            dbuf = DeviceBuffer(ptr=ptr, size=size)
+            dbuf = rmm.DeviceBuffer(ptr=ptr, size=size)
             self._init_from_array_like(dbuf, owner)
         else:
             raise TypeError(
@@ -145,17 +172,16 @@ def deserialize(cls, header: dict, frames: list) -> Buffer:
 
     @classmethod
     def empty(cls, size: int) -> Buffer:
-        dbuf = DeviceBuffer(size=size)
-        return Buffer(dbuf)
+        return Buffer(rmm.DeviceBuffer(size=size))
 
-    def copy(self):
+    def copy(self) -> Buffer:
         """
         Create a new Buffer containing a copy of the data contained
         in this Buffer.
         """
         from rmm._lib.device_buffer import copy_device_to_ptr
 
-        out = Buffer(DeviceBuffer(size=self.size))
+        out = Buffer.empty(size=self.size)
         copy_device_to_ptr(self.ptr, out.ptr, self.size)
         return out
 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index e28b7d059b7..c04e2e45461 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -706,10 +706,11 @@ def children(self) -> Tuple[NumericalColumn]:
         if self._children is None:
             codes_column = self.base_children[0]
 
-            buf = Buffer(codes_column.base_data)
-            buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize)
-            buf.size = self.size * codes_column.dtype.itemsize
-
+            buf = Buffer.from_buffer(
+                buffer=codes_column.base_data,
+                size=self.size * codes_column.dtype.itemsize,
+                offset=self.offset * codes_column.dtype.itemsize,
+            )
             codes_column = cast(
                 cudf.core.column.NumericalColumn,
                 column.build_column(
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 47a2e3489e8..ffd17cb7d31 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -423,15 +423,19 @@ def view(self, dtype: Dtype) -> ColumnBase:
 
             # This assertion prevents mypy errors below.
             assert self.base_data is not None
-            new_buf_ptr = (
-                self.base_data.ptr + self.offset * self.dtype.itemsize
-            )
-            new_buf_size = self.size * self.dtype.itemsize
-            view_buf = Buffer(
-                data=new_buf_ptr,
-                size=new_buf_size,
-                owner=self.base_data._owner,
-            )
+
+            # If the view spans all of `base_data`, we return `base_data`.
+            if (
+                self.offset == 0
+                and self.base_data.size == self.size * self.dtype.itemsize
+            ):
+                view_buf = self.base_data
+            else:
+                view_buf = Buffer.from_buffer(
+                    buffer=self.base_data,
+                    size=self.size * self.dtype.itemsize,
+                    offset=self.offset * self.dtype.itemsize,
+                )
             return build_column(view_buf, dtype=dtype)
 
     def element_indexing(self, index: int):