Skip to content

Commit

Permalink
Buffer: make .ptr read-only (#10872)
Browse files Browse the repository at this point in the history
This PR makes `Buffer.ptr` read-only and introduces `Buffer.from_buffer`:
```python 
@classmethod
def from_buffer(cls, buffer: Buffer, size: int = None, offset: int = 0):
    """
    Create a buffer from another buffer

    Parameters
    ----------
    buffer : Buffer
        The base buffer, which will also be set as the owner of
        the memory allocation.
    size : int, optional
        Size of the memory allocation (default: `buffer.size`).
    offset : int, optional
        Start offset relative to `buffer.ptr`.
    """
```

This is mainly motivated by my work on [spilling](#10746): it makes it a bit easier to reason about the relationship between buffers.

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: #10872
  • Loading branch information
madsbk authored May 25, 2022
1 parent 328830f commit ca4bc97
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 48 deletions.
21 changes: 12 additions & 9 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,21 @@ cdef class Column:

@property
def data(self):
if self.base_data is None:
return None
if self._data is None:
if self.base_data is None:
itemsize = self.dtype.itemsize
size = self.size * itemsize
offset = self.offset * itemsize if self.size else 0
if offset == 0 and self.base_data.size == size:
# `data` spans all of `base_data`
self._data = self.base_data
else:
buf = Buffer(self.base_data)
if self.size == 0:
buf.ptr = 0
else:
buf.ptr = buf.ptr + (self.offset * self.dtype.itemsize)
buf.size = self.size * self.dtype.itemsize
self._data = buf
self._data = Buffer.from_buffer(
buffer=self.base_data,
size=size,
offset=offset
)
return self._data

@property
Expand All @@ -133,7 +137,6 @@ cdef class Column:
type(value).__name__)

self._data = None

self._base_data = value

@property
Expand Down
78 changes: 52 additions & 26 deletions python/cudf/cudf/core/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
import functools
import operator
import pickle
from typing import Any, Dict, Optional, Tuple
from typing import Any, Dict, Tuple

import numpy as np

import rmm
from rmm import DeviceBuffer

import cudf
from cudf.core.abc import Serializable
Expand All @@ -33,21 +32,20 @@ class Buffer(Serializable):
object is kept in this Buffer.
"""

ptr: int
size: int
_owner: Any
_ptr: int
_size: int
_owner: object

def __init__(
self, data: Any = None, size: Optional[int] = None, owner: Any = None
self, data: Any = None, size: int = None, owner: object = None
):

if isinstance(data, Buffer):
self.ptr = data.ptr
self.size = data.size
self._ptr = data._ptr
self._size = data.size
self._owner = owner or data._owner
elif isinstance(data, rmm.DeviceBuffer):
self.ptr = data.ptr
self.size = data.size
self._ptr = data.ptr
self._size = data.size
self._owner = data
elif hasattr(data, "__array_interface__") or hasattr(
data, "__cuda_array_interface__"
Expand All @@ -58,12 +56,12 @@ def __init__(
elif isinstance(data, int):
if not isinstance(size, int):
raise TypeError("size must be integer")
self.ptr = data
self.size = size
self._ptr = data
self._size = size
self._owner = owner
elif data is None:
self.ptr = 0
self.size = 0
self._ptr = 0
self._size = 0
self._owner = None
else:
try:
Expand All @@ -72,23 +70,52 @@ def __init__(
raise TypeError("data must be Buffer, array-like or integer")
self._init_from_array_like(np.asarray(data), owner)

@classmethod
def from_buffer(cls, buffer: Buffer, size: int = None, offset: int = 0):
    """
    Create a buffer that views the memory of another buffer.

    No data is copied: the new buffer points into the memory of
    `buffer` and keeps `buffer` as its owner.

    Parameters
    ----------
    buffer : Buffer
        The base buffer, which will also be set as the owner of
        the memory allocation.
    size : int, optional
        Size of the memory allocation (default: `buffer.size`).
        Note that the default is `buffer.size` even when `offset` is
        non-zero, so callers that pass an offset should normally pass
        an explicit size as well.
    offset : int, optional
        Start offset relative to `buffer.ptr`.

    Returns
    -------
    Buffer
        A new instance of `cls` whose pointer is `buffer.ptr + offset`.

    Raises
    ------
    ValueError
        If `size` or `offset` is negative.
    """
    if size is None:
        size = buffer.size
    # Reject values that would describe memory before the base pointer
    # or an allocation of negative length.
    if size < 0:
        raise ValueError("size cannot be negative")
    if offset < 0:
        raise ValueError("offset cannot be negative")

    ret = cls()
    ret._ptr = buffer._ptr + offset
    ret._size = size
    ret._owner = buffer
    return ret

def __len__(self) -> int:
return self.size
return self._size

@property
def ptr(self) -> int:
    """Address of the first byte of this buffer's memory.

    Exposed as a property without a setter, so it cannot be
    reassigned through the public attribute.
    """
    return self._ptr

@property
def size(self) -> int:
    """Size of this buffer's memory allocation.

    Exposed as a property without a setter, so it cannot be
    reassigned through the public attribute.
    """
    return self._size

@property
def nbytes(self) -> int:
return self.size
return self._size

@property
def __cuda_array_interface__(self) -> dict:
intf = {
return {
"data": (self.ptr, False),
"shape": (self.size,),
"strides": None,
"typestr": "|u1",
"version": 0,
}
return intf

def to_host_array(self):
data = np.empty((self.size,), "u1")
Expand All @@ -102,15 +129,15 @@ def _init_from_array_like(self, data, owner):
ptr, size = _buffer_data_from_array_interface(
data.__cuda_array_interface__
)
self.ptr = ptr
self.size = size
self._ptr = ptr
self._size = size
self._owner = owner or data
elif hasattr(data, "__array_interface__"):
confirm_1d_contiguous(data.__array_interface__)
ptr, size = _buffer_data_from_array_interface(
data.__array_interface__
)
dbuf = DeviceBuffer(ptr=ptr, size=size)
dbuf = rmm.DeviceBuffer(ptr=ptr, size=size)
self._init_from_array_like(dbuf, owner)
else:
raise TypeError(
Expand Down Expand Up @@ -145,17 +172,16 @@ def deserialize(cls, header: dict, frames: list) -> Buffer:

@classmethod
def empty(cls, size: int) -> Buffer:
dbuf = DeviceBuffer(size=size)
return Buffer(dbuf)
return Buffer(rmm.DeviceBuffer(size=size))

def copy(self):
def copy(self) -> Buffer:
"""
Create a new Buffer containing a copy of the data contained
in this Buffer.
"""
from rmm._lib.device_buffer import copy_device_to_ptr

out = Buffer(DeviceBuffer(size=self.size))
out = Buffer.empty(size=self.size)
copy_device_to_ptr(self.ptr, out.ptr, self.size)
return out

Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,10 +706,11 @@ def children(self) -> Tuple[NumericalColumn]:
if self._children is None:
codes_column = self.base_children[0]

buf = Buffer(codes_column.base_data)
buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize)
buf.size = self.size * codes_column.dtype.itemsize

buf = Buffer.from_buffer(
buffer=codes_column.base_data,
size=self.size * codes_column.dtype.itemsize,
offset=self.offset * codes_column.dtype.itemsize,
)
codes_column = cast(
cudf.core.column.NumericalColumn,
column.build_column(
Expand Down
22 changes: 13 additions & 9 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,15 +423,19 @@ def view(self, dtype: Dtype) -> ColumnBase:

# This assertion prevents mypy errors below.
assert self.base_data is not None
new_buf_ptr = (
self.base_data.ptr + self.offset * self.dtype.itemsize
)
new_buf_size = self.size * self.dtype.itemsize
view_buf = Buffer(
data=new_buf_ptr,
size=new_buf_size,
owner=self.base_data._owner,
)

# If the view spans all of `base_data`, we return `base_data`.
if (
self.offset == 0
and self.base_data.size == self.size * self.dtype.itemsize
):
view_buf = self.base_data
else:
view_buf = Buffer.from_buffer(
buffer=self.base_data,
size=self.size * self.dtype.itemsize,
offset=self.offset * self.dtype.itemsize,
)
return build_column(view_buf, dtype=dtype)

def element_indexing(self, index: int):
Expand Down

0 comments on commit ca4bc97

Please sign in to comment.