Skip to content

Commit

Permalink
Refactor the Buffer class (#11447)
Browse files Browse the repository at this point in the history
This PR introduces factory functions to create `Buffer` instances, which makes it possible to change the returned buffer type based on a configuration option in a follow-up PR.

Beside simplifying the code base a bit, this is motivated by the spilling work in #10746. We would like to introduce a new spillable Buffer class that requires minimal changes to the existing code and is only used when enabled explicitly. This way, we can introduce spilling in cuDF as an experimental feature with minimal risk to the existing code.

@shwina and I discussed the possibility to let `Buffer.__new__` return different class type instances instead of using factory functions but we concluded that having `Buffer()` return anything other than an instance of `Buffer` is simply too surprising :)

**Notice**, this is breaking because it removes unused methods such as `Buffer.copy()` and `Buffer.nbytes`. 
~~However, we still support creating a buffer directly by calling `Buffer(obj)`. AFAIK, this is the only way `Buffer` is created outside of cuDF, which [a github search seems to confirm](https://github.com/search?l=&q=cudf.core.buffer+-repo%3Arapidsai%2Fcudf&type=code).~~
This PR doesn't change the signature of `Buffer.__init__()` anymore.

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/brandon-b-miller

URL: #11447
  • Loading branch information
madsbk authored Aug 11, 2022
1 parent 5628f57 commit 95935db
Show file tree
Hide file tree
Showing 31 changed files with 517 additions and 364 deletions.
28 changes: 14 additions & 14 deletions python/cudf/cudf/_lib/column.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ from __future__ import annotations
from typing import Dict, Optional, Tuple, TypeVar

from cudf._typing import Dtype, DtypeObj, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.buffer import DeviceBufferLike
from cudf.core.column import ColumnBase

T = TypeVar("T")

class Column:
_data: Optional[Buffer]
_mask: Optional[Buffer]
_base_data: Optional[Buffer]
_base_mask: Optional[Buffer]
_data: Optional[DeviceBufferLike]
_mask: Optional[DeviceBufferLike]
_base_data: Optional[DeviceBufferLike]
_base_mask: Optional[DeviceBufferLike]
_dtype: DtypeObj
_size: int
_offset: int
Expand All @@ -25,10 +25,10 @@ class Column:

def __init__(
self,
data: Optional[Buffer],
data: Optional[DeviceBufferLike],
size: int,
dtype: Dtype,
mask: Optional[Buffer] = None,
mask: Optional[DeviceBufferLike] = None,
offset: int = None,
null_count: int = None,
children: Tuple[ColumnBase, ...] = (),
Expand All @@ -40,27 +40,27 @@ class Column:
@property
def size(self) -> int: ...
@property
def base_data(self) -> Optional[Buffer]: ...
def base_data(self) -> Optional[DeviceBufferLike]: ...
@property
def base_data_ptr(self) -> int: ...
@property
def data(self) -> Optional[Buffer]: ...
def data(self) -> Optional[DeviceBufferLike]: ...
@property
def data_ptr(self) -> int: ...
def set_base_data(self, value: Buffer) -> None: ...
def set_base_data(self, value: DeviceBufferLike) -> None: ...
@property
def nullable(self) -> bool: ...
def has_nulls(self, include_nan: bool = False) -> bool: ...
@property
def base_mask(self) -> Optional[Buffer]: ...
def base_mask(self) -> Optional[DeviceBufferLike]: ...
@property
def base_mask_ptr(self) -> int: ...
@property
def mask(self) -> Optional[Buffer]: ...
def mask(self) -> Optional[DeviceBufferLike]: ...
@property
def mask_ptr(self) -> int: ...
def set_base_mask(self, value: Optional[Buffer]) -> None: ...
def set_mask(self: T, value: Optional[Buffer]) -> T: ...
def set_base_mask(self, value: Optional[DeviceBufferLike]) -> None: ...
def set_mask(self: T, value: Optional[DeviceBufferLike]) -> T: ...
@property
def null_count(self) -> int: ...
@property
Expand Down
71 changes: 33 additions & 38 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import rmm
import cudf
import cudf._lib as libcudf
from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype
from cudf.core.buffer import Buffer
from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like

from cpython.buffer cimport PyObject_CheckBuffer
from libc.stdint cimport uintptr_t
Expand Down Expand Up @@ -56,9 +56,9 @@ cdef class Column:
A Column stores columnar data in device memory.
A Column may be composed of:
* A *data* Buffer
* A *data* DeviceBufferLike
* One or more (optional) *children* Columns
* An (optional) *mask* Buffer representing the nullmask
* An (optional) *mask* DeviceBufferLike representing the nullmask
The *dtype* indicates the Column's element type.
"""
Expand Down Expand Up @@ -110,18 +110,9 @@ cdef class Column:
if self.base_data is None:
return None
if self._data is None:
itemsize = self.dtype.itemsize
size = self.size * itemsize
offset = self.offset * itemsize if self.size else 0
if offset == 0 and self.base_data.size == size:
# `data` spans all of `base_data`
self._data = self.base_data
else:
self._data = Buffer.from_buffer(
buffer=self.base_data,
size=size,
offset=offset
)
start = self.offset * self.dtype.itemsize
end = start + self.size * self.dtype.itemsize
self._data = self.base_data[start:end]
return self._data

@property
Expand All @@ -132,9 +123,11 @@ cdef class Column:
return self.data.ptr

def set_base_data(self, value):
if value is not None and not isinstance(value, Buffer):
raise TypeError("Expected a Buffer or None for data, got " +
type(value).__name__)
if value is not None and not isinstance(value, DeviceBufferLike):
raise TypeError(
"Expected a DeviceBufferLike or None for data, "
f"got {type(value).__name__}"
)

self._data = None
self._base_data = value
Expand Down Expand Up @@ -179,17 +172,18 @@ cdef class Column:
modify size or offset in any way, so the passed mask is expected to be
compatible with the current offset.
"""
if value is not None and not isinstance(value, Buffer):
raise TypeError("Expected a Buffer or None for mask, got " +
type(value).__name__)
if value is not None and not isinstance(value, DeviceBufferLike):
raise TypeError(
"Expected a DeviceBufferLike or None for mask, "
f"got {type(value).__name__}"
)

if value is not None:
required_size = bitmask_allocation_size_bytes(self.base_size)
if value.size < required_size:
error_msg = (
"The Buffer for mask is smaller than expected, got " +
str(value.size) + " bytes, expected " +
str(required_size) + " bytes."
"The DeviceBufferLike for mask is smaller than expected, "
f"got {value.size} bytes, expected {required_size} bytes."
)
if self.offset > 0 or self.size < self.base_size:
error_msg += (
Expand Down Expand Up @@ -233,31 +227,31 @@ cdef class Column:
if isinstance(value, Column):
value = value.data_array_view
value = cp.asarray(value).view('|u1')
mask = Buffer(value)
mask = as_device_buffer_like(value)
if mask.size < required_num_bytes:
raise ValueError(error_msg.format(str(value.size)))
if mask.size < mask_size:
dbuf = rmm.DeviceBuffer(size=mask_size)
dbuf.copy_from_device(value)
mask = Buffer(dbuf)
mask = as_device_buffer_like(dbuf)
elif hasattr(value, "__array_interface__"):
value = np.asarray(value).view("u1")[:mask_size]
if value.size < required_num_bytes:
raise ValueError(error_msg.format(str(value.size)))
dbuf = rmm.DeviceBuffer(size=mask_size)
dbuf.copy_from_host(value)
mask = Buffer(dbuf)
mask = as_device_buffer_like(dbuf)
elif PyObject_CheckBuffer(value):
value = np.asarray(value).view("u1")[:mask_size]
if value.size < required_num_bytes:
raise ValueError(error_msg.format(str(value.size)))
dbuf = rmm.DeviceBuffer(size=mask_size)
dbuf.copy_from_host(value)
mask = Buffer(dbuf)
mask = as_device_buffer_like(dbuf)
else:
raise TypeError(
"Expected a Buffer-like object or None for mask, got "
+ type(value).__name__
"Expected a DeviceBufferLike object or None for mask, "
f"got {type(value).__name__}"
)

return cudf.core.column.build_column(
Expand Down Expand Up @@ -455,11 +449,11 @@ cdef class Column:
cdef column_contents contents = move(c_col.get()[0].release())

data = DeviceBuffer.c_from_unique_ptr(move(contents.data))
data = Buffer(data)
data = as_device_buffer_like(data)

if null_count > 0:
mask = DeviceBuffer.c_from_unique_ptr(move(contents.null_mask))
mask = Buffer(mask)
mask = as_device_buffer_like(mask)
else:
mask = None

Expand All @@ -484,9 +478,10 @@ cdef class Column:
Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it,
along with referencing an ``owner`` Python object that owns the memory
lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and
make the owner of each newly created ``Buffer`` the respective
``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is
``None``, we allocate new memory for the resulting ``cudf.Column``.
make the owner of each newly created ``DeviceBufferLike`` the
respective ``DeviceBufferLike`` from the ``owner`` ``cudf.Column``.
If ``owner`` is ``None``, we allocate new memory for the resulting
``cudf.Column``.
"""
column_owner = isinstance(owner, Column)
mask_owner = owner
Expand All @@ -509,7 +504,7 @@ cdef class Column:

if data_ptr:
if data_owner is None:
data = Buffer(
data = as_device_buffer_like(
rmm.DeviceBuffer(ptr=data_ptr,
size=(size+offset) * dtype.itemsize)
)
Expand All @@ -520,7 +515,7 @@ cdef class Column:
owner=data_owner
)
else:
data = Buffer(
data = as_device_buffer_like(
rmm.DeviceBuffer(ptr=data_ptr, size=0)
)

Expand Down Expand Up @@ -550,7 +545,7 @@ cdef class Column:
# result:
mask = None
else:
mask = Buffer(
mask = as_device_buffer_like(
rmm.DeviceBuffer(
ptr=mask_ptr,
size=bitmask_allocation_size_bytes(size+offset)
Expand Down
8 changes: 5 additions & 3 deletions python/cudf/cudf/_lib/concat.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport make_unique, unique_ptr
Expand All @@ -19,7 +19,7 @@ from cudf._lib.utils cimport (
table_view_from_table,
)

from cudf.core.buffer import Buffer
from cudf.core.buffer import as_device_buffer_like

from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer

Expand All @@ -31,7 +31,9 @@ cpdef concat_masks(object columns):
with nogil:
c_result = move(libcudf_concatenate_masks(c_views))
c_unique_result = make_unique[device_buffer](move(c_result))
return Buffer(DeviceBuffer.c_from_unique_ptr(move(c_unique_result)))
return as_device_buffer_like(
DeviceBuffer.c_from_unique_ptr(move(c_unique_result))
)


cpdef concat_columns(object columns):
Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,11 @@ cdef class _CPackedColumns:
header = {}
frames = []

gpu_data = Buffer(self.gpu_data_ptr, self.gpu_data_size, self)
gpu_data = Buffer(
data=self.gpu_data_ptr,
size=self.gpu_data_size,
owner=self
)
data_header, data_frames = gpu_data.serialize()
header["data"] = data_header
frames.extend(data_frames)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/null_mask.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ from cudf._lib.cpp.null_mask cimport (
)
from cudf._lib.cpp.types cimport mask_state, size_type

from cudf.core.buffer import Buffer
from cudf.core.buffer import as_device_buffer_like


class MaskState(Enum):
Expand Down Expand Up @@ -47,7 +47,7 @@ def copy_bitmask(Column col):
up_db = make_unique[device_buffer](move(db))

rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
buf = Buffer(rmm_db)
buf = as_device_buffer_like(rmm_db)
return buf


Expand Down Expand Up @@ -93,5 +93,5 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED):
up_db = make_unique[device_buffer](move(db))

rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
buf = Buffer(rmm_db)
buf = as_device_buffer_like(rmm_db)
return buf
12 changes: 6 additions & 6 deletions python/cudf/cudf/_lib/transform.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from numba.np import numpy_support
import cudf
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
from cudf.core._internals.expressions import parse_expression
from cudf.core.buffer import Buffer
from cudf.core.buffer import as_device_buffer_like
from cudf.utils import cudautils

from cython.operator cimport dereference
Expand Down Expand Up @@ -40,7 +40,7 @@ from cudf._lib.utils cimport (
def bools_to_mask(Column col):
"""
Given an int8 (boolean) column, compress the data from booleans to bits and
return a Buffer
return a DeviceBufferLike
"""
cdef column_view col_view = col.view()
cdef pair[unique_ptr[device_buffer], size_type] cpp_out
Expand All @@ -52,7 +52,7 @@ def bools_to_mask(Column col):
up_db = move(cpp_out.first)

rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
buf = Buffer(rmm_db)
buf = as_device_buffer_like(rmm_db)
return buf


Expand All @@ -61,9 +61,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
Given a mask buffer, returns a boolean column representng bit 0 -> False
and 1 -> True within range of [begin_bit, end_bit),
"""
if not isinstance(mask_buffer, cudf.core.buffer.Buffer):
if not isinstance(mask_buffer, cudf.core.buffer.DeviceBufferLike):
raise TypeError("mask_buffer is not an instance of "
"cudf.core.buffer.Buffer")
"cudf.core.buffer.DeviceBufferLike")
cdef bitmask_type* bit_mask = <bitmask_type*><uintptr_t>(mask_buffer.ptr)

cdef unique_ptr[column] result
Expand All @@ -88,7 +88,7 @@ def nans_to_nulls(Column input):
return None

buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer))
buffer = Buffer(buffer)
buffer = as_device_buffer_like(buffer)
return buffer


Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -341,8 +341,8 @@ cdef data_from_table_view(
along with referencing an ``owner`` Python object that owns the memory
lifetime. If ``owner`` is a Frame we reach inside of it and
reach inside of each ``cudf.Column`` to make the owner of each newly
created ``Buffer`` underneath the ``cudf.Column`` objects of the
created Frame the respective ``Buffer`` from the relevant
created ``DeviceBufferLike`` underneath the ``cudf.Column`` objects of the
created Frame the respective ``DeviceBufferLike`` from the relevant
``cudf.Column`` of the ``owner`` Frame
"""
cdef size_type column_idx = 0
Expand Down
Loading

0 comments on commit 95935db

Please sign in to comment.