From bd72a17962384c0dd6d0a29e84af725aa3436513 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 13 Jan 2023 15:12:49 -0600 Subject: [PATCH 01/31] [REVIEW] Copy on write implementation (#11718) Initial copy-on-write implementation --- .../source/developer_guide/library_design.md | 177 +++++++++++++ docs/cudf/source/user_guide/copy-on-write.md | 169 +++++++++++++ docs/cudf/source/user_guide/index.md | 1 + python/cudf/cudf/_lib/column.pyx | 43 +++- python/cudf/cudf/core/buffer/__init__.py | 3 +- python/cudf/cudf/core/buffer/buffer.py | 50 +++- python/cudf/cudf/core/buffer/cow_buffer.py | 211 ++++++++++++++++ .../cudf/cudf/core/buffer/spillable_buffer.py | 6 +- python/cudf/cudf/core/buffer/utils.py | 12 +- python/cudf/cudf/core/column/categorical.py | 29 +-- python/cudf/cudf/core/column/column.py | 82 +++++- python/cudf/cudf/core/column/datetime.py | 3 +- python/cudf/cudf/core/column/lists.py | 5 + python/cudf/cudf/core/column/numerical.py | 14 +- python/cudf/cudf/core/column/string.py | 5 + python/cudf/cudf/core/column/struct.py | 6 +- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/column_accessor.py | 6 +- python/cudf/cudf/core/dataframe.py | 10 +- python/cudf/cudf/core/frame.py | 10 +- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/series.py | 26 +- python/cudf/cudf/options.py | 51 +++- python/cudf/cudf/testing/_utils.py | 5 + python/cudf/cudf/tests/test_copying.py | 238 +++++++++++++++++- python/cudf/cudf/tests/test_index.py | 14 +- python/cudf/cudf/tests/test_multiindex.py | 12 +- python/cudf/cudf/tests/test_series.py | 22 ++ python/cudf/cudf/tests/test_stats.py | 1 - python/cudf/cudf/utils/applyutils.py | 4 +- 31 files changed, 1132 insertions(+), 95 deletions(-) create mode 100644 docs/cudf/source/user_guide/copy-on-write.md create mode 100644 python/cudf/cudf/core/buffer/cow_buffer.py diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 54a28db1b58..f9a51f005cb 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -316,3 +316,180 @@ The pandas API also includes a number of helper objects, such as `GroupBy`, `Rol cuDF implements corresponding objects with the same APIs. Internally, these objects typically interact with cuDF objects at the Frame layer via composition. However, for performance reasons they frequently access internal attributes and methods of `Frame` and its subclasses. + + +## Copy-on-write + + +Copy-on-write (COW) is designed to reduce memory footprint on GPUs. With this feature, a copy (`.copy(deep=False)`) is only really made whenever +there is a write operation on a column. It is first recommended to see +the public usage [here](copy-on-write-user-doc) of this functionality before reading through the internals +below. + +The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. +With the help of `CopyOnWriteBuffer.ptr` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. +This is a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. This +means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` +i.e., if they are all pointing to the same device memory. + +When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the +mechanisms to enable copy-on-write for all buffers. When a `CopyOnWriteBuffer` is created, its weakref is generated and added to the `WeakSet` which is in turn stored in `CopyOnWriterBuffer._instances`. This will later serve as an indication of whether or not to make a copy when a +when write operation is performed on a `Column` (see below). + + +### Eager copies when exposing to third-party libraries + +If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred without introspection. Hence whenever +someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling +`_unlink_shared_buffers` which ensures a true copy of underlying device data is made and +unlinks the buffer from any shared "weak" references. Any future shallow-copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object), to handle this we also mark the `Column`/`CopyOnWriteBuffer` as +`obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy +rather than a copy-on-write shallow copy with weak references. + +### How to obtain read-only object? + +A read-only object can be quite useful for operations that will not +mutate the data. This can be achieved by calling `._get_readonly_proxy_obj` +API, this API will return a proxy object that has `__cuda_array_interface__` +implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` +has weak references. It is only recommended to use this API as long as +the objects/arrays created with this proxy object gets cleaned up during +the developer code execution. We currently use this API for device to host +copies like in `ColumnBase._data_array_view` which is used for `Column.values_host`. + +Notes: +1. Weak references are implemented only for fixed-width data types as these are only column +types that can be mutated in place. +2. Deep copies of variable width data types return shallow-copies of the Columns, because these +types don't support real in-place mutations to the data. We just mimic in such a way that it looks +like an in-place operation using `_mimic_inplace`. + + +### Examples + +When copy-on-write is enabled, taking a shallow copy of a `Series` or a `DataFrame` does not +eagerly create a copy of the data. Instead, it produces a view that will be lazily +copied when a write operation is performed on any of its copies. + +Let's create a series: + +```python +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +>>> s1 = cudf.Series([1, 2, 3, 4]) +``` + +Make a copy of `s1`: +```python +>>> s2 = s1.copy(deep=False) +``` + +Make another copy, but of `s2`: +```python +>>> s3 = s2.copy(deep=False) +``` + +Viewing the data and memory addresses show that they all point to the same device memory: +```python +>>> s1 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s2 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 + +>>> s1.data._ptr +139796315897856 +>>> s2.data._ptr +139796315897856 +>>> s3.data._ptr +139796315897856 +``` + +Now, when we perform a write operation on one of them, say on `s2`, a new copy is created +for `s2` on device and then modified: + +```python +>>> s2[0:2] = 10 +>>> s2 +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> s1 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +If we inspect the memory address of the data, `s1` and `s3` still share the same address but `s2` has a new one: + +```python +>>> s1.data._ptr +139796315897856 +>>> s3.data._ptr +139796315897856 +>>> s2.data._ptr +139796315899392 +``` + +Now, performing write operation on `s1` will trigger a new copy on device memory as there +is a weak reference being shared in `s3`: + +```python +>>> s1[0:2] = 11 +>>> s1 +0 11 +1 11 +2 3 +3 4 +dtype: int64 +>>> s2 +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> s3 +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +If we inspect the memory address of the data, the addresses of `s2` and `s3` remain unchanged, but `s1`'s memory address has changed because of a copy operation performed during the writing: + +```python +>>> s2.data._ptr +139796315899392 +>>> s3.data._ptr +139796315897856 +>>> s1.data._ptr +139796315879723 +``` + +cudf Copy-on-write implementation is motivated by pandas Copy-on-write proposal here: +1. [Google doc](https://docs.google.com/document/d/1ZCQ9mx3LBMy-nhwRl33_jgcvWo9IWdEfxDNQ2thyTb0/edit#heading=h.iexejdstiz8u) +2. [Github issue](https://github.com/pandas-dev/pandas/issues/36195) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md new file mode 100644 index 00000000000..14ca3656250 --- /dev/null +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -0,0 +1,169 @@ +(copy-on-write-user-doc)= + +# Copy-on-write + +Copy-on-write reduces GPU memory usage when copies(`.copy(deep=False)`) of a column +are made. + +| | Copy-on-Write enabled | Copy-on-Write disabled (default) | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | + +## How to enable it + +i. Use `cudf.set_option`: + +```python +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +``` + +ii. Set the environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the +launch of the Python interpreter: + +```bash +export CUDF_COPY_ON_WRITE="1" python -c "import cudf" +``` + + +## Making copies + +There are no additional changes required in the code to make use of copy-on-write. + +```python +>>> series = cudf.Series([1, 2, 3, 4]) +``` + +Performing a shallow copy will create a new Series object pointing to the +same underlying device memory: + +```python +>>> copied_series = series.copy(deep=False) +>>> series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +>>> copied_series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + +When a write operation is performed on either ``series`` or +``copied_series``, a true physical copy of the data is created: + +```python +>>> series[0:2] = 10 +>>> series +0 10 +1 10 +2 3 +3 4 +dtype: int64 +>>> copied_series +0 1 +1 2 +2 3 +3 4 +dtype: int64 +``` + + +## Notes + +When copy-on-write is enabled, there is no concept of views. i.e., modifying any view created inside cudf will not actually not modify +the original object it was viewing and thus a separate copy is created and then modified. + +## Advantages + +1. With copy-on-write enabled and by requesting `.copy(deep=False)`, the GPU memory usage can be reduced drastically if you are not performing +write operations on all of those copies. This will also increase the speed at which objects are created for execution of your ETL workflow. +2. With the concept of views going away, every object is a copy of it's original object. This will bring consistency across operations and cudf closer to parity with +pandas. Following is one of the inconsistency: + +```python + +>>> import pandas as pd +>>> s = pd.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +dtype: int64 +>>> s +0 10 +1 2 +2 3 +3 4 +4 5 +dtype: int64 + +>>> import cudf +>>> s = cudf.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 +``` + +The above inconsistency is solved when Copy-on-write is enabled: + +```python +>>> import pandas as pd +>>> pd.set_option("mode.copy_on_write", True) +>>> s = pd.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +dtype: int64 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 + + +>>> import cudf +>>> cudf.set_option("copy_on_write", True) +>>> s = cudf.Series([1, 2, 3, 4, 5]) +>>> s1 = s[0:2] +>>> s1[0] = 10 +>>> s1 +0 10 +1 2 +dtype: int64 +>>> s +0 1 +1 2 +2 3 +3 4 +4 5 +dtype: int64 +``` + +## How to disable it + + +Copy-on-write can be disable by setting ``copy_on_write`` cudf option to ``False``: + +```python +>>> cudf.set_option("copy_on_write", False) +``` diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md index 86168f0d81b..f39f3b49671 100644 --- a/docs/cudf/source/user_guide/index.md +++ b/docs/cudf/source/user_guide/index.md @@ -13,4 +13,5 @@ guide-to-udfs cupy-interop options PandasCompat +copy-on-write ``` diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ec7d2570708..8f09a0f41f4 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +import inspect import cupy as cp import numpy as np @@ -10,6 +12,7 @@ import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype from cudf.core.buffer import ( Buffer, + CopyOnWriteBuffer, SpillableBuffer, SpillLock, acquire_spill_lock, @@ -203,9 +206,21 @@ cdef class Column: "The value for mask is smaller than expected, got {} bytes, " "expected " + str(required_num_bytes) + " bytes." ) + + # Because hasattr will trigger invocation of + # `__cuda_array_interface__` which could + # be expensive in CopyOnWriteBuffer case. + value_cai = inspect.getattr_static( + value, + "__cuda_array_interface__", + None + ) + if value is None: mask = None - elif hasattr(value, "__cuda_array_interface__"): + elif type(value_cai) is property: + if isinstance(value, CopyOnWriteBuffer): + value = value._get_readonly_proxy_obj if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view @@ -303,6 +318,7 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ + if inplace: self._offset = other_col.offset self._size = other_col.size @@ -318,6 +334,7 @@ cdef class Column: return self._view(libcudf_types.UNKNOWN_NULL_COUNT).null_count() cdef mutable_column_view mutable_view(self) except *: + if is_categorical_dtype(self.dtype): col = self.base_children[0] else: @@ -331,12 +348,8 @@ cdef class Column: if col.base_data is None: data = NULL - elif isinstance(col.base_data, SpillableBuffer): - data = (col.base_data).get_ptr( - spill_lock=get_spill_lock() - ) else: - data = (col.base_data.ptr) + data = (col.base_data.mutable_ptr) cdef Column child_column if col.base_children: @@ -396,7 +409,11 @@ cdef class Column: spill_lock=get_spill_lock() ) else: - data = (col.base_data.ptr) + # Shouldn't access `.ptr`, because in case + # of `CopyOnWriteBuffer` that could trigger + # a copy, which isn't required to create a + # view that is read only. + data = (col.base_data._ptr) cdef Column child_column if col.base_children: @@ -526,6 +543,12 @@ cdef class Column: rmm.DeviceBuffer(ptr=data_ptr, size=(size+offset) * dtype_itemsize) ) + elif column_owner and isinstance(data_owner, CopyOnWriteBuffer): + # TODO: In future, see if we can just pass on the + # CopyOnWriteBuffer reference to another column + # and still create a weak reference. + # With the current design that's not possible. + data = data_owner.copy(deep=False) elif ( # This is an optimization of the most common case where # from_column_view creates a "view" that is identical to @@ -552,7 +575,9 @@ cdef class Column: owner=data_owner, exposed=True, ) - if isinstance(data_owner, SpillableBuffer): + if isinstance(data_owner, CopyOnWriteBuffer): + data_owner.ptr # accessing the pointer marks it exposed. + elif isinstance(data_owner, SpillableBuffer): if data_owner.is_spilled: raise ValueError( f"{data_owner} is spilled, which invalidates " diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index 49f2c57b17f..0d433509497 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -1,6 +1,7 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.cow_buffer import CopyOnWriteBuffer from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock from cudf.core.buffer.utils import ( acquire_spill_lock, diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ebc4d76b6a0..5479dc1fd50 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -192,6 +192,28 @@ def __getitem__(self, key: slice) -> Buffer: raise ValueError("slice must be C-contiguous") return self._getitem(offset=start, size=stop - start) + def copy(self, deep: bool = True): + """ + Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + If True, returns a deep copy of the underlying Buffer data. + If False, returns a shallow copy of the Buffer pointing to + the same underlying data. + + Returns + ------- + Buffer + """ + if not deep: + return self[:] + else: + return self._from_device_memory( + rmm.DeviceBuffer(ptr=self.ptr, size=self.size) + ) + @property def size(self) -> int: """Size of the buffer in bytes.""" @@ -207,22 +229,44 @@ def ptr(self) -> int: """Device pointer to the start of the buffer.""" return self._ptr + @property + def mutable_ptr(self) -> int: + """Device pointer to the start of the buffer.""" + return self._ptr + @property def owner(self) -> Any: """Object owning the memory of the buffer.""" return self._owner @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" + def __cuda_array_interface__(self) -> dict: + """Implementation for the CUDA Array Interface.""" + return self._get_cuda_array_interface(readonly=False) + + def _get_cuda_array_interface(self, readonly=False): return { - "data": (self.ptr, False), + "data": (self.ptr, readonly), "shape": (self.size,), "strides": None, "typestr": "|u1", "version": 0, } + @property + def _get_readonly_proxy_obj(self) -> dict: + """ + Returns a proxy object with a read-only CUDA Array Interface. + """ + return cuda_array_interface_wrapper( + ptr=self.ptr, + size=self.size, + owner=self, + readonly=True, + typestr="|u1", + version=0, + ) + def memoryview(self) -> memoryview: """Read-only access to the buffer through host memory.""" host_buf = host_memory_allocation(self.size) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py new file mode 100644 index 00000000000..e322912ed4f --- /dev/null +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import weakref +from collections import defaultdict +from typing import Any, DefaultDict, Tuple, Type, TypeVar +from weakref import WeakSet + +import rmm + +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper + +T = TypeVar("T", bound="CopyOnWriteBuffer") + + +def _keys_cleanup(ptr): + weak_set_values = CopyOnWriteBuffer._instances[ptr] + if ( + len(weak_set_values) == 1 + and next(iter(weak_set_values.data))() is None + ): + # When the last remaining reference is being cleaned up we will still + # have a dead reference in `weak_set_values`. If that is the case, then + # we can safely clean up the key + del CopyOnWriteBuffer._instances[ptr] + + +class CopyOnWriteBuffer(Buffer): + """A Copy-on-write buffer that implements Buffer. + + This buffer enables making copies of data only when there + is a write operation being performed. + + See `Copy-on-write` section in `library_design.md` for + detailed information on `CopyOnWriteBuffer`. + + Use the factory function `as_buffer` to create a CopyOnWriteBuffer + instance. + """ + + # This dict keeps track of all instances that have the same `ptr` + # and `size` attributes. Each key of the dict is a `(ptr, size)` + # tuple and the corresponding value is a set of weak references to + # instances with that `ptr` and `size`. + _instances: DefaultDict[Tuple, WeakSet] = defaultdict(WeakSet) + + # TODO: This is synonymous to SpillableBuffer._exposed attribute + # and has to be merged. + _zero_copied: bool + + def _finalize_init(self): + self.__class__._instances[self._ptr].add(self) + self._instances = self.__class__._instances[self._ptr] + self._zero_copied = False + weakref.finalize(self, _keys_cleanup, self._ptr) + + @classmethod + def _from_device_memory( + cls: Type[T], data: Any, *, exposed: bool = False + ) -> T: + """Create a Buffer from an object exposing `__cuda_array_interface__`. + + No data is being copied. + + Parameters + ---------- + data : device-buffer-like + An object implementing the CUDA Array Interface. + exposed : bool, optional + Mark the buffer as zero copied. + + Returns + ------- + Buffer + Buffer representing the same device memory as `data` + """ + + # Bypass `__init__` and initialize attributes manually + ret = super()._from_device_memory(data) + ret._finalize_init() + ret._zero_copied = exposed + return ret + + @classmethod + def _from_host_memory(cls: Type[T], data: Any) -> T: + ret = super()._from_host_memory(data) + ret._finalize_init() + return ret + + @property + def _is_shared(self): + """ + Return `True` if `self`'s memory is shared with other columns. + """ + return len(self._instances) > 1 + + @property + def ptr(self) -> int: + """Device pointer to the start of the buffer. + + This will trigger a deep copy if there are any weak references. + The Buffer would be marked as zero copied. + """ + self._unlink_shared_buffers() + self._zero_copied = True + return self._ptr + + @property + def mutable_ptr(self) -> int: + """Device pointer to the start of the buffer. + + This will trigger a deep copy if there are any weak references. + """ + # Shouldn't need to mark the Buffer as zero copied, + # because this API is used by libcudf only to create + # mutable views. + self._unlink_shared_buffers() + return self._ptr + + def _getitem(self, offset: int, size: int) -> Buffer: + """ + Helper for `__getitem__` + """ + return self._from_device_memory( + cuda_array_interface_wrapper( + ptr=self._ptr + offset, size=size, owner=self.owner + ) + ) + + def copy(self, deep: bool = True): + """ + Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + If True, returns a deep-copy of the underlying Buffer data. + If False, returns a shallow-copy of the Buffer pointing to + the same underlying data. + + Returns + ------- + Buffer + """ + if deep or self._zero_copied: + return self._from_device_memory( + rmm.DeviceBuffer(ptr=self._ptr, size=self.size) + ) + else: + copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) + copied_buf._ptr = self._ptr + copied_buf._size = self._size + copied_buf._owner = self._owner + copied_buf._finalize_init() + return copied_buf + + @property + def __cuda_array_interface__(self) -> dict: + # Unlink if there are any weak references. + self._unlink_shared_buffers() + # Mark the Buffer as ``zero_copied=True``, + # which will prevent any copy-on-write + # mechanism post this operation. + # This is done because we don't have any + # control over knowing if a third-party library + # has modified the data this Buffer is + # pointing to. + self._zero_copied = True + return self._get_cuda_array_interface(readonly=False) + + def _get_cuda_array_interface(self, readonly=False): + return { + "data": (self._ptr, readonly), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + + @property + def _get_readonly_proxy_obj(self) -> dict: + """ + Returns a proxy object with a read-only CUDA Array Interface. + + See `Copy-on-write` section in `library_design.md` for + more information on this API. + """ + return cuda_array_interface_wrapper( + ptr=self._ptr, + size=self.size, + owner=self, + readonly=True, + typestr="|u1", + version=0, + ) + + def _unlink_shared_buffers(self): + """ + Unlinks a Buffer if it is shared with other buffers by + making a true deep-copy. + """ + if not self._zero_copied and self._is_shared: + # make a deep copy of existing DeviceBuffer + # and replace pointer to it. + current_buf = rmm.DeviceBuffer(ptr=self._ptr, size=self._size) + new_buf = current_buf.copy() + self._ptr = new_buf.ptr + self._size = new_buf.size + self._owner = new_buf + self._finalize_init() diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 6b99b875572..d22fb6fdc20 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -62,7 +62,7 @@ def __getitem__(self, i): class SpillableBuffer(Buffer): - """A spillable buffer that implements DeviceBufferLike. + """A spillable buffer that implements Buffer. This buffer supports spilling the represented data to host memory. Spilling can be done manually by calling `.spill(target="cpu")` but @@ -258,6 +258,10 @@ def ptr(self) -> int: self._last_accessed = time.monotonic() return self._ptr + @property + def mutable_ptr(self) -> int: + return self.get_ptr(spill_lock=SpillLock()) + def spill_lock(self, spill_lock: SpillLock) -> None: """Spill lock the buffer diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 062e86d0cb1..fc67138de42 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -7,8 +7,10 @@ from typing import Any, Dict, Optional, Tuple, Union from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.cow_buffer import CopyOnWriteBuffer from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock +from cudf.options import get_option def as_buffer( @@ -71,6 +73,14 @@ def as_buffer( "`data` is a buffer-like or array-like object" ) + if get_option("copy_on_write"): + if isinstance(data, Buffer) or hasattr( + data, "__cuda_array_interface__" + ): + return CopyOnWriteBuffer._from_device_memory(data, exposed=exposed) + if exposed: + raise ValueError("cannot created exposed host memory") + return CopyOnWriteBuffer._from_host_memory(data) if get_global_manager() is not None: if hasattr(data, "__cuda_array_interface__"): return SpillableBuffer._from_device_memory(data, exposed=exposed) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index ef9f515fff7..4b53c3ccd92 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -733,7 +733,7 @@ def categories(self) -> ColumnBase: @categories.setter def categories(self, value): - self.dtype = CategoricalDtype( + self._dtype = CategoricalDtype( categories=value, ordered=self.dtype.ordered ) @@ -1275,31 +1275,12 @@ def _get_decategorized_column(self) -> ColumnBase: return out def copy(self, deep: bool = True) -> CategoricalColumn: + result_col = super().copy(deep=deep) if deep: - copied_col = libcudf.copying.copy_column(self) - copied_cat = libcudf.copying.copy_column(self.dtype._categories) - - return column.build_categorical_column( - categories=copied_cat, - codes=column.build_column( - copied_col.base_data, dtype=copied_col.dtype - ), - offset=copied_col.offset, - size=copied_col.size, - mask=copied_col.base_mask, - ordered=self.dtype.ordered, - ) - else: - return column.build_categorical_column( - categories=self.dtype.categories._values, - codes=column.build_column( - self.codes.base_data, dtype=self.codes.dtype - ), - mask=self.base_mask, - ordered=self.dtype.ordered, - offset=self.offset, - size=self.size, + result_col.categories = libcudf.copying.copy_column( + self.dtype._categories ) + return result_col @cached_property def memory_usage(self) -> int: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e3c0abde66c..1f0b675f536 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -120,6 +120,30 @@ def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ return cuda.as_cuda_array(self.data).view(self.dtype) + @property + def _data_array_view(self) -> "cuda.devicearray.DeviceNDArray": + """ + Internal implementation for viewing the data as a device array object + without triggering a deep-copy. + """ + return cuda.as_cuda_array( + self.data._get_readonly_proxy_obj + if self.data is not None + else None + ).view(self.dtype) + + @property + def _mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": + """ + Internal implementation for viewing the mask as a device array object + without triggering a deep-copy. + """ + return cuda.as_cuda_array( + self.mask._get_readonly_proxy_obj + if self.mask is not None + else None + ).view(mask_dtype) + @property def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ @@ -163,7 +187,7 @@ def values_host(self) -> "np.ndarray": if self.has_nulls(): raise ValueError("Column must have no nulls.") - return self.data_array_view.copy_to_host() + return self._data_array_view.copy_to_host() @property def values(self) -> "cupy.ndarray": @@ -365,24 +389,59 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view + @property + def _nullmask(self) -> Buffer: + """The gpu buffer for the null-mask""" + if not self.nullable: + raise ValueError("Column has no null mask") + return self._mask_array_view + + def force_deep_copy(self: T) -> T: + """ + A method to create deep copy irrespective of whether + `copy-on-write` is enable. + """ + result = libcudf.copying.copy_column(self) + return cast(T, result._with_type_metadata(self.dtype)) + def copy(self: T, deep: bool = True) -> T: - """Columns are immutable, so a deep copy produces a copy of the - underlying data and mask and a shallow copy creates a new column and - copies the references of the data and mask. + """ + Makes a copy of the Column. + + Parameters + ---------- + deep : bool, default True + If True, a true physical copy of the column + is made. + If False and `copy_on_write` is False, the same + memory is shared between the buffers of the Column + and changes made to one Column will propagate to + it's copy and vice-versa. + If False and `copy_on_write` is True, the same + memory is shared between the buffers of the Column + until there is a write operation being performed on + them. """ if deep: - result = libcudf.copying.copy_column(self) - return cast(T, result._with_type_metadata(self.dtype)) + return self.force_deep_copy() else: return cast( T, build_column( - self.base_data, - self.dtype, - mask=self.base_mask, + data=self.base_data + if self.base_data is None + else self.base_data.copy(deep=False), + dtype=self.dtype, + mask=self.base_mask + if self.base_mask is None + else self.base_mask.copy(deep=False), size=self.size, offset=self.offset, - children=self.base_children, + children=tuple( + col.copy(deep=False) for col in self.base_children + ) + if cudf.get_option("copy_on_write") + else self.base_children, ), ) @@ -1301,11 +1360,12 @@ def column_empty_like( ): column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) + return build_column( data=None, dtype=dtype, mask=codes.base_mask, - children=(as_column(codes.base_data, dtype=codes.dtype),), + children=(codes,), size=codes.size, ) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 56436ac141d..449691750d8 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -279,6 +279,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: + output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3bd8f99c863..92bb0d2e4c5 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -198,6 +198,11 @@ def _with_type_metadata( return self + def copy(self, deep: bool = True): + # Since list columns are immutable, both deep and shallow copies share + # the underlying device data and mask. + return super().copy(deep=False) + def leaves(self): if isinstance(self.elements, ListColumn): return self.elements.leaves() diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 7943135afe1..37d1d7b6e61 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -13,7 +13,6 @@ cast, ) -import cupy import numpy as np import pandas as pd @@ -110,8 +109,8 @@ def __contains__(self, item: ScalarLike) -> bool: # Handles improper item types # Fails if item is of type None, so the handler. try: - if np.can_cast(item, self.data_array_view.dtype): - item = self.data_array_view.dtype.type(item) + if np.can_cast(item, self.dtype): + item = self.dtype.type(item) else: return False except (TypeError, ValueError): @@ -165,6 +164,7 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: + output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), @@ -573,14 +573,14 @@ def _find_value( found = 0 if len(self): found = find( - self.data_array_view, + self._data_array_view, value, mask=self.mask, ) if found == -1: if self.is_monotonic_increasing and closest: found = find( - self.data_array_view, + self._data_array_view, value, mask=self.mask, compare=compare, @@ -724,7 +724,7 @@ def to_pandas( pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) pd_series = pd.Series(pandas_array, copy=False) elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls(): - pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) + pd_series = pd.Series(self.values_host, copy=False) else: pd_series = self.to_arrow().to_pandas(**kwargs) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c5d5b396ebc..da717edf0e2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5251,6 +5251,11 @@ def __init__( self._start_offset = None self._end_offset = None + def copy(self, deep: bool = True): + # Since string columns are immutable, both deep + # and shallow copies share the underlying device data and mask. + return super().copy(deep=False) + @property def start_offset(self) -> int: if self._start_offset is None: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 69d70cf427f..6838d711641 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -95,7 +95,9 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def copy(self, deep=True): - result = super().copy(deep=deep) + # Since struct columns are immutable, both deep and + # shallow copies share the underlying device data and mask. + result = super().copy(deep=False) if deep: result = result._rename_fields(self.dtype.fields.keys()) return result diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 901547d94a9..74cb1efb3a5 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -129,7 +129,7 @@ def to_arrow(self) -> pa.Array: mask = None if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) - data = pa.py_buffer(self.as_numerical.data_array_view.copy_to_host()) + data = pa.py_buffer(self.as_numerical.values_host) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( type=pa_dtype, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index e1d12807fa8..707eda3f5e6 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -319,9 +319,9 @@ def copy(self, deep=False) -> ColumnAccessor: """ Make a copy of this ColumnAccessor. """ - if deep: + if deep or cudf.get_option("copy_on_write"): return self.__class__( - {k: v.copy(deep=True) for k, v in self._data.items()}, + {k: v.copy(deep=deep) for k, v in self._data.items()}, multiindex=self.multiindex, level_names=self.level_names, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b24895805a9..d073f17089c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6533,12 +6533,14 @@ def to_struct(self, name=None): field_names = [str(name) for name in self._data.names] col = cudf.core.column.build_struct_column( - names=field_names, children=self._data.columns, size=len(self) + names=field_names, + children=tuple( + [col.copy(deep=True) for col in self._data.columns] + ), + size=len(self), ) return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor( - {name: col.copy(deep=True)} - ), + cudf.core.column_accessor.ColumnAccessor({name: col}), index=self.index, name=name, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 32764c6c2f0..2e85d75eb71 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -1448,7 +1448,7 @@ def searchsorted( # Return result as cupy array if the values is non-scalar # If values is scalar, result is expected to be scalar. - result = cupy.asarray(outcol.data_array_view) + result = cupy.asarray(outcol._data_array_view) if scalar_flag: return result[0].item() else: @@ -1701,8 +1701,8 @@ def _colwise_binop( # that nulls that are present in both left_column and # right_column are not filled. if left_column.nullable and right_column.nullable: - lmask = as_column(left_column.nullmask) - rmask = as_column(right_column.nullmask) + lmask = as_column(left_column._nullmask) + rmask = as_column(right_column._nullmask) output_mask = (lmask | rmask).data left_column = left_column.fillna(fill_value) right_column = right_column.fillna(fill_value) @@ -1756,7 +1756,7 @@ def _apply_cupy_ufunc_to_operands( cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): if isinstance(inp, ColumnBase) and inp.has_nulls(): - new_mask = as_column(inp.nullmask) + new_mask = as_column(inp._nullmask) # TODO: This is a hackish way to perform a bitwise and # of bitmasks. Once we expose diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7c4eac59eab..783c3996400 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. from __future__ import annotations @@ -156,7 +156,7 @@ def __init__( source_data = {} for i, (column_name, col) in enumerate(codes._data.items()): - if -1 in col.values: + if -1 in col: level = cudf.DataFrame( {column_name: [None] + list(levels[i])}, index=range(-1, len(levels[i])), diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 64622bde86e..df1a543c4aa 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -584,9 +584,7 @@ def _tile(A, reps): mdata[var_name] = cudf.Series( cudf.core.column.build_categorical_column( categories=value_vars, - codes=cudf.core.column.as_column( - temp._column.base_data, dtype=temp._column.dtype - ), + codes=temp._column, mask=temp._column.base_mask, size=temp._column.size, offset=temp._column.offset, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index faad5275abd..74dfb69593e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -365,6 +365,9 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): name : str, optional The name to give to the Series. + copy : bool, default False + Copy input data. Only affects Series or 1d ndarray input. + nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -490,6 +493,7 @@ def __init__( index=None, dtype=None, name=None, + copy=False, nan_as_null=True, ): if isinstance(data, pd.Series): @@ -520,6 +524,8 @@ def __init__( if name is None: name = data.name data = data._column + if copy: + data = data.copy(deep=True) if dtype is not None: data = data.astype(dtype) @@ -538,7 +544,21 @@ def __init__( data = {} if not isinstance(data, ColumnBase): - data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype) + has_cai = ( + type( + inspect.getattr_static( + data, "__cuda_array_interface__", None + ) + ) + is property + ) + data = column.as_column( + data, + nan_as_null=nan_as_null, + dtype=dtype, + ) + if copy and has_cai: + data = data.copy(deep=True) else: if dtype is not None: data = data.astype(dtype) @@ -4959,10 +4979,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): index = as_index(a.index) a_col = column.as_column(a) - a_array = cupy.asarray(a_col.data_array_view) + a_array = cupy.asarray(a_col._data_array_view) b_col = column.as_column(b) - b_array = cupy.asarray(b_col.data_array_view) + b_array = cupy.asarray(b_col._data_array_view) result = cupy.isclose( a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index f80ad92879d..d98b194722a 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import os import textwrap @@ -150,6 +150,33 @@ def _validator(val): return _validator +def _cow_validator(val): + if get_option("spill") and val: + raise ValueError( + "Copy-on-write is not supported when spilling is enabled. " + "Please set `spill` to `False`" + ) + if val not in {False, True}: + raise ValueError( + f"{val} is not a valid option. Must be one of {{False, True}}." + ) + + +def _spill_validator(val): + try: + if get_option("copy_on_write") and val: + raise ValueError( + "Spilling is not supported when Copy-on-write is enabled. " + "Please set `copy_on_write` to `False`" + ) + except KeyError: + pass + if val not in {False, True}: + raise ValueError( + f"{val} is not a valid option. Must be one of {{False, True}}." + ) + + def _integer_validator(val): try: int(val) @@ -205,7 +232,6 @@ def _integer_and_none_validator(val): _make_contains_validator([None, 32, 64]), ) - _register_option( "spill", _env_get_bool("CUDF_SPILL", False), @@ -215,9 +241,28 @@ def _integer_and_none_validator(val): \tValid values are True or False. Default is False. """ ), - _make_contains_validator([False, True]), + _spill_validator, ) + +_register_option( + "copy_on_write", + _env_get_bool("CUDF_COPY_ON_WRITE", False), + textwrap.dedent( + """ + Default behavior of performing shallow copies. + If set to `False`, each shallow copy will perform a true shallow copy. + If set to `True`, each shallow copy will perform a shallow copy + with underlying data actually referring to the actual column, in this + case a copy is only made when there is a write operation performed on + the column. + \tValid values are True or False. Default is False. + """ + ), + _cow_validator, +) + + _register_option( "spill_on_demand", _env_get_bool("CUDF_SPILL_ON_DEMAND", True), diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index cbaf47a4c68..c27e1eaab36 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -346,6 +346,11 @@ def get_ptr(x) -> int: assert len(lhs.base_children) == len(rhs.base_children) for lhs_child, rhs_child in zip(lhs.base_children, rhs.base_children): assert_column_memory_eq(lhs_child, rhs_child) + if isinstance(lhs, cudf.core.column.CategoricalColumn) and isinstance( + rhs, cudf.core.column.CategoricalColumn + ): + assert_column_memory_eq(lhs.categories, rhs.categories) + assert_column_memory_eq(lhs.codes, rhs.codes) def assert_column_memory_ne( diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1d3d9e91ae2..1dd73a69384 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +import cupy as cp import numpy as np import pandas as pd import pytest @@ -52,3 +53,238 @@ def test_null_copy(): col = Series(np.arange(2049)) col[:] = None assert len(col) == 2049 + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_setitem_cow(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[1] = 100 + assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) + + new_copy[slice(2, 4, 1)] = 300 + if copy_on_write: + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + + if copy_on_write: + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + else: + assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + new_copy[slice(2, 4, 1)] = 300 + if copy_on_write: + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + new_slice = actual[2:] + assert new_slice._column.base_data._ptr == actual._column.base_data._ptr + new_slice[0:2] = 10 + assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) + if copy_on_write: + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) + + +def test_multiple_series_cow(): + cudf.set_option("copy_on_write", True) + s = cudf.Series([10, 20, 30, 40, 50]) + s1 = s.copy(deep=False) + s2 = s.copy(deep=False) + s3 = s.copy(deep=False) + s4 = s2.copy(deep=False) + s5 = s4.copy(deep=False) + s6 = s3.copy(deep=False) + + s1[0:3] = 10000 + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + for ser in [s, s2, s3, s4, s5, s6]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s6[0:3] = 3000 + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s2, s3, s4, s5]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s2[1:4] = 4000 + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3, s4, s5]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s4[2:4] = 5000 + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3, s5]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s5[2:4] = 6000 + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + s7 = s5.copy(deep=False) + assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) + s7[1:3] = 55 + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + for ser in [s3]: + assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + + del s2 + + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s4 + del s1 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s + del s6 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s5 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s3 + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_zero_copy(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series([1, 2, 3, 4, 5]) + s1 = s.copy(deep=False) + cp_array = cp.asarray(s) + + assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) + + cp_array[0:3] = 10 + + assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) + if copy_on_write: + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) + + s2 = cudf.Series(cp_array) + assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + s3 = s2.copy(deep=False) + cp_array[0] = 20 + + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) + if copy_on_write: + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + else: + assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) + assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) + if copy_on_write: + assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + else: + assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) + + s4 = cudf.Series([10, 20, 30, 40, 50]) + s5 = cudf.Series(s4) + assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) + s5[0:2] = 1 + assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) + assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_str_copy(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series(["a", "b", "c", "d", "e"]) + s1 = s.copy(deep=True) + s2 = s.copy(deep=True) + + assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s[0:3] = "abc" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s2[1:4] = "xyz" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_cat_copy(copy_on_write): + cudf.set_option("copy_on_write", copy_on_write) + s = cudf.Series([10, 20, 30, 40, 50], dtype="category") + s1 = s.copy(deep=True) + s2 = s1.copy(deep=True) + s3 = s1.copy(deep=True) + + s[0] = 50 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + + s2[3] = 10 + s3[2:5] = 20 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) + assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5b0eca1c635..32d83bb1c83 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -392,6 +392,7 @@ def test_index_copy_category(name, dtype, deep=True): with pytest.warns(FutureWarning): cidx_copy = cidx.copy(name=name, deep=deep, dtype=dtype) + assert_column_memory_ne(cidx._values, cidx_copy._values) assert_eq(pidx_copy, cidx_copy) @@ -410,7 +411,18 @@ def test_index_copy_category(name, dtype, deep=True): def test_index_copy_deep(idx, deep): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - if not deep: + + if ( + isinstance(idx, cudf.StringIndex) + or not deep + or (cudf.get_option("copy_on_write") and not deep) + ): + # StringColumn is immutable hence, deep copies of a + # StringIndex will share the same StringColumn. + + # When `copy_on_write` is turned on, Index objects will + # have unique column object but they all point to same + # data pointers. assert_column_memory_eq(idx._values, idx_copy._values) else: assert_column_memory_ne(idx._values, idx_copy._values) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d27d6732226..ee7c10d607a 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -781,13 +781,15 @@ def test_multiindex_copy_sem(data, levels, codes, names): ), ], ) +@pytest.mark.parametrize("copy_on_write", [True, False]) @pytest.mark.parametrize("deep", [True, False]) -def test_multiindex_copy_deep(data, deep): +def test_multiindex_copy_deep(data, copy_on_write, deep): """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ - same_ref = not deep + cudf.set_option("copy_on_write", copy_on_write) + same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) if isinstance(data, dict): import operator @@ -804,10 +806,10 @@ def test_multiindex_copy_deep(data, deep): lchildren = reduce(operator.add, lchildren) rchildren = reduce(operator.add, rchildren) - lptrs = [child.base_data.ptr for child in lchildren] - rptrs = [child.base_data.ptr for child in rchildren] + lptrs = [child.base_data._ptr for child in lchildren] + rptrs = [child.base_data._ptr for child in rchildren] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all((x == y) for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): mi1 = data diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6a1245a1582..b3c7c9ac9bb 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2080,3 +2080,25 @@ def test_series_duplicated(data, index, keep): ps = gs.to_pandas() assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + [10, 20, None, None], + ], +) +@pytest.mark.parametrize("copy", [True, False]) +def test_series_copy(data, copy): + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + + new_psr = pd.Series(psr, copy=copy) + new_gsr = cudf.Series(gsr, copy=copy) + + new_psr.iloc[0] = 999 + new_gsr.iloc[0] = 999 + + assert_eq(psr, gsr) + assert_eq(new_psr, new_gsr) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 715d17169a6..6478fbaad95 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -531,7 +531,6 @@ def test_nans_stats(data, ops, skipna): getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = _create_pandas_series(data) gsr = cudf.Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 89331b933a8..b7677f7b43e 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import functools from typing import Any, Dict @@ -110,7 +110,7 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): col = cudf.core.dataframe.extract_col(df, k) if not col.nullable: continue - nullmask = df[k].nullmask + nullmask = cudf.Series(df[k]._column._nullmask) if out_mask is None: out_mask = column.as_column( From a857ad9efdffb3520d5e8bd2ccbd9afa61d563a8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 26 Jan 2023 09:45:17 -0600 Subject: [PATCH 02/31] [REVIEW] Update `copy-on-write` with `branch-23.02` changes (#12556) * get_ptr & _array_view refactor * Apply suggestions from code review Co-authored-by: Lawrence Mitchell * address reviews * Apply suggestions from code review Co-authored-by: Mads R. B. Kristensen * drop internal_write * add docstring * Apply suggestions from code review Co-authored-by: Lawrence Mitchell * address reviews * address reviews * add locks * Apply suggestions from code review Co-authored-by: Mads R. B. Kristensen * Apply suggestions from code review Co-authored-by: Vyas Ramasubramani * make mode a required key-arg * rename to _readonly_proxy_cai_obj * Update python/cudf/cudf/core/column/column.py Co-authored-by: Lawrence Mitchell * revert * fix * Apply suggestions from code review Co-authored-by: Lawrence Mitchell Co-authored-by: Mads R. B. Kristensen Co-authored-by: Vyas Ramasubramani --- .../source/developer_guide/library_design.md | 4 +- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/core/buffer/buffer.py | 14 +++--- python/cudf/cudf/core/buffer/cow_buffer.py | 44 ++++++++++--------- .../cudf/cudf/core/buffer/spillable_buffer.py | 5 +-- python/cudf/cudf/core/column/column.py | 9 +--- python/cudf/cudf/core/frame.py | 11 +++-- python/cudf/cudf/utils/applyutils.py | 3 +- 8 files changed, 48 insertions(+), 44 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 83d92df96a7..0e87c46faab 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -347,13 +347,13 @@ rather than a copy-on-write shallow copy with weak references. ### How to obtain read-only object? A read-only object can be quite useful for operations that will not -mutate the data. This can be achieved by calling `._get_readonly_proxy_obj` +mutate the data. This can be achieved by calling `._readonly_proxy_cai_obj` API, this API will return a proxy object that has `__cuda_array_interface__` implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` has weak references. It is only recommended to use this API as long as the objects/arrays created with this proxy object gets cleaned up during the developer code execution. We currently use this API for device to host -copies like in `ColumnBase._data_array_view` which is used for `Column.values_host`. +copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. Notes: 1. Weak references are implemented only for fixed-width data types as these are only column diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 62bd4991aab..365524ab745 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -211,7 +211,7 @@ cdef class Column: mask = None elif type(value_cai) is property: if isinstance(value, CopyOnWriteBuffer): - value = value._get_readonly_proxy_obj + value = value._readonly_proxy_cai_obj if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view(mode="write") diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index cd1995edebf..17ca20e09e2 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -209,12 +209,15 @@ def copy(self, deep: bool = True): ------- Buffer """ - if not deep: - return self[:] + if deep: + with cudf.core.buffer.acquire_spill_lock(): + return self._from_device_memory( + rmm.DeviceBuffer( + ptr=self.get_ptr(mode="read"), size=self.size + ) + ) else: - return self._from_device_memory( - rmm.DeviceBuffer(ptr=self.ptr, size=self.size) - ) + return self[:] @property def size(self) -> int: @@ -293,6 +296,7 @@ def get_ptr(self, *, mode) -> int: See Also -------- SpillableBuffer.get_ptr + CopyOnWriteBuffer.get_ptr """ return self._ptr diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index e322912ed4f..fdd1855eac7 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -95,28 +95,32 @@ def _is_shared(self): """ return len(self._instances) > 1 - @property - def ptr(self) -> int: - """Device pointer to the start of the buffer. - - This will trigger a deep copy if there are any weak references. - The Buffer would be marked as zero copied. - """ - self._unlink_shared_buffers() - self._zero_copied = True - return self._ptr - - @property - def mutable_ptr(self) -> int: + def get_ptr(self, mode: str = "write") -> int: """Device pointer to the start of the buffer. - This will trigger a deep copy if there are any weak references. + Parameters + ---------- + mode : str, default 'write' + Supported values are {"read", "write"} + If "write", when weak-references exist, they + are unlinked and the data pointed to may be modified + by the caller. If "read", the data pointed to + must not be modified by the caller. + Failure to fulfill this contract will cause + incorrect behavior. + + See Also + -------- + Buffer.get_ptr + SpillableBuffer.get_ptr """ - # Shouldn't need to mark the Buffer as zero copied, - # because this API is used by libcudf only to create - # mutable views. - self._unlink_shared_buffers() - return self._ptr + if mode == "read": + return self._ptr + elif mode == "write": + self._unlink_shared_buffers() + return self._ptr + else: + raise ValueError(f"Incorrect mode passed : {mode}") def _getitem(self, offset: int, size: int) -> Buffer: """ @@ -179,7 +183,7 @@ def _get_cuda_array_interface(self, readonly=False): } @property - def _get_readonly_proxy_obj(self) -> dict: + def _readonly_proxy_cai_obj(self) -> dict: """ Returns a proxy object with a read-only CUDA Array Interface. diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 19c7138a719..6d4404c9609 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -343,10 +343,6 @@ def mark_exposed(self) -> None: self._exposed = True self._last_accessed = time.monotonic() - @property - def mutable_ptr(self) -> int: - return self.get_ptr() - def spill_lock(self, spill_lock: SpillLock) -> None: """Spill lock the buffer @@ -556,6 +552,7 @@ def get_ptr(self, *, mode) -> int: A passthrough method to `SpillableBuffer.get_ptr` with factoring in the `offset`. """ + return self._base.get_ptr(mode=mode) + self._offset def _getitem(self, offset: int, size: int) -> Buffer: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 920469af173..9e2e0d8b799 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -418,13 +418,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - @property - def _nullmask(self) -> Buffer: - """The gpu buffer for the null-mask""" - if not self.nullable: - raise ValueError("Column has no null mask") - return self._mask_array_view - def force_deep_copy(self: T) -> T: """ A method to create deep copy irrespective of whether @@ -1923,6 +1916,8 @@ def as_column( arbitrary = cupy.ascontiguousarray(arbitrary) data = as_buffer(arbitrary) + if cudf.get_option("copy_on_write"): + data._zero_copied = True col = build_column(data, dtype=current_dtype, mask=mask) if dtype is not None: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9089b9c0f90..ea6a6de0b2b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -29,6 +29,7 @@ from cudf import _lib as libcudf from cudf._typing import Dtype from cudf.api.types import is_dtype_equal, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, as_column, @@ -1701,9 +1702,10 @@ def _colwise_binop( # that nulls that are present in both left_column and # right_column are not filled. if left_column.nullable and right_column.nullable: - lmask = as_column(left_column._nullmask) - rmask = as_column(right_column._nullmask) - output_mask = (lmask | rmask).data + with acquire_spill_lock(): + lmask = as_column(left_column.nullmask) + rmask = as_column(right_column.nullmask) + output_mask = (lmask | rmask).data left_column = left_column.fillna(fill_value) right_column = right_column.fillna(fill_value) elif left_column.nullable: @@ -1738,6 +1740,7 @@ def _colwise_binop( def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) + @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs ): @@ -1756,7 +1759,7 @@ def _apply_cupy_ufunc_to_operands( cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): if isinstance(inp, ColumnBase) and inp.has_nulls(): - new_mask = as_column(inp._nullmask) + new_mask = as_column(inp.nullmask) # TODO: This is a hackish way to perform a bitwise and # of bitmasks. Once we expose diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index 186b897b936..c7315a1ecb9 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -105,6 +105,7 @@ def apply_chunks( return applychunks.run(df, chunks=chunks, tpb=tpb) +@acquire_spill_lock() def make_aggregate_nullmask(df, columns=None, op="__and__"): out_mask = None @@ -112,7 +113,7 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): col = cudf.core.dataframe.extract_col(df, k) if not col.nullable: continue - nullmask = cudf.Series(df[k]._column._nullmask) + nullmask = cudf.Series(df[k]._column.nullmask) if out_mask is None: out_mask = column.as_column( From 5b545192cf192dccca36f4b1d190212acc99caec Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 26 Jan 2023 08:54:04 -0800 Subject: [PATCH 03/31] update docs --- docs/cudf/source/developer_guide/library_design.md | 4 ++-- docs/cudf/source/user_guide/copy-on-write.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 0e87c46faab..9dd0c9bfcdd 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,9 +325,9 @@ the public usage [here](copy-on-write-user-doc) of this functionality before rea below. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer.ptr` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. +With the help of `CopyOnWriteBuffer._ptr` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. This is a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. This -means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `.ptr` +means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `._ptr` i.e., if they are all pointing to the same device memory. When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 14ca3656250..25821d4667f 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -81,9 +81,7 @@ the original object it was viewing and thus a separate copy is created and then ## Advantages -1. With copy-on-write enabled and by requesting `.copy(deep=False)`, the GPU memory usage can be reduced drastically if you are not performing -write operations on all of those copies. This will also increase the speed at which objects are created for execution of your ETL workflow. -2. With the concept of views going away, every object is a copy of it's original object. This will bring consistency across operations and cudf closer to parity with +1. With the concept of views going away, every object is a copy of it's original object. This will bring consistency across operations and cudf closer to parity with pandas. Following is one of the inconsistency: ```python @@ -158,6 +156,8 @@ dtype: int64 4 5 dtype: int64 ``` +2. There are numerous other inconsistencies, which are solved by copy-on-write. Read more about them [here](https://phofl.github.io/cow-introduction.html). + ## How to disable it From d46c3a889f0dec7dff20419ca37c5b86da3493d9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 26 Jan 2023 09:04:51 -0800 Subject: [PATCH 04/31] simplify docstring --- python/cudf/cudf/options.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index d98b194722a..dae24b827c2 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -251,11 +251,9 @@ def _integer_and_none_validator(val): textwrap.dedent( """ Default behavior of performing shallow copies. - If set to `False`, each shallow copy will perform a true shallow copy. - If set to `True`, each shallow copy will perform a shallow copy - with underlying data actually referring to the actual column, in this - case a copy is only made when there is a write operation performed on - the column. + If set to `False`, disables copy-on-write. + If set to `True`, enables copy-on-write. + Read more at: :ref: copy-on-write-user-doc \tValid values are True or False. Default is False. """ ), From ae95e48854785393f877f4f48959ce93302d5234 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 26 Jan 2023 10:31:43 -0800 Subject: [PATCH 05/31] revert redundant changes --- python/cudf/cudf/_lib/column.pyx | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 365524ab745..8471e3a4b95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -import inspect import cupy as cp import numpy as np @@ -198,20 +197,9 @@ cdef class Column: "expected " + str(required_num_bytes) + " bytes." ) - # Because hasattr will trigger invocation of - # `__cuda_array_interface__` which could - # be expensive in CopyOnWriteBuffer case. - value_cai = inspect.getattr_static( - value, - "__cuda_array_interface__", - None - ) - if value is None: mask = None - elif type(value_cai) is property: - if isinstance(value, CopyOnWriteBuffer): - value = value._readonly_proxy_cai_obj + elif hasattr(value, "__cuda_array_interface__"): if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): if isinstance(value, Column): value = value.data_array_view(mode="write") From ee78be8fc9779ff6f93f875b85cccc9a3483ac94 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Jan 2023 07:02:45 -0600 Subject: [PATCH 06/31] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani Co-authored-by: Lawrence Mitchell --- .../source/developer_guide/library_design.md | 50 ++++++++----------- docs/cudf/source/user_guide/copy-on-write.md | 15 +++--- python/cudf/cudf/_lib/column.pyx | 3 -- python/cudf/cudf/core/buffer/cow_buffer.py | 28 +++-------- .../cudf/cudf/core/buffer/spillable_buffer.py | 1 - python/cudf/cudf/core/column/column.py | 5 +- python/cudf/cudf/core/column/datetime.py | 3 +- python/cudf/cudf/core/column/numerical.py | 1 - python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/options.py | 3 +- 10 files changed, 41 insertions(+), 70 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 9dd0c9bfcdd..a812624bc23 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -318,49 +318,43 @@ However, for performance reasons they frequently access internal attributes and ## Copy-on-write - -Copy-on-write (COW) is designed to reduce memory footprint on GPUs. With this feature, a copy (`.copy(deep=False)`) is only really made whenever -there is a write operation on a column. It is first recommended to see -the public usage [here](copy-on-write-user-doc) of this functionality before reading through the internals +This section describes the internal implementation details of the copy-on-write feature. +It is recommended that developers familiarize themselves with [the user-facing documentation](copy-on-write-user-doc) of this functionality before reading through the internals below. -The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. This class stores the pointer to the device memory and size. -With the help of `CopyOnWriteBuffer._ptr` we generate [weak references](https://docs.python.org/3/library/weakref.html) of `CopyOnWriteBuffer` and store it in `CopyOnWriteBuffer._instances`. -This is a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. This -means all the new `CopyOnWriteBuffer`s that are created map to the same key in `CopyOnWriteBuffer._instances` if they have same `._ptr` -i.e., if they are all pointing to the same device memory. - -When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This class contains all the -mechanisms to enable copy-on-write for all buffers. When a `CopyOnWriteBuffer` is created, its weakref is generated and added to the `WeakSet` which is in turn stored in `CopyOnWriterBuffer._instances`. This will later serve as an indication of whether or not to make a copy when a -when write operation is performed on a `Column` (see below). +The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. +When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. +This subclass of `cudf.Buffer` contains all the mechanisms to enable copy-on-write behavior. +The class stores [weak references](https://docs.python.org/3/library/weakref.html) to every existing `CopyOnWriteBuffer` in `CopyOnWriteBuffer._instances`, a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. +This means that all `CopyOnWriteBuffer`s that point to the same device memory are contained in the same `WeakSet` (corresponding to the same `ptr` key) in `CopyOnWriteBuffer._instances`. +This data structure is then used to determine whether or not to make a copy when a write operation is performed on a `Column` (see below). +If multiple buffers point to the same underlying memory, then a copy must be made whenever a modification is attempted. ### Eager copies when exposing to third-party libraries -If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred without introspection. Hence whenever +If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling `_unlink_shared_buffers` which ensures a true copy of underlying device data is made and -unlinks the buffer from any shared "weak" references. Any future shallow-copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object), to handle this we also mark the `Column`/`CopyOnWriteBuffer` as -`obj._zero_copied=True` thus indicating any future shallow-copy requests will trigger a true physical copy +unlinks the buffer from any shared "weak" references. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`CopyOnWriteBuffer` as +`obj._zero_copied=True` thus indicating that any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy with weak references. -### How to obtain read-only object? +### Obtaining a read-only object A read-only object can be quite useful for operations that will not -mutate the data. This can be achieved by calling `._readonly_proxy_cai_obj` -API, this API will return a proxy object that has `__cuda_array_interface__` +mutate the data. This can be achieved by calling `._readonly_proxy_cai_obj`. +This function will return a proxy object that has `__cuda_array_interface__` implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` -has weak references. It is only recommended to use this API as long as -the objects/arrays created with this proxy object gets cleaned up during -the developer code execution. We currently use this API for device to host +has weak references. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. -Notes: -1. Weak references are implemented only for fixed-width data types as these are only column +### Variable width data types +Weak references are implemented only for fixed-width data types as these are only column types that can be mutated in place. -2. Deep copies of variable width data types return shallow-copies of the Columns, because these -types don't support real in-place mutations to the data. We just mimic in such a way that it looks -like an in-place operation using `_mimic_inplace`. +Deep copies of variable width data types return shallow-copies of the Columns, because these +types don't support real in-place mutations to the data. +Internally, we mimic in-place mutations using `_mimic_inplace`, but the resulting data is always a deep copy of the underlying data. ### Examples @@ -488,6 +482,6 @@ If we inspect the memory address of the data, the addresses of `s2` and `s3` rem 139796315879723 ``` -cudf Copy-on-write implementation is motivated by pandas Copy-on-write proposal here: +cudf's copy-on-write implementation is motivated by the pandas proposals documented here: 1. [Google doc](https://docs.google.com/document/d/1ZCQ9mx3LBMy-nhwRl33_jgcvWo9IWdEfxDNQ2thyTb0/edit#heading=h.iexejdstiz8u) 2. [Github issue](https://github.com/pandas-dev/pandas/issues/36195) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 25821d4667f..5ad4ca109af 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -2,8 +2,11 @@ # Copy-on-write -Copy-on-write reduces GPU memory usage when copies(`.copy(deep=False)`) of a column -are made. +Copy-on-write is a memory management strategy that allows multiple cudf objects containing the same data to refer to the same memory address as long as neither of them modify the underlying data. +With this approach, any operation that generates an unmodified view of an object (such as copies, slices, or methods like `DataFrame.head`) returns a new object that points to the same memory as the original. +However, when either the new object is modified, a new copy of the data is made prior to the modification, ensuring that the changes do not propagate back to the original object. +The same behavior also works in the other direction, i.e. modifications of the original object will not propagate to the new object. +This behavior is best understood by looking at the examples below. | | Copy-on-Write enabled | Copy-on-Write disabled (default) | |---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| @@ -76,13 +79,11 @@ dtype: int64 ## Notes -When copy-on-write is enabled, there is no concept of views. i.e., modifying any view created inside cudf will not actually not modify -the original object it was viewing and thus a separate copy is created and then modified. +When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cudf will always trigger a copy and will not modify the original object. ## Advantages -1. With the concept of views going away, every object is a copy of it's original object. This will bring consistency across operations and cudf closer to parity with -pandas. Following is one of the inconsistency: +1. Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: ```python @@ -118,7 +119,7 @@ dtype: int64 dtype: int64 ``` -The above inconsistency is solved when Copy-on-write is enabled: +The above inconsistency is solved when copy-on-write is enabled: ```python >>> import pandas as pd diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 8471e3a4b95..047822be0b4 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -196,7 +196,6 @@ cdef class Column: "The value for mask is smaller than expected, got {} bytes, " "expected " + str(required_num_bytes) + " bytes." ) - if value is None: mask = None elif hasattr(value, "__cuda_array_interface__"): @@ -297,7 +296,6 @@ cdef class Column: instead replaces the Buffers and other attributes underneath the column object with the Buffers and attributes from the other column. """ - if inplace: self._offset = other_col.offset self._size = other_col.size @@ -313,7 +311,6 @@ cdef class Column: return self._view(libcudf_types.UNKNOWN_NULL_COUNT).null_count() cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): col = self.base_children[0] else: diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index fdd1855eac7..e2d2ac64603 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -27,7 +27,7 @@ def _keys_cleanup(ptr): class CopyOnWriteBuffer(Buffer): - """A Copy-on-write buffer that implements Buffer. + """A copy-on-write buffer that implements Buffer. This buffer enables making copies of data only when there is a write operation being performed. @@ -39,11 +39,11 @@ class CopyOnWriteBuffer(Buffer): instance. """ - # This dict keeps track of all instances that have the same `ptr` - # and `size` attributes. Each key of the dict is a `(ptr, size)` - # tuple and the corresponding value is a set of weak references to - # instances with that `ptr` and `size`. _instances: DefaultDict[Tuple, WeakSet] = defaultdict(WeakSet) + """This dict keeps track of all instances that have the same `ptr` + and `size` attributes. Each key of the dict is a `(ptr, size)` + tuple and the corresponding value is a set of weak references to + instances with that `ptr` and `size`.""" # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. @@ -133,24 +133,8 @@ def _getitem(self, offset: int, size: int) -> Buffer: ) def copy(self, deep: bool = True): - """ - Return a copy of Buffer. - - Parameters - ---------- - deep : bool, default True - If True, returns a deep-copy of the underlying Buffer data. - If False, returns a shallow-copy of the Buffer pointing to - the same underlying data. - - Returns - ------- - Buffer - """ if deep or self._zero_copied: - return self._from_device_memory( - rmm.DeviceBuffer(ptr=self._ptr, size=self.size) - ) + return super().copy(deep=True) else: copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) copied_buf._ptr = self._ptr diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 6d4404c9609..74da9c2f257 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -552,7 +552,6 @@ def get_ptr(self, *, mode) -> int: A passthrough method to `SpillableBuffer.get_ptr` with factoring in the `offset`. """ - return self._base.get_ptr(mode=mode) + self._offset def _getitem(self, offset: int, size: int) -> Buffer: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9e2e0d8b799..47897beabb1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -421,7 +421,7 @@ def nullmask(self) -> Buffer: def force_deep_copy(self: T) -> T: """ A method to create deep copy irrespective of whether - `copy-on-write` is enable. + `copy-on-write` is enabled. """ result = libcudf.copying.copy_column(self) return cast(T, result._with_type_metadata(self.dtype)) @@ -438,7 +438,7 @@ def copy(self: T, deep: bool = True) -> T: If False and `copy_on_write` is False, the same memory is shared between the buffers of the Column and changes made to one Column will propagate to - it's copy and vice-versa. + its copy and vice-versa. If False and `copy_on_write` is True, the same memory is shared between the buffers of the Column until there is a write operation being performed on @@ -1382,7 +1382,6 @@ def column_empty_like( ): column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) - return build_column( data=None, dtype=dtype, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 449691750d8..56436ac141d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -279,7 +279,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 152f1a78625..87e73d212ef 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -168,7 +168,6 @@ def __setitem__(self, key: Any, value: Any): @property def __cuda_array_interface__(self) -> Mapping[str, Any]: - output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e39ea68838f..58ed7ae5127 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6543,7 +6543,7 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, children=tuple( - [col.copy(deep=True) for col in self._data.columns] + col.copy(deep=True) for col in self._data.columns ), size=len(self), ) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index dae24b827c2..e44a36eedc2 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -166,7 +166,7 @@ def _spill_validator(val): try: if get_option("copy_on_write") and val: raise ValueError( - "Spilling is not supported when Copy-on-write is enabled. " + "Spilling is not supported when copy-on-write is enabled. " "Please set `copy_on_write` to `False`" ) except KeyError: @@ -250,7 +250,6 @@ def _integer_and_none_validator(val): _env_get_bool("CUDF_COPY_ON_WRITE", False), textwrap.dedent( """ - Default behavior of performing shallow copies. If set to `False`, disables copy-on-write. If set to `True`, enables copy-on-write. Read more at: :ref: copy-on-write-user-doc From a02a1506be3f9ee0d8c2e9f8fd67883ddcce7d81 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Jan 2023 07:36:21 -0600 Subject: [PATCH 07/31] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/core/buffer/cow_buffer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index e2d2ac64603..f4fed706fce 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -136,7 +136,8 @@ def copy(self, deep: bool = True): if deep or self._zero_copied: return super().copy(deep=True) else: - copied_buf = CopyOnWriteBuffer.__new__(CopyOnWriteBuffer) + cls = type(self) + copied_buf = cls.__new__(cls) copied_buf._ptr = self._ptr copied_buf._size = self._size copied_buf._owner = self._owner From 7c843438570979da1e34a3db4541e98f07d01c07 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 27 Jan 2023 12:31:03 -0800 Subject: [PATCH 08/31] style --- python/cudf/cudf/core/buffer/cow_buffer.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index f4fed706fce..c601c0193e1 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -43,7 +43,7 @@ class CopyOnWriteBuffer(Buffer): """This dict keeps track of all instances that have the same `ptr` and `size` attributes. Each key of the dict is a `(ptr, size)` tuple and the corresponding value is a set of weak references to - instances with that `ptr` and `size`.""" + instances with that `ptr` and `size`.""" # TODO: This is synonymous to SpillableBuffer._exposed attribute # and has to be merged. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58ed7ae5127..517aa6eb1ae 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6542,9 +6542,7 @@ def to_struct(self, name=None): col = cudf.core.column.build_struct_column( names=field_names, - children=tuple( - col.copy(deep=True) for col in self._data.columns - ), + children=tuple(col.copy(deep=True) for col in self._data.columns), size=len(self), ) return cudf.Series._from_data( From b69c34de340330cdbe4ad33ebb129ac9ec90954e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 30 Jan 2023 14:10:35 -0800 Subject: [PATCH 09/31] address reviews in code --- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/core/buffer/buffer.py | 9 +- python/cudf/cudf/core/buffer/cow_buffer.py | 10 -- .../cudf/cudf/core/buffer/spillable_buffer.py | 5 + python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/testing/_utils.py | 2 + python/cudf/cudf/tests/test_copying.py | 123 ++++++++++++------ python/cudf/cudf/utils/applyutils.py | 11 +- 8 files changed, 99 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 047822be0b4..d6aac597c17 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -547,7 +547,8 @@ cdef class Column: exposed=True, ) if isinstance(data_owner, CopyOnWriteBuffer): - data_owner.ptr # accessing the pointer marks it exposed. + data_owner.get_ptr(mode="write") + # accessing the pointer marks it exposed. elif isinstance(data_owner, SpillableBuffer): if data_owner.is_spilled: raise ValueError( diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 17ca20e09e2..fb5675513ae 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -210,12 +210,9 @@ def copy(self, deep: bool = True): Buffer """ if deep: - with cudf.core.buffer.acquire_spill_lock(): - return self._from_device_memory( - rmm.DeviceBuffer( - ptr=self.get_ptr(mode="read"), size=self.size - ) - ) + return self._from_device_memory( + rmm.DeviceBuffer(ptr=self.get_ptr(mode="read"), size=self.size) + ) else: return self[:] diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index c601c0193e1..353d894c64d 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -122,16 +122,6 @@ def get_ptr(self, mode: str = "write") -> int: else: raise ValueError(f"Incorrect mode passed : {mode}") - def _getitem(self, offset: int, size: int) -> Buffer: - """ - Helper for `__getitem__` - """ - return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=self._ptr + offset, size=size, owner=self.owner - ) - ) - def copy(self, deep: bool = True): if deep or self._zero_copied: return super().copy(deep=True) diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 74da9c2f257..fdcc6797916 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -278,6 +278,11 @@ def _from_host_memory(cls: Type[T], data: Any) -> T: def is_spilled(self) -> bool: return self._ptr_desc["type"] != "gpu" + def copy(self, deep: bool = True): + spill_lock = SpillLock() + self.spill_lock(spill_lock=spill_lock) + return super().copy(deep=deep) + def spill(self, target: str = "cpu") -> None: """Spill or un-spill this buffer in-place diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 47897beabb1..8ea0446d4bd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -461,9 +461,7 @@ def copy(self: T, deep: bool = True) -> T: offset=self.offset, children=tuple( col.copy(deep=False) for col in self.base_children - ) - if cudf.get_option("copy_on_write") - else self.base_children, + ), ), ) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 061bad1d44b..4e7b0a6e035 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,6 +15,7 @@ import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils import dtypes as dtypeutils @@ -325,6 +326,7 @@ def does_not_raise(): yield +@acquire_spill_lock() def assert_column_memory_eq( lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase ): diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1dd73a69384..1a3f9f6fd23 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -55,58 +55,72 @@ def test_null_copy(): assert len(col) == 2049 -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_setitem_cow(copy_on_write): - cudf.set_option("copy_on_write", copy_on_write) +def test_series_setitem_cow_on(): + cudf.set_option("copy_on_write", True) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) actual[1] = 100 assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - if copy_on_write: - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) actual[slice(0, 2, 1)] = 100 assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - if copy_on_write: - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) new_copy[slice(2, 4, 1)] = 300 - if copy_on_write: - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - else: - assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - if copy_on_write: - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - else: - assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + new_slice = actual[2:] + assert new_slice._column.base_data._ptr == actual._column.base_data._ptr + new_slice[0:2] = 10 + assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + + +def test_series_setitem_cow_off(): + cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) + actual[1] = 100 + assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) + new_copy[slice(2, 4, 1)] = 300 - if copy_on_write: - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) + assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) + + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] assert new_slice._column.base_data._ptr == actual._column.base_data._ptr new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) - if copy_on_write: - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) + assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) def test_multiple_series_cow(): @@ -199,9 +213,43 @@ def test_multiple_series_cow(): assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_zero_copy(copy_on_write): - cudf.set_option("copy_on_write", copy_on_write) +def test_series_zero_copy_cow_on(): + cudf.set_option("copy_on_write", True) + s = cudf.Series([1, 2, 3, 4, 5]) + s1 = s.copy(deep=False) + cp_array = cp.asarray(s) + + assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) + + cp_array[0:3] = 10 + + assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) + + s2 = cudf.Series(cp_array) + assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + s3 = s2.copy(deep=False) + cp_array[0] = 20 + + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) + assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + + s4 = cudf.Series([10, 20, 30, 40, 50]) + s5 = cudf.Series(s4) + assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) + s5[0:2] = 1 + assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) + assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + + +def test_series_zero_copy_cow_off(): + cudf.set_option("copy_on_write", False) s = cudf.Series([1, 2, 3, 4, 5]) s1 = s.copy(deep=False) cp_array = cp.asarray(s) @@ -213,10 +261,7 @@ def test_series_zero_copy(copy_on_write): cp_array[0:3] = 10 assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - if copy_on_write: - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) s2 = cudf.Series(cp_array) @@ -225,16 +270,10 @@ def test_series_zero_copy(copy_on_write): cp_array[0] = 20 assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - if copy_on_write: - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - else: - assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - if copy_on_write: - assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) - else: - assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) s4 = cudf.Series([10, 20, 30, 40, 50]) s5 = cudf.Series(s4) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index c7315a1ecb9..933b98367b6 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -113,17 +113,16 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): col = cudf.core.dataframe.extract_col(df, k) if not col.nullable: continue - nullmask = cudf.Series(df[k]._column.nullmask) + nullmask = column.as_column(df[k]._column.nullmask) if out_mask is None: out_mask = column.as_column( nullmask.copy(), dtype=utils.mask_dtype ) - continue - - out_mask = libcudf.binaryop.binaryop( - column.as_column(nullmask), out_mask, op, out_mask.dtype - ) + else: + out_mask = libcudf.binaryop.binaryop( + nullmask, out_mask, op, out_mask.dtype + ) return out_mask From 60d009f6ac01c311cb1e5c356cfe66103f8e74ad Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 30 Jan 2023 16:18:44 -0800 Subject: [PATCH 10/31] address reviews in docs --- .../source/developer_guide/library_design.md | 17 +++++++ docs/cudf/source/user_guide/copy-on-write.md | 49 ++++++++++--------- python/cudf/cudf/core/buffer/cow_buffer.py | 3 +- python/cudf/cudf/core/series.py | 7 +++ python/cudf/cudf/options.py | 2 +- 5 files changed, 52 insertions(+), 26 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index a812624bc23..8a353005a4b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -229,6 +229,7 @@ Additionally, parameters are: of `` in bytes. This introduces a modest overhead and is **disabled by default**. Furthermore, this is a *soft* limit. The memory usage might exceed the limit if too many buffers are unspillable. +(Buffer-design)= #### Design Spilling consists of two components: @@ -316,6 +317,8 @@ Internally, these objects typically interact with cuDF objects at the Frame laye However, for performance reasons they frequently access internal attributes and methods of `Frame` and its subclasses. +(copy-on-write-dev-doc)= + ## Copy-on-write This section describes the internal implementation details of the copy-on-write feature. @@ -349,6 +352,20 @@ implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` has weak references. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. + +### Internal access to raw data pointers + +Since it is unsafe to access the raw pointer associated with a buffer when +copy-on-write is enabled, in addition to the readonly proxy object described above, +access to the pointer is gated through `Buffer.get_ptr`. This method accepts a mode +argument through which the caller indicates how they will access the data associated +with the buffer. If only read-only access is required (`mode="read"`), this indicates +that the caller has no intention of modifying the buffer through this pointer. +In this case, any shallow copies are not unlinked. In contrast, if modification is +required one may pass `mode="write"`, provoking unlinking of any shallow copies. +The same mechanism is also used for `SpillableBuffer`, though in that case one must +also consider acquiring a spill lock (see `Buffer` [design](Buffer-design) for more details). + ### Variable width data types Weak references are implemented only for fixed-width data types as these are only column types that can be mutated in place. diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 5ad4ca109af..d8c102d2f46 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -8,28 +8,32 @@ However, when either the new object is modified, a new copy of the data is made The same behavior also works in the other direction, i.e. modifications of the original object will not propagate to the new object. This behavior is best understood by looking at the examples below. -| | Copy-on-Write enabled | Copy-on-Write disabled (default) | -|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | -| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | ## How to enable it -i. Use `cudf.set_option`: +1. Use `cudf.set_option`: -```python ->>> import cudf ->>> cudf.set_option("copy_on_write", True) -``` + ```python + >>> import cudf + >>> cudf.set_option("copy_on_write", True) + ``` -ii. Set the environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the +2. Set the environment variable ``CUDF_COPY_ON_WRITE`` to ``1`` prior to the launch of the Python interpreter: -```bash -export CUDF_COPY_ON_WRITE="1" python -c "import cudf" -``` + ```bash + export CUDF_COPY_ON_WRITE="1" python -c "import cudf" + ``` + +## How to disable it +Copy-on-write can be disable by setting ``copy_on_write`` cudf option to ``False``: + +```python +>>> cudf.set_option("copy_on_write", False) +``` + ## Making copies There are no additional changes required in the code to make use of copy-on-write. @@ -81,6 +85,15 @@ dtype: int64 When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cudf will always trigger a copy and will not modify the original object. +### Explicit deep and shallow copies comparison + + +| | Copy-on-Write enabled | Copy-on-Write disabled (default) | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | + + ## Advantages 1. Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: @@ -158,13 +171,3 @@ dtype: int64 dtype: int64 ``` 2. There are numerous other inconsistencies, which are solved by copy-on-write. Read more about them [here](https://phofl.github.io/cow-introduction.html). - - -## How to disable it - - -Copy-on-write can be disable by setting ``copy_on_write`` cudf option to ``False``: - -```python ->>> cudf.set_option("copy_on_write", False) -``` diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 353d894c64d..138dbb8607f 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -32,8 +32,7 @@ class CopyOnWriteBuffer(Buffer): This buffer enables making copies of data only when there is a write operation being performed. - See `Copy-on-write` section in `library_design.md` for - detailed information on `CopyOnWriteBuffer`. + See more here :ref:`copy-on-write-dev-doc`. Use the factory function `as_buffer` to create a CopyOnWriteBuffer instance. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 75db4db7312..02b2a3a0b17 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -545,6 +545,13 @@ def __init__( data = {} if not isinstance(data, ColumnBase): + # Using `getattr_static` to check if + # `data` is on device memory and perform + # a deep copy later. This is different + # from `hasattr` because, it doesn't + # invoke the property we are looking + # for and the later actually invokes + # the property. has_cai = ( type( inspect.getattr_static( diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index e44a36eedc2..a375d8236d6 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -252,7 +252,7 @@ def _integer_and_none_validator(val): """ If set to `False`, disables copy-on-write. If set to `True`, enables copy-on-write. - Read more at: :ref: copy-on-write-user-doc + Read more at: :ref:`copy-on-write-user-doc` \tValid values are True or False. Default is False. """ ), From 39e59c943f02f91120a4d09683d98109084d96f2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 30 Jan 2023 16:20:54 -0800 Subject: [PATCH 11/31] add coverage --- python/cudf/cudf/tests/test_copying.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1a3f9f6fd23..5962a968253 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -327,3 +327,29 @@ def test_series_cat_copy(copy_on_write): assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) + + +def test_dataframe_cow_slice_setitem(): + cudf.set_option("copy_on_write", True) + df = cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}) + slice_df = df[1:4] + + assert_eq( + slice_df, + cudf.DataFrame( + {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), + ) + + slice_df["a"][2] = 1111 + + assert_eq( + slice_df, + cudf.DataFrame( + {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), + ) + assert_eq( + df, + cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), + ) From 0cdec0347e023474bb4899c0f837ebf7f75a4be6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 30 Jan 2023 17:27:45 -0800 Subject: [PATCH 12/31] cleanup after runs --- python/cudf/cudf/tests/test_copying.py | 6 ++++++ python/cudf/cudf/tests/test_multiindex.py | 1 + 2 files changed, 7 insertions(+) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 5962a968253..8dbcbe10062 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -87,6 +87,7 @@ def test_series_setitem_cow_on(): new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + cudf.set_option("copy_on_write", False) def test_series_setitem_cow_off(): @@ -211,6 +212,7 @@ def test_multiple_series_cow(): del s3 assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + cudf.set_option("copy_on_write", False) def test_series_zero_copy_cow_on(): @@ -246,6 +248,7 @@ def test_series_zero_copy_cow_on(): s5[0:2] = 1 assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + cudf.set_option("copy_on_write", False) def test_series_zero_copy_cow_off(): @@ -305,6 +308,7 @@ def test_series_str_copy(copy_on_write): assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) + cudf.set_option("copy_on_write", False) @pytest.mark.parametrize("copy_on_write", [True, False]) @@ -327,6 +331,7 @@ def test_series_cat_copy(copy_on_write): assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) + cudf.set_option("copy_on_write", False) def test_dataframe_cow_slice_setitem(): @@ -353,3 +358,4 @@ def test_dataframe_cow_slice_setitem(): df, cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), ) + cudf.set_option("copy_on_write", False) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1dce6b88cc..7556a2a0048 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -848,6 +848,7 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + cudf.set_option("copy_on_write", False) @pytest.mark.parametrize( From 7cd81501e4823288cdedb230abdac7718804c7f3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 6 Feb 2023 12:40:11 -0600 Subject: [PATCH 13/31] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- docs/cudf/source/developer_guide/library_design.md | 10 +++++----- docs/cudf/source/user_guide/copy-on-write.md | 13 ++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 8a353005a4b..21f02b9a547 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -328,7 +328,7 @@ below. The core copy-on-write implementation relies on the `CopyOnWriteBuffer` class. When the cudf option `"copy_on_write"` is `True`, `as_buffer` will always return a `CopyOnWriteBuffer`. This subclass of `cudf.Buffer` contains all the mechanisms to enable copy-on-write behavior. -The class stores [weak references](https://docs.python.org/3/library/weakref.html) to every existing `CopyOnWriteBuffer` in `CopyOnWriteBuffer._instances`, a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriterBuffer` objects. +The class stores [weak references](https://docs.python.org/3/library/weakref.html) to every existing `CopyOnWriteBuffer` in `CopyOnWriteBuffer._instances`, a mapping from `ptr` keys to `WeakSet`s containing references to `CopyOnWriteBuffer` objects. This means that all `CopyOnWriteBuffer`s that point to the same device memory are contained in the same `WeakSet` (corresponding to the same `ptr` key) in `CopyOnWriteBuffer._instances`. This data structure is then used to determine whether or not to make a copy when a write operation is performed on a `Column` (see below). If multiple buffers point to the same underlying memory, then a copy must be made whenever a modification is attempted. @@ -336,7 +336,7 @@ If multiple buffers point to the same underlying memory, then a copy must be mad ### Eager copies when exposing to third-party libraries -If `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever +If a `Column`/`CopyOnWriteBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling `_unlink_shared_buffers` which ensures a true copy of underlying device data is made and unlinks the buffer from any shared "weak" references. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`CopyOnWriteBuffer` as @@ -369,8 +369,8 @@ also consider acquiring a spill lock (see `Buffer` [design](Buffer-design) for m ### Variable width data types Weak references are implemented only for fixed-width data types as these are only column types that can be mutated in place. -Deep copies of variable width data types return shallow-copies of the Columns, because these -types don't support real in-place mutations to the data. +Requests for deep copies of variable width data types always return shallow copies of the Columns, because these +types don't support real in-place mutation of the data. Internally, we mimic in-place mutations using `_mimic_inplace`, but the resulting data is always a deep copy of the underlying data. @@ -499,6 +499,6 @@ If we inspect the memory address of the data, the addresses of `s2` and `s3` rem 139796315879723 ``` -cudf's copy-on-write implementation is motivated by the pandas proposals documented here: +cuDF's copy-on-write implementation is motivated by the pandas proposals documented here: 1. [Google doc](https://docs.google.com/document/d/1ZCQ9mx3LBMy-nhwRl33_jgcvWo9IWdEfxDNQ2thyTb0/edit#heading=h.iexejdstiz8u) 2. [Github issue](https://github.com/pandas-dev/pandas/issues/36195) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index d8c102d2f46..6c937ee7ad8 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -2,14 +2,13 @@ # Copy-on-write -Copy-on-write is a memory management strategy that allows multiple cudf objects containing the same data to refer to the same memory address as long as neither of them modify the underlying data. +Copy-on-write is a memory management strategy that allows multiple cuDF objects containing the same data to refer to the same memory address as long as neither of them modify the underlying data. With this approach, any operation that generates an unmodified view of an object (such as copies, slices, or methods like `DataFrame.head`) returns a new object that points to the same memory as the original. -However, when either the new object is modified, a new copy of the data is made prior to the modification, ensuring that the changes do not propagate back to the original object. -The same behavior also works in the other direction, i.e. modifications of the original object will not propagate to the new object. +However, when either the existing or new object is _modified_, a copy of the data is made prior to the modification, ensuring that the changes do not propagate between the two objects. This behavior is best understood by looking at the examples below. -## How to enable it +## Enabling copy-on-write 1. Use `cudf.set_option`: @@ -25,10 +24,10 @@ launch of the Python interpreter: export CUDF_COPY_ON_WRITE="1" python -c "import cudf" ``` -## How to disable it +## Disabling copy-on-write -Copy-on-write can be disable by setting ``copy_on_write`` cudf option to ``False``: +Copy-on-write can be disabled by setting the ``copy_on_write`` option to ``False``: ```python >>> cudf.set_option("copy_on_write", False) @@ -83,7 +82,7 @@ dtype: int64 ## Notes -When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cudf will always trigger a copy and will not modify the original object. +When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cuDF will always trigger a copy and will not modify the original object. ### Explicit deep and shallow copies comparison From f474e79e6bff89e37b93b21a62731a1ab1dadef4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 8 Feb 2023 12:37:47 -0600 Subject: [PATCH 14/31] Update docs/cudf/source/user_guide/copy-on-write.md Co-authored-by: Lawrence Mitchell --- docs/cudf/source/user_guide/copy-on-write.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 6c937ee7ad8..9a0f098f649 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -7,7 +7,7 @@ With this approach, any operation that generates an unmodified view of an object However, when either the existing or new object is _modified_, a copy of the data is made prior to the modification, ensuring that the changes do not propagate between the two objects. This behavior is best understood by looking at the examples below. - +The default behaviour in cuDF is for copy-on-write to be disabled, so to use it, one must explicitly opt in by setting a cuDF option. This can be done operation by operation if this is required, but the most common use case is to enable copy-on-write for the duration of a script. ## Enabling copy-on-write 1. Use `cudf.set_option`: From 5a8ad614396f4cb122bc595e982675cd887a43bb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 8 Feb 2023 10:46:52 -0800 Subject: [PATCH 15/31] removed advantages title --- docs/cudf/source/user_guide/copy-on-write.md | 21 ++++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 9a0f098f649..65e309f2c39 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -8,6 +8,7 @@ However, when either the existing or new object is _modified_, a copy of the dat This behavior is best understood by looking at the examples below. The default behaviour in cuDF is for copy-on-write to be disabled, so to use it, one must explicitly opt in by setting a cuDF option. This can be done operation by operation if this is required, but the most common use case is to enable copy-on-write for the duration of a script. + ## Enabling copy-on-write 1. Use `cudf.set_option`: @@ -84,17 +85,6 @@ dtype: int64 When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cuDF will always trigger a copy and will not modify the original object. -### Explicit deep and shallow copies comparison - - -| | Copy-on-Write enabled | Copy-on-Write disabled (default) | -|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | -| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | - - -## Advantages - 1. Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: ```python @@ -170,3 +160,12 @@ dtype: int64 dtype: int64 ``` 2. There are numerous other inconsistencies, which are solved by copy-on-write. Read more about them [here](https://phofl.github.io/cow-introduction.html). + + +### Explicit deep and shallow copies comparison + + +| | Copy-on-Write enabled | Copy-on-Write disabled (default) | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `.copy(deep=True)` | A true copy is made and changes don't propagate to the original object. | A true copy is made and changes don't propagate to the original object. | +| `.copy(deep=False)` | Memory is shared between the two objects and but any write operation on one object will trigger a true physical copy before the write is performed. Hence changes will not propagate to the original object. | Memory is shared between the two objects and changes performed on one will propagate to the other object. | From 417883b04896c87fa26f015c78de2a5fe7747035 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 8 Feb 2023 11:32:33 -0800 Subject: [PATCH 16/31] address reviews --- python/cudf/cudf/_lib/column.pyx | 1 + python/cudf/cudf/tests/test_copying.py | 27 ++++++++++++++++++----- python/cudf/cudf/tests/test_index.py | 7 ++++-- python/cudf/cudf/tests/test_multiindex.py | 3 ++- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index d6aac597c17..2a163a795eb 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -522,6 +522,7 @@ cdef class Column: # CopyOnWriteBuffer reference to another column # and still create a weak reference. # With the current design that's not possible. + # https://github.com/rapidsai/cudf/issues/12734 data = data_owner.copy(deep=False) elif ( # This is an optimization of the most common case where diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 8dbcbe10062..bba31a184d4 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -55,7 +55,13 @@ def test_null_copy(): assert len(col) == 2049 +# TODO: Make use of set_option context manager +# once https://github.com/rapidsai/cudf/issues/12736 +# is resolved. + + def test_series_setitem_cow_on(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", True) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -87,10 +93,11 @@ def test_series_setitem_cow_on(): new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) def test_series_setitem_cow_off(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -122,9 +129,11 @@ def test_series_setitem_cow_off(): new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) + cudf.set_option("copy_on_write", original_cow_setting) def test_multiple_series_cow(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", True) s = cudf.Series([10, 20, 30, 40, 50]) s1 = s.copy(deep=False) @@ -212,10 +221,11 @@ def test_multiple_series_cow(): del s3 assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) def test_series_zero_copy_cow_on(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", True) s = cudf.Series([1, 2, 3, 4, 5]) s1 = s.copy(deep=False) @@ -248,10 +258,11 @@ def test_series_zero_copy_cow_on(): s5[0:2] = 1 assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) def test_series_zero_copy_cow_off(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", False) s = cudf.Series([1, 2, 3, 4, 5]) s1 = s.copy(deep=False) @@ -284,10 +295,12 @@ def test_series_zero_copy_cow_off(): s5[0:2] = 1 assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + cudf.set_option("copy_on_write", original_cow_setting) @pytest.mark.parametrize("copy_on_write", [True, False]) def test_series_str_copy(copy_on_write): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) s = cudf.Series(["a", "b", "c", "d", "e"]) s1 = s.copy(deep=True) @@ -308,11 +321,12 @@ def test_series_str_copy(copy_on_write): assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) @pytest.mark.parametrize("copy_on_write", [True, False]) def test_series_cat_copy(copy_on_write): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) s = cudf.Series([10, 20, 30, 40, 50], dtype="category") s1 = s.copy(deep=True) @@ -331,10 +345,11 @@ def test_series_cat_copy(copy_on_write): assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) def test_dataframe_cow_slice_setitem(): + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", True) df = cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}) slice_df = df[1:4] @@ -358,4 +373,4 @@ def test_dataframe_cow_slice_setitem(): df, cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), ) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 32d83bb1c83..f0b74ce70e7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -408,10 +408,12 @@ def test_index_copy_category(name, dtype, deep=True): cudf.CategoricalIndex(["a", "b", "c"]), ], ) -def test_index_copy_deep(idx, deep): +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_index_copy_deep(idx, deep, copy_on_write): """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", copy_on_write) if ( isinstance(idx, cudf.StringIndex) or not deep @@ -426,6 +428,7 @@ def test_index_copy_deep(idx, deep): assert_column_memory_eq(idx._values, idx_copy._values) else: assert_column_memory_ne(idx._values, idx_copy._values) + cudf.set_option("copy_on_write", original_cow_setting) @pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 7556a2a0048..9629b73da56 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -788,6 +788,7 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) @@ -848,7 +849,7 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - cudf.set_option("copy_on_write", False) + cudf.set_option("copy_on_write", original_cow_setting) @pytest.mark.parametrize( From 99967fc04d7e5c027b361baa0634bba2db6d5aad Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 9 Feb 2023 22:45:25 -0600 Subject: [PATCH 17/31] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/buffer/cow_buffer.py | 8 +++----- python/cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/series.py | 6 ++++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 138dbb8607f..229c98592a6 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -113,13 +113,11 @@ def get_ptr(self, mode: str = "write") -> int: Buffer.get_ptr SpillableBuffer.get_ptr """ - if mode == "read": - return self._ptr - elif mode == "write": + if mode == "write": self._unlink_shared_buffers() - return self._ptr - else: + elif mode != "read": raise ValueError(f"Incorrect mode passed : {mode}") + return self._ptr def copy(self, deep: bool = True): if deep or self._zero_copied: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index fdcc6797916..c71841a5a26 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -152,7 +152,7 @@ def __getitem__(self, i): class SpillableBuffer(Buffer): - """A spillable buffer that implements Buffer. + """A Buffer that supports spilling memory off the GPU to avoid OOMs. This buffer supports spilling the represented data to host memory. Spilling can be done manually by calling `.spill(target="cpu")` but diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 02b2a3a0b17..43b2f38ca46 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -550,8 +550,10 @@ def __init__( # a deep copy later. This is different # from `hasattr` because, it doesn't # invoke the property we are looking - # for and the later actually invokes - # the property. + # for and the latter actually invokes + # the property, which in this case could + # be expensive or mark a buffer as + # unspillable. has_cai = ( type( inspect.getattr_static( From 45e1fd18d2f663aacd638cec822b2d4b5ec61e42 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 9 Feb 2023 20:48:18 -0800 Subject: [PATCH 18/31] address reviews --- python/cudf/cudf/tests/test_copying.py | 61 ++++++++++++++++------- python/cudf/cudf/tests/test_multiindex.py | 4 +- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index bba31a184d4..1449edb1454 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -70,6 +70,25 @@ def test_series_setitem_cow_on(): assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + cudf.set_option("copy_on_write", original_cow_setting) + + +def test_series_setitem_cow_off(): + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", False) + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[1] = 100 + assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) + cudf.set_option("copy_on_write", original_cow_setting) + + +def test_series_setitem_both_slice_cow_on(): + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", True) + actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -80,10 +99,32 @@ def test_series_setitem_cow_on(): new_copy[slice(2, 4, 1)] = 300 assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + cudf.set_option("copy_on_write", original_cow_setting) + + +def test_series_setitem_both_slice_cow_off(): + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) + cudf.set_option("copy_on_write", original_cow_setting) + + +def test_series_setitem_partial_slice_cow_on(): + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", True) + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + new_copy[slice(2, 4, 1)] = 300 assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) @@ -96,30 +137,12 @@ def test_series_setitem_cow_on(): cudf.set_option("copy_on_write", original_cow_setting) -def test_series_setitem_cow_off(): +def test_series_setitem_partial_slice_cow_off(): original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) - actual[1] = 100 - assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) - - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[slice(0, 2, 1)] = 100 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) - - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - new_copy[slice(2, 4, 1)] = 300 assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 9629b73da56..14e3a2a1b9b 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -790,7 +790,6 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): """ original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", copy_on_write) - same_ref = (not deep) or (cudf.get_option("copy_on_write") and not deep) if isinstance(data, dict): import operator @@ -813,6 +812,9 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): + same_ref = (not deep) or ( + cudf.get_option("copy_on_write") and not deep + ) mi1 = data mi2 = mi1.copy(deep=deep) From 2e9eac740574dde2af9b206befe574570c61ad7b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 11 Feb 2023 15:12:50 -0800 Subject: [PATCH 19/31] add comments --- python/cudf/cudf/tests/test_copying.py | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 1449edb1454..37fb54f2b7a 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -156,6 +156,10 @@ def test_series_setitem_partial_slice_cow_off(): def test_multiple_series_cow(): + # Verify constructing, modifying, deleting + # multiple copies of a series preserves + # the data appropriately when COW is enabled. + original_cow_setting = cudf.get_option("copy_on_write") cudf.set_option("copy_on_write", True) s = cudf.Series([10, 20, 30, 40, 50]) @@ -167,17 +171,26 @@ def test_multiple_series_cow(): s6 = s3.copy(deep=False) s1[0:3] = 10000 + # s1 will be unlinked from actual data in s, + # and then modified. Rest all should + # contain the original data. assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) for ser in [s, s2, s3, s4, s5, s6]: assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) s6[0:3] = 3000 + # s6 will be unlinked from actual data in s, + # and then modified. Rest all should + # contain the original data. assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) for ser in [s2, s3, s4, s5]: assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) s2[1:4] = 4000 + # s2 will be unlinked from actual data in s, + # and then modified. Rest all should + # contain the original data. assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) @@ -185,6 +198,9 @@ def test_multiple_series_cow(): assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) s4[2:4] = 5000 + # s4 will be unlinked from actual data in s, + # and then modified. Rest all should + # contain the original data. assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) @@ -193,6 +209,9 @@ def test_multiple_series_cow(): assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) s5[2:4] = 6000 + # s5 will be unlinked from actual data in s, + # and then modified. Rest all should + # contain the original data. assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) @@ -204,6 +223,8 @@ def test_multiple_series_cow(): s7 = s5.copy(deep=False) assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) s7[1:3] = 55 + # Making a copy of s5, i.e., s7 and modifying shouldn't + # be touching/modifying data in other series. assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) @@ -213,6 +234,10 @@ def test_multiple_series_cow(): for ser in [s3]: assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) + # Deleting any of the following series objects + # shouldn't delete rest of the weekly referenced data + # else-where. + del s2 assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) @@ -254,11 +279,16 @@ def test_series_zero_copy_cow_on(): s1 = s.copy(deep=False) cp_array = cp.asarray(s) + # Ensure all original data & zero-copied + # data is same. assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) cp_array[0:3] = 10 + # Modifying a zero-copied array should only + # modify `s` and will leave rest of the copies + # un-touched. assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) @@ -266,8 +296,12 @@ def test_series_zero_copy_cow_on(): s2 = cudf.Series(cp_array) assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + s3 = s2.copy(deep=False) cp_array[0] = 20 + # Modifying a zero-copied array should modify + # `s2` and `s` only. Because `cp_array` + # is zero-copied shared with `s` & `s2`. assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) @@ -279,6 +313,8 @@ def test_series_zero_copy_cow_on(): s5 = cudf.Series(s4) assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) s5[0:2] = 1 + # Modifying `s5` should also modify `s4` + # because they are zero-copied. assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) cudf.set_option("copy_on_write", original_cow_setting) @@ -291,11 +327,16 @@ def test_series_zero_copy_cow_off(): s1 = s.copy(deep=False) cp_array = cp.asarray(s) + # Ensure all original data & zero-copied + # data is same. assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) cp_array[0:3] = 10 + # When COW is off, modifying a zero-copied array + # will need to modify `s` & `s1` since they are + # shallow copied. assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) @@ -306,6 +347,10 @@ def test_series_zero_copy_cow_off(): s3 = s2.copy(deep=False) cp_array[0] = 20 + # Modifying `cp_array`, will propagate the changes + # across all Series objects, because they are + # either shallow copied or zero-copied. + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) @@ -316,6 +361,9 @@ def test_series_zero_copy_cow_off(): s5 = cudf.Series(s4) assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) s5[0:2] = 1 + + # Modifying `s5` should also modify `s4` + # because they are zero-copied. assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) cudf.set_option("copy_on_write", original_cow_setting) From 26b1d87c188e47b5e2363ee377b7c0686f79f3f2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sun, 12 Feb 2023 22:54:01 -0800 Subject: [PATCH 20/31] address reviews --- docs/cudf/source/developer_guide/library_design.md | 3 +-- docs/cudf/source/user_guide/copy-on-write.md | 10 +++++++--- python/cudf/cudf/core/buffer/cow_buffer.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 21f02b9a547..7fa888b0fc2 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -363,8 +363,7 @@ with the buffer. If only read-only access is required (`mode="read"`), this indi that the caller has no intention of modifying the buffer through this pointer. In this case, any shallow copies are not unlinked. In contrast, if modification is required one may pass `mode="write"`, provoking unlinking of any shallow copies. -The same mechanism is also used for `SpillableBuffer`, though in that case one must -also consider acquiring a spill lock (see `Buffer` [design](Buffer-design) for more details). + ### Variable width data types Weak references are implemented only for fixed-width data types as these are only column diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 65e309f2c39..1cf518270d6 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -7,7 +7,12 @@ With this approach, any operation that generates an unmodified view of an object However, when either the existing or new object is _modified_, a copy of the data is made prior to the modification, ensuring that the changes do not propagate between the two objects. This behavior is best understood by looking at the examples below. -The default behaviour in cuDF is for copy-on-write to be disabled, so to use it, one must explicitly opt in by setting a cuDF option. This can be done operation by operation if this is required, but the most common use case is to enable copy-on-write for the duration of a script. +The default behaviour in cuDF is for copy-on-write to be disabled, so to use it, one must explicitly +opt in by setting a cuDF option. It is recommended to set the copy-on-write at the beginning of the +script execution, because when this setting is changed in middle of a script execution there will +be un-intended behavior where the objects created when copy-on-write is enabled will still have the +copy-on-write behavior whereas the objects created when copy-on-write is disabled will have +different behavior. ## Enabling copy-on-write @@ -85,7 +90,7 @@ dtype: int64 When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cuDF will always trigger a copy and will not modify the original object. -1. Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: +Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: ```python @@ -159,7 +164,6 @@ dtype: int64 4 5 dtype: int64 ``` -2. There are numerous other inconsistencies, which are solved by copy-on-write. Read more about them [here](https://phofl.github.io/cow-introduction.html). ### Explicit deep and shallow copies comparison diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 229c98592a6..7a3fb8ca612 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -133,8 +133,8 @@ def copy(self, deep: bool = True): @property def __cuda_array_interface__(self) -> dict: - # Unlink if there are any weak references. self._unlink_shared_buffers() + # Unlink if there are any weak references. # Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write # mechanism post this operation. From 36ecf22e099081db9df9d81a667c821cdd6a4e54 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Feb 2023 07:25:27 -0800 Subject: [PATCH 21/31] drop _readonly_proxy_cai_obj --- .../source/developer_guide/library_design.md | 5 ++-- python/cudf/cudf/core/buffer/buffer.py | 14 ---------- python/cudf/cudf/core/buffer/cow_buffer.py | 19 +------------ python/cudf/cudf/core/column/column.py | 28 +++++++++++++++++-- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 7fa888b0fc2..16b84476549 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -346,9 +346,8 @@ rather than a copy-on-write shallow copy with weak references. ### Obtaining a read-only object A read-only object can be quite useful for operations that will not -mutate the data. This can be achieved by calling `._readonly_proxy_cai_obj`. -This function will return a proxy object that has `__cuda_array_interface__` -implemented and will not trigger a deep copy even if the `CopyOnWriteBuffer` +mutate the data. This can be achieved by calling `._get_cuda_array_interface(readonly=True)`, and creating a `SimpleNameSpace` object around it. +This will not trigger a deep copy even if the `CopyOnWriteBuffer` has weak references. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index fb5675513ae..2262730d8a1 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -262,20 +262,6 @@ def _get_cuda_array_interface(self, readonly=False): "version": 0, } - @property - def _readonly_proxy_cai_obj(self): - """ - Returns a proxy object with a read-only CUDA Array Interface. - """ - return cuda_array_interface_wrapper( - ptr=self.get_ptr(mode="read"), - size=self.size, - owner=self, - readonly=True, - typestr="|u1", - version=0, - ) - def get_ptr(self, *, mode) -> int: """Device pointer to the start of the buffer. diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 7a3fb8ca612..006ba20d252 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -9,7 +9,7 @@ import rmm -from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.buffer import Buffer T = TypeVar("T", bound="CopyOnWriteBuffer") @@ -154,23 +154,6 @@ def _get_cuda_array_interface(self, readonly=False): "version": 0, } - @property - def _readonly_proxy_cai_obj(self) -> dict: - """ - Returns a proxy object with a read-only CUDA Array Interface. - - See `Copy-on-write` section in `library_design.md` for - more information on this API. - """ - return cuda_array_interface_wrapper( - ptr=self._ptr, - size=self.size, - owner=self, - readonly=True, - typestr="|u1", - version=0, - ) - def _unlink_shared_buffers(self): """ Unlinks a Buffer if it is shared with other buffers by diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8ea0446d4bd..f06b86f4da9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,12 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer +from cudf.core.buffer import ( + Buffer, + acquire_spill_lock, + as_buffer, + cuda_array_interface_wrapper, +) from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -137,7 +142,10 @@ def data_array_view( """ if self.data is not None: if mode == "read": - obj = self.data._readonly_proxy_cai_obj + obj = _proxy_cai_obj( + self.data._get_cuda_array_interface(readonly=True), + owner=self.data, + ) elif mode == "write": obj = self.data else: @@ -170,7 +178,10 @@ def mask_array_view( """ if self.mask is not None: if mode == "read": - obj = self.mask._readonly_proxy_cai_obj + obj = _proxy_cai_obj( + self.mask._get_cuda_array_interface(readonly=True), + owner=self.mask, + ) elif mode == "write": obj = self.mask else: @@ -2589,3 +2600,14 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) from e raise return col + + +def _proxy_cai_obj(cai, owner): + return cuda_array_interface_wrapper( + ptr=cai["data"][0], + size=cai["shape"][0], + owner=owner, + readonly=cai["data"][1], + typestr=cai["typestr"], + version=cai["version"], + ) From c3977b716d718df5e7a5ce2e3f4132a25ec53588 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Feb 2023 09:37:56 -0800 Subject: [PATCH 22/31] cleanup --- python/cudf/cudf/testing/_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 4e7b0a6e035..061bad1d44b 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,7 +15,6 @@ import cudf from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf.core.buffer import acquire_spill_lock from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils import dtypes as dtypeutils @@ -326,7 +325,6 @@ def does_not_raise(): yield -@acquire_spill_lock() def assert_column_memory_eq( lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase ): From 33f6d3bfbdb0ad01b913e972652673c5db8b97c9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Feb 2023 12:43:42 -0600 Subject: [PATCH 23/31] Update python/cudf/cudf/core/buffer/cow_buffer.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/buffer/cow_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/buffer/cow_buffer.py b/python/cudf/cudf/core/buffer/cow_buffer.py index 006ba20d252..16a1e3942e7 100644 --- a/python/cudf/cudf/core/buffer/cow_buffer.py +++ b/python/cudf/cudf/core/buffer/cow_buffer.py @@ -133,7 +133,6 @@ def copy(self, deep: bool = True): @property def __cuda_array_interface__(self) -> dict: - self._unlink_shared_buffers() # Unlink if there are any weak references. # Mark the Buffer as ``zero_copied=True``, # which will prevent any copy-on-write @@ -142,6 +141,7 @@ def __cuda_array_interface__(self) -> dict: # control over knowing if a third-party library # has modified the data this Buffer is # pointing to. + self._unlink_shared_buffers() self._zero_copied = True return self._get_cuda_array_interface(readonly=False) From 0412db68580c0b6a2116131da0ba5322c48d7a59 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Feb 2023 12:47:03 -0600 Subject: [PATCH 24/31] Update python/cudf/cudf/tests/test_copying.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/tests/test_copying.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 37fb54f2b7a..8765d2f199e 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -301,7 +301,7 @@ def test_series_zero_copy_cow_on(): cp_array[0] = 20 # Modifying a zero-copied array should modify # `s2` and `s` only. Because `cp_array` - # is zero-copied shared with `s` & `s2`. + # is zero-copy shared with `s` & `s2`. assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) From a74bb484faaed6a6d011bcaf2ba83ef8b4e4866c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Feb 2023 12:47:13 -0600 Subject: [PATCH 25/31] Update python/cudf/cudf/tests/test_copying.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/tests/test_copying.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 8765d2f199e..fa398c617b0 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -288,7 +288,7 @@ def test_series_zero_copy_cow_on(): cp_array[0:3] = 10 # Modifying a zero-copied array should only # modify `s` and will leave rest of the copies - # un-touched. + # untouched. assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) From 1a2ec7bb537968e9bd42465d5fc5847c5efcfeb8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Feb 2023 12:47:28 -0600 Subject: [PATCH 26/31] Update python/cudf/cudf/tests/test_copying.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/tests/test_copying.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index fa398c617b0..864badabdc2 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -236,7 +236,7 @@ def test_multiple_series_cow(): # Deleting any of the following series objects # shouldn't delete rest of the weekly referenced data - # else-where. + # elsewhere. del s2 From 3c5ac1a2da219a04d2fbc465480e5909a18efb15 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Feb 2023 10:58:41 -0800 Subject: [PATCH 27/31] use contextmanager --- python/cudf/cudf/tests/test_copying.py | 62 ++++++++++---------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 864badabdc2..d485912f7b7 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,5 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import contextlib + import cupy as cp import numpy as np import pandas as pd @@ -10,6 +12,17 @@ from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +# TODO: Make use of set_option context manager +# once https://github.com/rapidsai/cudf/issues/12736 +# is resolved. +@contextlib.contextmanager +def with_copy_on_write(on): + original_cow_setting = cudf.get_option("copy_on_write") + cudf.set_option("copy_on_write", on) + yield + cudf.set_option("copy_on_write", original_cow_setting) + + @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) def test_repeat(dtype): arr = np.random.rand(10) * 10 @@ -55,14 +68,8 @@ def test_null_copy(): assert len(col) == 2049 -# TODO: Make use of set_option context manager -# once https://github.com/rapidsai/cudf/issues/12736 -# is resolved. - - +@with_copy_on_write(on=True) def test_series_setitem_cow_on(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -70,25 +77,19 @@ def test_series_setitem_cow_on(): assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - cudf.set_option("copy_on_write", original_cow_setting) - +@with_copy_on_write(on=False) def test_series_setitem_cow_off(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) actual[1] = 100 assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=True) def test_series_setitem_both_slice_cow_on(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) - actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -99,13 +100,10 @@ def test_series_setitem_both_slice_cow_on(): new_copy[slice(2, 4, 1)] = 300 assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=False) def test_series_setitem_both_slice_cow_off(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", False) - actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -116,12 +114,10 @@ def test_series_setitem_both_slice_cow_off(): new_copy[slice(2, 4, 1)] = 300 assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=True) def test_series_setitem_partial_slice_cow_on(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -134,12 +130,10 @@ def test_series_setitem_partial_slice_cow_on(): new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=False) def test_series_setitem_partial_slice_cow_off(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", False) actual = cudf.Series([1, 2, 3, 4, 5]) new_copy = actual.copy(deep=False) @@ -152,16 +146,13 @@ def test_series_setitem_partial_slice_cow_off(): new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=True) def test_multiple_series_cow(): # Verify constructing, modifying, deleting # multiple copies of a series preserves # the data appropriately when COW is enabled. - - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) s = cudf.Series([10, 20, 30, 40, 50]) s1 = s.copy(deep=False) s2 = s.copy(deep=False) @@ -269,12 +260,10 @@ def test_multiple_series_cow(): del s3 assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=True) def test_series_zero_copy_cow_on(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) s = cudf.Series([1, 2, 3, 4, 5]) s1 = s.copy(deep=False) cp_array = cp.asarray(s) @@ -317,12 +306,10 @@ def test_series_zero_copy_cow_on(): # because they are zero-copied. assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=False) def test_series_zero_copy_cow_off(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", False) s = cudf.Series([1, 2, 3, 4, 5]) s1 = s.copy(deep=False) cp_array = cp.asarray(s) @@ -366,7 +353,6 @@ def test_series_zero_copy_cow_off(): # because they are zero-copied. assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - cudf.set_option("copy_on_write", original_cow_setting) @pytest.mark.parametrize("copy_on_write", [True, False]) @@ -419,9 +405,8 @@ def test_series_cat_copy(copy_on_write): cudf.set_option("copy_on_write", original_cow_setting) +@with_copy_on_write(on=True) def test_dataframe_cow_slice_setitem(): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", True) df = cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}) slice_df = df[1:4] @@ -444,4 +429,3 @@ def test_dataframe_cow_slice_setitem(): df, cudf.DataFrame({"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]}), ) - cudf.set_option("copy_on_write", original_cow_setting) From 4ba71d3c3dfd3d11e7f3002dc0757ef1c20ab7a0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Feb 2023 11:02:50 -0800 Subject: [PATCH 28/31] add comment --- python/cudf/cudf/core/column/column.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f06b86f4da9..b803efc2068 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2603,6 +2603,11 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: def _proxy_cai_obj(cai, owner): + """ + Returns a proxy CAI SimpleNameSpace wrapped + with the provided `cai` as `__cuda_array_interface__` + and owner as `owner` to keep the object alive. + """ return cuda_array_interface_wrapper( ptr=cai["data"][0], size=cai["shape"][0], From 6edd00310fc9c7dcd142adec6aca5e5bafce0746 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 13 Feb 2023 11:04:10 -0800 Subject: [PATCH 29/31] add comment --- python/cudf/cudf/core/column/column.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b803efc2068..965b413e84f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2607,6 +2607,9 @@ def _proxy_cai_obj(cai, owner): Returns a proxy CAI SimpleNameSpace wrapped with the provided `cai` as `__cuda_array_interface__` and owner as `owner` to keep the object alive. + This is an internal utility for `data_array_view` + and `mask_array_view` where an object with + read-only CAI is required. """ return cuda_array_interface_wrapper( ptr=cai["data"][0], From 518ea66cef71d9d1f00255b30ead316a81a34538 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 16 Feb 2023 08:03:07 -0800 Subject: [PATCH 30/31] update docs --- docs/cudf/source/user_guide/copy-on-write.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index 1cf518270d6..a4d9b549b79 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -88,7 +88,11 @@ dtype: int64 ## Notes -When copy-on-write is enabled, there is no concept of views. Modifying any view created inside cuDF will always trigger a copy and will not modify the original object. +When copy-on-write is enabled, there is no longer a concept of views when +slicing or indexing. In this sense indexing behaves as one would expect for +builtin Python containers like `lists`, rather than indexing `numpy arrays`. +Modifying a "view" created by cuDF will always trigger a copy and will not +modify the original object. Copy-on-write produces much more consistent copy semantics. Since every object is a copy of the original, users no longer have to think about when modifications may unexpectedly happen in place. This will bring consistency across operations and bring cudf and pandas behavior into alignment when copy-on-write is enabled for both. Here is one example where pandas and cudf are currently inconsistent without copy-on-write enabled: From 5ed99ad3707a5024c5cbdd6b183efce6a7cfe17f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 16 Feb 2023 08:04:07 -0800 Subject: [PATCH 31/31] typo --- docs/cudf/source/user_guide/copy-on-write.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/user_guide/copy-on-write.md b/docs/cudf/source/user_guide/copy-on-write.md index a4d9b549b79..fff8cafcd95 100644 --- a/docs/cudf/source/user_guide/copy-on-write.md +++ b/docs/cudf/source/user_guide/copy-on-write.md @@ -89,8 +89,8 @@ dtype: int64 ## Notes When copy-on-write is enabled, there is no longer a concept of views when -slicing or indexing. In this sense indexing behaves as one would expect for -builtin Python containers like `lists`, rather than indexing `numpy arrays`. +slicing or indexing. In this sense, indexing behaves as one would expect for +built-in Python containers like `lists`, rather than indexing `numpy arrays`. Modifying a "view" created by cuDF will always trigger a copy and will not modify the original object.