diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ffd01a4b..dba5199c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - PR #261 Add `__bytes__` method to `DeviceBuffer` - PR #262 Moved device memory resource files to `mr/device` directory - PR #266 Drop `rmm.auto_device` +- PR #268 Add Cython/Python `copy_to_host` and `to_device` ## Improvements diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a3ee22d0..223f9906d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,7 +109,6 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT add_library(rmm SHARED src/rmm.cpp - src/device_buffer.cpp src/memory_manager.cpp thirdparty/cnmem/src/cnmem.cpp src/mr/device/default_memory_resource.cpp) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 37a338a86..cbe1abb77 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -392,17 +392,4 @@ class device_buffer { mr::get_default_resource()}; ///< The memory resource used to ///< allocate/deallocate device memory }; - -/**--------------------------------------------------------------------------* - * @brief Copies rmm::device_buffer to a preallocated host buffer. - * - * Copies device memory asynchronously on the specified stream - * - * @throws std::runtime_error if `hb` is `nullptr` or copy fails - * - * @param db `rmm::device_buffer` to copy to host - * @param hb host allocated buffer to copy data to - * @param stream CUDA stream on which the device to host copy will be performed - *-------------------------------------------------------------------------**/ -void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0); } // namespace rmm diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 96745e811..70f21852d 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -20,7 +20,7 @@ from libcpp.memory cimport unique_ptr from libc.stdint cimport uintptr_t -from rmm._lib.lib cimport cudaStream_t +from rmm._lib.lib cimport cudaStream_t, cudaMemcpyAsync, cudaMemcpyDeviceToHost cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: cdef cppclass device_buffer: @@ -37,10 +37,6 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: size_t size() size_t capacity() - void copy_to_host(const device_buffer& db, void* hb) except + - void copy_to_host(const device_buffer& db, - void* hb, - cudaStream_t stream) except + cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj @@ -49,8 +45,9 @@ cdef class DeviceBuffer: cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr) @staticmethod - cdef DeviceBuffer c_frombytes(const unsigned char[::1] b, + cdef DeviceBuffer c_to_device(const unsigned char[::1] b, uintptr_t stream=*) + cpdef copy_to_host(self, ary=*, uintptr_t stream=*) cpdef bytes tobytes(self, uintptr_t stream=*) cdef size_t c_size(self) @@ -59,5 +56,11 @@ cdef class DeviceBuffer: cdef void* c_data(self) +cpdef DeviceBuffer to_device(const unsigned char[::1] b, uintptr_t stream=*) +cpdef void copy_ptr_to_host(uintptr_t db, + unsigned char[::1] hb, + uintptr_t stream=*) nogil except * + + cdef extern from "" namespace "std" nogil: cdef unique_ptr[device_buffer] move(unique_ptr[device_buffer]) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 28b817d8b..855d50f97 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -17,6 +17,9 @@ # cython: embedsignature = True # cython: language_level = 3 + +import numpy as np + from libcpp.memory cimport unique_ptr from libc.stdint cimport uintptr_t @@ -71,7 +74,7 @@ cdef class DeviceBuffer: return self.tobytes() def __setstate__(self, state): - cdef DeviceBuffer other = DeviceBuffer.c_frombytes(state) + cdef DeviceBuffer other = DeviceBuffer.c_to_device(state) self.c_obj = move(other.c_obj) @property @@ -92,39 +95,62 @@ cdef class DeviceBuffer: return buf @staticmethod - @cython.boundscheck(False) - cdef DeviceBuffer c_frombytes(const unsigned char[::1] b, + cdef DeviceBuffer c_to_device(const unsigned char[::1] b, uintptr_t stream=0): - if b is None: - raise TypeError( - "Argument 'b' has incorrect type" - " (expected bytes, got NoneType)" + """Calls ``to_device`` function on arguments provided""" + return to_device(b, stream) + + @staticmethod + def to_device(const unsigned char[::1] b, uintptr_t stream=0): + """Calls ``to_device`` function on arguments provided""" + return to_device(b, stream) + + cpdef copy_to_host(self, ary=None, uintptr_t stream=0): + """Copy from a ``DeviceBuffer`` to a buffer on host + + Parameters + ---------- + hb : ``bytes``-like buffer to write into + stream : CUDA stream to use for copying, default 0 + + Examples + -------- + >>> import rmm + >>> db = rmm.DeviceBuffer.to_device(b"abc") + >>> hb = bytearray(db.nbytes) + >>> db.copy_to_host(hb) + >>> print(hb) + bytearray(b'abc') + >>> hb = db.copy_to_host() + >>> print(hb) + bytearray(b'abc') + """ + cdef const device_buffer* dbp = self.c_obj.get() + cdef size_t s = dbp.size() + + cdef unsigned char[::1] hb = ary + if hb is None: + # NumPy leverages huge pages under-the-hood, + # which speeds up the copy from device to host. + hb = ary = np.empty((s,), dtype="u1") + elif len(hb) < s: + raise ValueError( + "Argument `hb` is too small. Need space for %i bytes." % s ) - cdef uintptr_t p = &b[0] - cdef size_t s = len(b) - return DeviceBuffer(ptr=p, size=s, stream=stream) + with nogil: + copy_ptr_to_host(dbp.data(), hb[:s], stream) - @staticmethod - def frombytes(const unsigned char[::1] b, uintptr_t stream=0): - return DeviceBuffer.c_frombytes(b, stream) + return ary cpdef bytes tobytes(self, uintptr_t stream=0): cdef const device_buffer* dbp = self.c_obj.get() cdef size_t s = dbp.size() - if s == 0: - return b"" cdef bytes b = PyBytes_FromStringAndSize(NULL, s) - cdef void* p = PyBytes_AS_STRING(b) - cdef cudaStream_t c_stream - cdef cudaError_t err - with nogil: - c_stream = stream - copy_to_host(dbp[0], p, c_stream) - err = cudaStreamSynchronize(c_stream) - if err != cudaSuccess: - raise RuntimeError(f"Stream sync failed with error: {err}") + cdef unsigned char* p = PyBytes_AS_STRING(b) + cdef unsigned char[::1] mv = (p)[:s] + self.copy_to_host(mv, stream) return b @@ -139,3 +165,79 @@ cdef class DeviceBuffer: cdef void* c_data(self): return self.c_obj.get()[0].data() + + +@cython.boundscheck(False) +cpdef DeviceBuffer to_device(const unsigned char[::1] b, uintptr_t stream=0): + """Return a new ``DeviceBuffer`` with a copy of the data + + Parameters + ---------- + b : ``bytes``-like data on host to copy to device + stream : CUDA stream to use for copying, default 0 + + Returns + ------- + ``DeviceBuffer`` with copy of data from host + + Examples + -------- + >>> import rmm + >>> db = rmm._lib.device_buffer.to_device(b"abc") + >>> print(bytes(db)) + b'abc' + """ + + if b is None: + raise TypeError( + "Argument 'b' has incorrect type" + " (expected bytes-like, got NoneType)" + ) + + cdef uintptr_t p = &b[0] + cdef size_t s = len(b) + return DeviceBuffer(ptr=p, size=s, stream=stream) + + +@cython.boundscheck(False) +cpdef void copy_ptr_to_host(uintptr_t db, + unsigned char[::1] hb, + uintptr_t stream=0) nogil except *: + """Copy from a device pointer to a buffer on host + + Parameters + ---------- + db : pointer to data on device to copy + hb : ``bytes``-like buffer to write into + stream : CUDA stream to use for copying, default 0 + + Examples + -------- + >>> import rmm + >>> db = rmm.DeviceBuffer.to_device(b"abc") + >>> hb = bytearray(db.nbytes) + >>> rmm._lib.device_buffer.copy_ptr_to_host(db.ptr, hb) + >>> print(hb) + bytearray(b'abc') + """ + + if hb is None: + with gil: + raise TypeError( + "Argument `hb` has incorrect type" + " (expected bytes-like, got NoneType)" + ) + + cdef cudaError_t err + + err = cudaMemcpyAsync(&hb[0], db, len(hb), + cudaMemcpyDeviceToHost, stream) + if err != cudaSuccess: + with gil: + raise RuntimeError(f"Memcpy failed with error: {err}") + + if stream == 0: + err = cudaStreamSynchronize(stream) + if err != cudaSuccess: + with gil: + raise RuntimeError(f"Stream sync failed with error: {err}") diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/_lib/lib.pxd index 5805bf44b..9f26326c0 100644 --- a/python/rmm/_lib/lib.pxd +++ b/python/rmm/_lib/lib.pxd @@ -33,6 +33,17 @@ cdef extern from * nogil: ctypedef void* cudaStream_t "cudaStream_t" + ctypedef enum cudaMemcpyKind "cudaMemcpyKind": + cudaMemcpyHostToHost = 0 + cudaMemcpyHostToDevice = 1 + cudaMemcpyDeviceToHost = 2 + cudaMemcpyDeviceToDevice = 3 + cudaMemcpyDefault = 4 + + cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, + cudaMemcpyKind kind) + cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, + cudaMemcpyKind kind, cudaStream_t stream) cudaError_t cudaStreamSynchronize(cudaStream_t stream) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 2c64aee94..ba79049ee 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -114,7 +114,7 @@ def test_rmm_device_buffer(size): assert len(s) == len(b) # Test conversion from bytes - b2 = rmm.DeviceBuffer.frombytes(s) + b2 = rmm.DeviceBuffer.to_device(s) assert isinstance(b2, rmm.DeviceBuffer) assert len(b2) == len(s) @@ -124,6 +124,35 @@ def test_rmm_device_buffer(size): assert b.capacity() >= b.size +@pytest.mark.parametrize( + "hb", + [ + b"abc", + bytearray(b"abc"), + memoryview(b"abc"), + np.asarray(memoryview(b"abc")), + np.arange(3, dtype="u1"), + ], +) +def test_rmm_device_buffer_memoryview_roundtrip(hb): + mv = memoryview(hb) + db = rmm.DeviceBuffer.to_device(hb) + hb2 = db.copy_to_host() + assert isinstance(hb2, np.ndarray) + mv2 = memoryview(hb2) + assert mv == mv2 + hb3a = bytearray(mv.nbytes) + hb3b = db.copy_to_host(hb3a) + assert hb3a is hb3b + mv3 = memoryview(hb3b) + assert mv == mv3 + hb4a = np.empty_like(mv) + hb4b = db.copy_to_host(hb4a) + assert hb4a is hb4b + mv4 = memoryview(hb4b) + assert mv == mv4 + + @pytest.mark.parametrize( "hb", [ @@ -146,19 +175,19 @@ def test_rmm_device_buffer_bytes_roundtrip(hb): mv = memoryview(hb) except TypeError: with pytest.raises(TypeError): - rmm.DeviceBuffer.frombytes(hb) + rmm.DeviceBuffer.to_device(hb) else: if mv.format != "B": with pytest.raises(ValueError): - rmm.DeviceBuffer.frombytes(hb) + rmm.DeviceBuffer.to_device(hb) elif len(mv.strides) != 1: with pytest.raises(ValueError): - rmm.DeviceBuffer.frombytes(hb) + rmm.DeviceBuffer.to_device(hb) elif mv.strides[0] != 1: with pytest.raises(ValueError): - rmm.DeviceBuffer.frombytes(hb) + rmm.DeviceBuffer.to_device(hb) else: - db = rmm.DeviceBuffer.frombytes(hb) + db = rmm.DeviceBuffer.to_device(hb) hb2 = db.tobytes() mv2 = memoryview(hb2) assert mv == mv2 @@ -169,7 +198,7 @@ def test_rmm_device_buffer_bytes_roundtrip(hb): @pytest.mark.parametrize("hb", [b"", b"123", b"abc"]) def test_rmm_device_buffer_pickle_roundtrip(hb): - db = rmm.DeviceBuffer.frombytes(hb) + db = rmm.DeviceBuffer.to_device(hb) pb = pickle.dumps(db) del db db2 = pickle.loads(pb) diff --git a/src/device_buffer.cpp b/src/device_buffer.cpp deleted file mode 100644 index b28cee70e..000000000 --- a/src/device_buffer.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -namespace rmm { -void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream) { - if (hb == nullptr) { - throw std::runtime_error{"Cannot copy to `nullptr`."}; - } - cudaError_t err = cudaMemcpyAsync(hb, - db.data(), - db.size(), - cudaMemcpyDeviceToHost, - stream); - if (err != cudaSuccess) { - throw std::runtime_error{"Failed to copy to host."}; - } -} -} // namespace rmm diff --git a/tests/device_buffer_tests.cpp b/tests/device_buffer_tests.cpp index 1fe5e159c..54af41c30 100644 --- a/tests/device_buffer_tests.cpp +++ b/tests/device_buffer_tests.cpp @@ -111,15 +111,6 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) { EXPECT_EQ(cudaSuccess, cudaFree(device_memory)); } -TYPED_TEST(DeviceBufferTest, CopyToRawHostPointer) { - rmm::device_buffer buff(this->size); - std::vector host_data(this->size); - uint8_t* host_data_ptr = host_data.data(); - rmm::copy_to_host(buff, host_data_ptr); - EXPECT_EQ(0, buff.stream()); - // TODO check for equality between the contents of the two allocations -} - TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) { std::vector host_data(this->size); rmm::device_buffer buff(static_cast(host_data.data()), this->size);