Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Cython/Python copy_to_host and to_device #268

Merged
merged 32 commits into from
Feb 3, 2020
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
de7c128
Adjust `copy_to_host` to take pointer directly
jakirkham Jan 30, 2020
93742ea
Rename `copy_to_host` in Cython
jakirkham Jan 30, 2020
7d7b987
Add Cython wrapper around `copy_to_host`
jakirkham Jan 30, 2020
cd2f0dc
Call Cython `copy_to_host` wrapper in `tobytes`
jakirkham Jan 30, 2020
bfb828a
Workaround assignment issue with empty strings
jakirkham Jan 30, 2020
71f9e3a
Skip copy `rmm::device_buffer` when it is trivial
jakirkham Jan 30, 2020
f51bd64
Add cudaMemcpyKind and cudaMemcpyAsync in Cython
jakirkham Jan 30, 2020
24faec8
Call cudaMemcpyAsync in `copy_to_host`
jakirkham Jan 30, 2020
05dd6bf
Drop unused import
jakirkham Jan 30, 2020
76b6b03
Drop `copy_to_host` from C++
jakirkham Jan 30, 2020
d21f7aa
Note Cython/Python `copy_to_host` function
jakirkham Jan 30, 2020
7e1a15f
Add `copy_to_host` method to `DeviceBuffer`
jakirkham Jan 30, 2020
d5c775f
Call `copy_to_host` method in `tobytes`
jakirkham Jan 30, 2020
80bd86c
Test round-trip `DeviceBuffer` with output buffers
jakirkham Jan 30, 2020
6142676
Make `copy_to_host`'s `stream` argument optional
jakirkham Jan 30, 2020
70bea9f
Make `black` happy
jakirkham Jan 30, 2020
7f996a7
Update python/rmm/_lib/device_buffer.pyx
jakirkham Jan 30, 2020
38c04b1
Rename `frombytes` to `to_device`
jakirkham Jan 30, 2020
2c581f0
Drop explicit synchronization when copying
jakirkham Jan 30, 2020
50d48e5
Refactor out `to_device` function
jakirkham Jan 30, 2020
0888170
Update changelog entry
jakirkham Jan 31, 2020
f50da49
Comment on NumPy's use of huge pages
jakirkham Jan 31, 2020
a0eee55
Synchronize on the default stream
jakirkham Jan 31, 2020
3f9de93
Merge rapidsai/branch-0.13 into jakirkham/add_cy_copy_to_host
jakirkham Jan 31, 2020
71fc46e
Fix error message to say "bytes-like"
jakirkham Jan 31, 2020
480a3fa
Drop `RuntimeWarning` for larger buffers
jakirkham Jan 31, 2020
db4be74
Rename `copy_to_host` func to `copy_ptr_to_host`
jakirkham Jan 31, 2020
f75ebaf
Just pass the subselection to `copy_ptr_to_host`
jakirkham Jan 31, 2020
17c7bb9
Merge rapidsai/branch-0.13 into jakirkham/add_cy_copy_to_host
jakirkham Feb 3, 2020
c9c4cfe
Add docstrings to new functions/methods
jakirkham Feb 3, 2020
02c45da
Match Numba signature with `ary`
jakirkham Feb 3, 2020
3f6d997
Verify `copy_to_host` returns the same object
jakirkham Feb 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- PR #258 Define pickling behavior for `DeviceBuffer`
- PR #261 Add `__bytes__` method to `DeviceBuffer`
- PR #266 Drop `rmm.auto_device`
- PR #268 Add Cython/Python `copy_to_host` and `to_device`

## Improvements

Expand Down
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT

add_library(rmm SHARED
src/rmm.cpp
src/device_buffer.cpp
src/memory_manager.cpp
thirdparty/cnmem/src/cnmem.cpp
src/mr/default_memory_resource.cpp)
Expand Down
13 changes: 0 additions & 13 deletions include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,17 +392,4 @@ class device_buffer {
mr::get_default_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
};

/**--------------------------------------------------------------------------*
* @brief Copies rmm::device_buffer to a preallocated host buffer.
*
* Copies device memory asynchronously on the specified stream
*
* @throws std::runtime_error if `hb` is `nullptr` or copy fails
*
* @param db `rmm::device_buffer` to copy to host
* @param hb host allocated buffer to copy data to
* @param stream CUDA stream on which the device to host copy will be performed
*-------------------------------------------------------------------------**/
void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0);
} // namespace rmm
15 changes: 9 additions & 6 deletions python/rmm/_lib/device_buffer.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from libcpp.memory cimport unique_ptr
from libc.stdint cimport uintptr_t

from rmm._lib.lib cimport cudaStream_t
from rmm._lib.lib cimport cudaStream_t, cudaMemcpyAsync, cudaMemcpyDeviceToHost

cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
cdef cppclass device_buffer:
Expand All @@ -37,10 +37,6 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
size_t size()
size_t capacity()

void copy_to_host(const device_buffer& db, void* hb) except +
void copy_to_host(const device_buffer& db,
void* hb,
cudaStream_t stream) except +

cdef class DeviceBuffer:
cdef unique_ptr[device_buffer] c_obj
Expand All @@ -49,8 +45,9 @@ cdef class DeviceBuffer:
cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr)

@staticmethod
cdef DeviceBuffer c_frombytes(const unsigned char[::1] b,
cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
uintptr_t stream=*)
cpdef copy_to_host(self, unsigned char[::1] hb=*, uintptr_t stream=*)
cpdef bytes tobytes(self, uintptr_t stream=*)

cdef size_t c_size(self)
Expand All @@ -59,5 +56,11 @@ cdef class DeviceBuffer:
cdef void* c_data(self)


cpdef DeviceBuffer to_device(const unsigned char[::1] b, uintptr_t stream=*)
cpdef void copy_ptr_to_host(uintptr_t db,
unsigned char[::1] hb,
uintptr_t stream=*) nogil except *


cdef extern from "<utility>" namespace "std" nogil:
cdef unique_ptr[device_buffer] move(unique_ptr[device_buffer])
93 changes: 69 additions & 24 deletions python/rmm/_lib/device_buffer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
# cython: embedsignature = True
# cython: language_level = 3


import numpy as np

from libcpp.memory cimport unique_ptr
from libc.stdint cimport uintptr_t

Expand Down Expand Up @@ -71,7 +74,7 @@ cdef class DeviceBuffer:
return self.tobytes()

def __setstate__(self, state):
cdef DeviceBuffer other = DeviceBuffer.c_frombytes(state)
cdef DeviceBuffer other = DeviceBuffer.c_to_device(state)
self.c_obj = move(other.c_obj)

@property
Expand All @@ -92,39 +95,42 @@ cdef class DeviceBuffer:
return buf

@staticmethod
@cython.boundscheck(False)
cdef DeviceBuffer c_frombytes(const unsigned char[::1] b,
cdef DeviceBuffer c_to_device(const unsigned char[::1] b,
uintptr_t stream=0):
if b is None:
raise TypeError(
"Argument 'b' has incorrect type"
" (expected bytes, got NoneType)"
return to_device(b, stream)

@staticmethod
def to_device(const unsigned char[::1] b, uintptr_t stream=0):
return to_device(b, stream)

cpdef copy_to_host(self, unsigned char[::1] hb=None, uintptr_t stream=0):
cdef const device_buffer* dbp = self.c_obj.get()
cdef size_t s = dbp.size()

if hb is None:
# NumPy leverages huge pages under-the-hood,
# which speeds up the copy from device to host.
hb = np.empty((s,), dtype="u1")
elif len(hb) < s:
raise ValueError(
"Argument `hb` is too small. Need space for %i bytes." % s
)
elif len(hb) > s:
hb = hb[:s]

cdef uintptr_t p = <uintptr_t>&b[0]
cdef size_t s = len(b)
return DeviceBuffer(ptr=p, size=s, stream=stream)
with nogil:
copy_ptr_to_host(<uintptr_t>dbp.data(), hb, stream)

@staticmethod
def frombytes(const unsigned char[::1] b, uintptr_t stream=0):
return DeviceBuffer.c_frombytes(b, stream)
return hb

cpdef bytes tobytes(self, uintptr_t stream=0):
cdef const device_buffer* dbp = self.c_obj.get()
cdef size_t s = dbp.size()
if s == 0:
return b""

cdef bytes b = PyBytes_FromStringAndSize(NULL, s)
cdef void* p = <void*>PyBytes_AS_STRING(b)
cdef cudaStream_t c_stream
cdef cudaError_t err
with nogil:
c_stream = <cudaStream_t>stream
copy_to_host(dbp[0], p, c_stream)
err = cudaStreamSynchronize(c_stream)
if err != cudaSuccess:
raise RuntimeError(f"Stream sync failed with error: {err}")
cdef unsigned char* p = <unsigned char*>PyBytes_AS_STRING(b)
cdef unsigned char[::1] mv = (<unsigned char[:(s + 1):1]>p)[:s]
self.copy_to_host(mv, stream)

return b

Expand All @@ -139,3 +145,42 @@ cdef class DeviceBuffer:

cdef void* c_data(self):
return self.c_obj.get()[0].data()


@cython.boundscheck(False)
cpdef DeviceBuffer to_device(const unsigned char[::1] b, uintptr_t stream=0):
if b is None:
raise TypeError(
"Argument 'b' has incorrect type"
" (expected bytes-like, got NoneType)"
)

cdef uintptr_t p = <uintptr_t>&b[0]
cdef size_t s = len(b)
return DeviceBuffer(ptr=p, size=s, stream=stream)


@cython.boundscheck(False)
cpdef void copy_ptr_to_host(uintptr_t db,
unsigned char[::1] hb,
uintptr_t stream=0) nogil except *:
if hb is None:
with gil:
raise TypeError(
"Argument `hb` has incorrect type"
" (expected bytes-like, got NoneType)"
)

cdef cudaError_t err

err = cudaMemcpyAsync(<void*>&hb[0], <const void*>db, len(hb),
cudaMemcpyDeviceToHost, <cudaStream_t>stream)
if err != cudaSuccess:
with gil:
raise RuntimeError(f"Memcpy failed with error: {err}")

if stream == 0:
err = cudaStreamSynchronize(<cudaStream_t>stream)
if err != cudaSuccess:
with gil:
raise RuntimeError(f"Stream sync failed with error: {err}")
11 changes: 11 additions & 0 deletions python/rmm/_lib/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ cdef extern from * nogil:

ctypedef void* cudaStream_t "cudaStream_t"

ctypedef enum cudaMemcpyKind "cudaMemcpyKind":
cudaMemcpyHostToHost = 0
cudaMemcpyHostToDevice = 1
cudaMemcpyDeviceToHost = 2
cudaMemcpyDeviceToDevice = 3
cudaMemcpyDefault = 4

cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
cudaMemcpyKind kind)
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
cudaMemcpyKind kind, cudaStream_t stream)
cudaError_t cudaStreamSynchronize(cudaStream_t stream)


Expand Down
40 changes: 33 additions & 7 deletions python/rmm/tests/test_rmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def test_rmm_device_buffer(size):
assert len(s) == len(b)

# Test conversion from bytes
b2 = rmm.DeviceBuffer.frombytes(s)
b2 = rmm.DeviceBuffer.to_device(s)
assert isinstance(b2, rmm.DeviceBuffer)
assert len(b2) == len(s)

Expand All @@ -124,6 +124,32 @@ def test_rmm_device_buffer(size):
assert b.capacity() >= b.size


@pytest.mark.parametrize(
"hb",
[
b"abc",
bytearray(b"abc"),
memoryview(b"abc"),
np.asarray(memoryview(b"abc")),
np.arange(3, dtype="u1"),
],
)
def test_rmm_device_buffer_memoryview_roundtrip(hb):
mv = memoryview(hb)
db = rmm.DeviceBuffer.to_device(hb)
hb2 = db.copy_to_host()
mv2 = memoryview(hb2)
assert mv == mv2
hb3 = bytearray(mv.nbytes)
hb3 = db.copy_to_host(hb3)
mv3 = memoryview(hb3)
assert mv == mv3
hb4 = np.empty_like(mv)
hb4 = db.copy_to_host(hb4)
mv4 = memoryview(hb4)
assert mv == mv4


@pytest.mark.parametrize(
"hb",
[
Expand All @@ -146,19 +172,19 @@ def test_rmm_device_buffer_bytes_roundtrip(hb):
mv = memoryview(hb)
except TypeError:
with pytest.raises(TypeError):
rmm.DeviceBuffer.frombytes(hb)
rmm.DeviceBuffer.to_device(hb)
else:
if mv.format != "B":
with pytest.raises(ValueError):
rmm.DeviceBuffer.frombytes(hb)
rmm.DeviceBuffer.to_device(hb)
elif len(mv.strides) != 1:
with pytest.raises(ValueError):
rmm.DeviceBuffer.frombytes(hb)
rmm.DeviceBuffer.to_device(hb)
elif mv.strides[0] != 1:
with pytest.raises(ValueError):
rmm.DeviceBuffer.frombytes(hb)
rmm.DeviceBuffer.to_device(hb)
else:
db = rmm.DeviceBuffer.frombytes(hb)
db = rmm.DeviceBuffer.to_device(hb)
hb2 = db.tobytes()
mv2 = memoryview(hb2)
assert mv == mv2
Expand All @@ -169,7 +195,7 @@ def test_rmm_device_buffer_bytes_roundtrip(hb):

@pytest.mark.parametrize("hb", [b"", b"123", b"abc"])
def test_rmm_device_buffer_pickle_roundtrip(hb):
db = rmm.DeviceBuffer.frombytes(hb)
db = rmm.DeviceBuffer.to_device(hb)
pb = pickle.dumps(db)
del db
db2 = pickle.loads(pb)
Expand Down
33 changes: 0 additions & 33 deletions src/device_buffer.cpp

This file was deleted.

9 changes: 0 additions & 9 deletions tests/device_buffer_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,6 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) {
EXPECT_EQ(cudaSuccess, cudaFree(device_memory));
}

TYPED_TEST(DeviceBufferTest, CopyToRawHostPointer) {
rmm::device_buffer buff(this->size);
std::vector<uint8_t> host_data(this->size);
uint8_t* host_data_ptr = host_data.data();
rmm::copy_to_host(buff, host_data_ptr);
EXPECT_EQ(0, buff.stream());
// TODO check for equality between the contents of the two allocations
}

TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) {
std::vector<uint8_t> host_data(this->size);
rmm::device_buffer buff(static_cast<void*>(host_data.data()), this->size);
Expand Down