Add RMM PyTorch allocator #1168

Merged 18 commits on Jan 5, 2023
Changes from 11 commits
44 changes: 44 additions & 0 deletions README.md
@@ -732,3 +732,47 @@ This can be done in two ways:
**Note:** This only configures Numba to use the current RMM resource for allocations.
It does not initialize or change the current resource (e.g., by enabling a memory pool).
See [here](#memoryresource-objects) for more information on changing the current memory resource.
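
For reference, a minimal sketch of that two-step setup (set the resource first, then point Numba at it; this assumes the top-level `RMMNumbaManager` export available in this version of RMM):

```python
import rmm
from numba import cuda

# 1. Initialize or swap the current RMM resource first (e.g., a pool)...
rmm.mr.set_current_device_resource(
    rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
)

# 2. ...then tell Numba to allocate through whatever the current resource is
#    (do this before any Numba CUDA usage creates its context).
cuda.set_memory_manager(rmm.RMMNumbaManager)
```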

### Using RMM with PyTorch

[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM
for memory allocation. For example, to configure PyTorch to use an
RMM-managed pool:

```python
import rmm
import torch

rmm.reinitialize(pool_allocator=True)
torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
```

PyTorch and RMM will now share the same memory pool.

You can, of course, use a custom memory resource with PyTorch as well:

```python
import rmm
import torch

# configure RMM to use a managed memory resource, wrapped with a
# statistics resource adaptor that can report information about the
# amount of memory allocated:
mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.ManagedMemoryResource())
rmm.mr.set_current_device_resource(mr)

# configure PyTorch to use RMM for allocations:
torch.cuda.change_current_allocator(rmm.rmm_torch_allocator)

x = torch.tensor([1, 2]).cuda()

# the memory resource reports information about PyTorch allocations:
mr.allocation_counts
# {'current_bytes': 16,
#  'current_count': 1,
#  'peak_bytes': 16,
#  'peak_count': 1,
#  'total_bytes': 16,
#  'total_count': 1}
```
1 change: 1 addition & 0 deletions python/rmm/__init__.py
@@ -25,6 +25,7 @@
register_reinitialize_hook,
reinitialize,
rmm_cupy_allocator,
rmm_torch_allocator,
unregister_reinitialize_hook,
)

3 changes: 2 additions & 1 deletion python/rmm/_lib/CMakeLists.txt
@@ -12,7 +12,8 @@
# the License.
# =============================================================================

set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx)
set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx
torch_allocator.pyx)
set(linked_libraries rmm::rmm)

# Build all of the Cython targets
3 changes: 3 additions & 0 deletions python/rmm/_lib/memory_resource.pxd
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cuda.ccudart cimport cudaStream_t
from libc.stdint cimport int8_t
from libcpp.memory cimport shared_ptr
from libcpp.string cimport string
@@ -22,7 +23,9 @@ cdef extern from "rmm/mr/device/device_memory_resource.hpp" \
namespace "rmm::mr" nogil:
cdef cppclass device_memory_resource:
void* allocate(size_t bytes) except +
void* allocate(size_t bytes, cudaStream_t stream) except +
void deallocate(void* ptr, size_t bytes) except +
void deallocate(void* ptr, size_t bytes, cudaStream_t stream) except +

cdef class DeviceMemoryResource:
cdef shared_ptr[device_memory_resource] c_obj
29 changes: 5 additions & 24 deletions python/rmm/_lib/memory_resource.pyx
@@ -35,6 +35,10 @@ from rmm._cuda.gpu import (
)

from rmm._lib.cuda_stream_view cimport cuda_stream_view
from rmm._lib.per_device_resource cimport (
cuda_device_id,
set_per_device_resource as cpp_set_per_device_resource,
)

# Transparent handle of a C++ exception
ctypedef pair[int, string] CppExcept
@@ -212,29 +216,6 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \
) except +


cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:

cdef cppclass cuda_device_id:
ctypedef int value_type

cuda_device_id(value_type id)

value_type value()

cdef device_memory_resource* _set_current_device_resource \
"rmm::mr::set_current_device_resource" (device_memory_resource* new_mr)
cdef device_memory_resource* _get_current_device_resource \
"rmm::mr::get_current_device_resource" ()

cdef device_memory_resource* _set_per_device_resource \
"rmm::mr::set_per_device_resource" (
cuda_device_id id,
device_memory_resource* new_mr
)
cdef device_memory_resource* _get_per_device_resource \
"rmm::mr::get_per_device_resource"(cuda_device_id id)


cdef class DeviceMemoryResource:

cdef device_memory_resource* get_mr(self):
@@ -973,7 +954,7 @@ cpdef set_per_device_resource(int device, DeviceMemoryResource mr):
cdef unique_ptr[cuda_device_id] device_id = \
make_unique[cuda_device_id](device)

_set_per_device_resource(deref(device_id), mr.get_mr())
cpp_set_per_device_resource(deref(device_id), mr.get_mr())


cpdef set_current_device_resource(DeviceMemoryResource mr):
23 changes: 23 additions & 0 deletions python/rmm/_lib/per_device_resource.pxd
@@ -0,0 +1,23 @@
from rmm._lib.memory_resource cimport device_memory_resource


cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:
cdef cppclass cuda_device_id:
ctypedef int value_type

cuda_device_id(value_type id)

value_type value()

cdef extern from "rmm/mr/device/per_device_resource.hpp" \
namespace "rmm::mr" nogil:
cdef device_memory_resource* set_current_device_resource(
device_memory_resource* new_mr
)
cdef device_memory_resource* get_current_device_resource()
cdef device_memory_resource* set_per_device_resource(
cuda_device_id id, device_memory_resource* new_mr
)
cdef device_memory_resource* get_per_device_resource (
cuda_device_id id
)
19 changes: 19 additions & 0 deletions python/rmm/_lib/torch_allocator.pyx
@@ -0,0 +1,19 @@
from cuda.ccudart cimport cudaStream_t
from libc.stdint cimport uintptr_t
from libc.stdio cimport printf

from rmm._lib.memory_resource cimport device_memory_resource
from rmm._lib.per_device_resource cimport get_current_device_resource


cdef public void* allocate(
ssize_t size, int device, void* stream
Member:

Q: device is ignored by design? (I was reviewing cupy/cupy#7210 and noticed this.)

Contributor Author (@shwina, Dec 21, 2022):

Ah, great catch! This brings out a subtle problem:

In RMM, each device has its own memory resource. Thus, to do the allocation on a specified device with RMM, I would write the torch allocate function like this:

cdef public void* allocate(ssize_t size, int device, void* stream) except * with gil:                                                                                                                                                                                       
    cdef device_memory_resource* mr = get_per_device_resource(device)                                                                                                                            
    return mr[0].allocate(size, <cudaStream_t> stream)

Unfortunately, the deallocation function does not accept a device argument, so we cannot retrieve the memory resource that was used for the allocation:

void deallocate(void* ptr, ssize_t size, void* stream)

I don't really see a way around this other than for the deallocate signature to include the device argument. cc: @emcastillo

Member:

I would love to know too. TBH I am puzzled by PyTorch's (long-time) behavior of asking for device. It should just honor the current device...

Contributor Author:

+1 to that

Contributor Author:

I'll submit a follow-up PR adding support for device, once pytorch/pytorch#91398 is merged.

) except * with gil:
cdef device_memory_resource* mr = get_current_device_resource()
return mr[0].allocate(size, <cudaStream_t> stream)

cdef public void deallocate(
void* ptr, ssize_t size, void* stream
) except * with gil:
cdef device_memory_resource* mr = get_current_device_resource()
mr[0].deallocate(ptr, size, <cudaStream_t> stream)
Contributor:

Why are these GIL-requiring functions? It seems like it's all pure C code here, no Python objects, etc.

Contributor:

To be clear, if they're getting called from GIL-requiring code in PyTorch, so be it. I just don't see a reason that these functions need to explicitly acquire the GIL. If PyT can call these in a nogil context, is there a reason for us not to allow that?

Contributor Author:

Because the allocate() and deallocate() methods can involve Python operations on Python objects, e.g., in CallbackMemoryResource or FailureCallbackResourceAdaptor.

Contributor:

I was thinking that would automatically be handled when those callbacks are invoked: the callbacks are stored as Python objects in the class, so any interaction with them should reacquire the GIL already, right? I guess the potential issue is that we cast these to void * pointers before passing them to the C++ classes, so at the point of the call we've lost Cython's safety net. Is that right? If so, we should consider (out of scope for this PR, of course) inserting the necessary Python C API calls into the relevant rmm C++ classes, i.e. in failure_callback_resource_adaptor::do_allocate.

Contributor:

That was my expectation as well. If the callback touches Python objects, shouldn't it be the responsibility of the callback to acquire/release the GIL?

Contributor:

My proposal above was wrong, there's no reason to embed this information in librmm where the callbacks could be anything (not necessarily Python objects). However, it should be the responsibility of the callbacks in rmm's Cython code to acquire the GIL as needed, and we do appear to do this correctly already. The _oom_callback_function used by the FailureCallbackResourceAdaptor acquires the GIL before calling the user-provided callback, as do both the allocate and deallocate callbacks used by the CallbackMemoryResource.

Contributor Author:

Yup -- the GIL should neither be released in C++, nor can it be released in Python. The Cython "wrapper" functions are what need to take on the responsibility of handling the GIL.

Contributor Author:

Vyas and I did a bit more exploration of exactly why we need a with gil here and ended up quite deep in the CPython and Cython internals (still without a clear answer though).

The symptom though is clear. If you take the example I have in the PR description:

import rmm                                                                                                                                                                                                 
import torch

torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
                                                                                                                                                                                                           
base_mr = rmm.mr.CudaMemoryResource()                                                                                                                                                                      
                                                                                                                                                                                                           
def allocate_func(size):                                                                                                                                                                                   
    print(f"Allocating {size} bytes")                                                                                                                                                                      
    return base_mr.allocate(size)                                                                                                                                                                          
                                                                                                                                                                                                           
def deallocate_func(ptr, size):                                                                                                                                                                            
    print(f"Deallocating {size} bytes")                                                                                                                                                                    
    return base_mr.deallocate(ptr, size)                                                                                                                                                                   
                                                                                                                                                                                                           
rmm.mr.set_current_device_resource(                                                                                                                                                                        
    rmm.mr.CallbackMemoryResource(allocate_func, deallocate_func)                                                                                                                                          
)                                                                                                                                                                                                          
                                                                                                                                                                                                           
x = torch.tensor([1, 2]).cuda()                                                                                                                                                                            
del x                                                                                                                                                                                                      
y = torch.tensor([1, 2, 3]).cuda()                                                                                                                                                                         
del y                     

If you raise an error in allocate_func while removing the `with gil`, you'll see that the error is uncaught and this eventually segfaults.
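
For instance, a hypothetical failing variant of allocate_func (a small tweak to the snippet above) is enough to reproduce the difference:

```python
def allocate_func(size):
    # Raising here exercises the error path discussed above: with the
    # `with gil` removed from the Cython entry points, this error goes
    # uncaught and eventually segfaults.
    raise MemoryError(f"refusing to allocate {size} bytes")
```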

15 changes: 15 additions & 0 deletions python/rmm/rmm.py
@@ -237,6 +237,21 @@ def rmm_cupy_allocator(nbytes):
return ptr


try:
from torch.cuda.memory import CUDAPluggableAllocator
except ImportError:
rmm_torch_allocator = None
Contributor:

Could this bite a user that accesses this attribute and passes it around thinking that it's fine when it's really just None? It might be safer to override __getattr__ for the module and have it raise an error to prevent the user from accessing this attribute when CUDAPluggableAllocator failed to import.

Contributor Author:

Could we alternatively just `pass` on the ImportError (leaving the name undefined) to achieve the same effect as defining that module `__getattr__`?

Contributor:

I think you'd get close to the same effect, just a slightly less user-friendly version. With a __getattr__ override you could provide a more friendly error message indicating that this happened because the torch allocator failed to import, whereas if you just avoid defining it the user will see an AttributeError without any additional diagnostics and may think it's a bug in rmm.

It's a very minor point though, I'm fine leaving this as is for now and only revisiting in the future if we get a lot of user questions about why the allocator is None.
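
For the record, the module-level `__getattr__` being discussed might look roughly like this (an illustrative sketch, not what this PR implements):

```python
# Hypothetical fallback for rmm/__init__.py when the torch import fails:
def __getattr__(name):
    if name == "rmm_torch_allocator":
        raise AttributeError(
            "rmm_torch_allocator is unavailable because "
            "torch.cuda.memory.CUDAPluggableAllocator could not be imported"
        )
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```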

else:
import rmm._lib.torch_allocator

_alloc_free_lib_path = rmm._lib.torch_allocator.__file__
Member:

This is neat!

rmm_torch_allocator = CUDAPluggableAllocator(
_alloc_free_lib_path,
alloc_fn_name="allocate",
free_fn_name="deallocate",
)
Member (on lines +248 to +252):

Q: Would this honor rmm.reinitialize() if a user changes the MR?

Contributor Author:

rmm.reinitialize() resets the default memory resource used by RMM. Each call to allocate() and deallocate() queries the default memory resource via a call to get_current_device_resource(), so -- yes.
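
A minimal sketch of what that means in practice (assuming the lookup-per-call behavior described above):

```python
import rmm
import torch

torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

# Changing the current resource later is picked up automatically, because the
# plugged-in allocate()/deallocate() call get_current_device_resource() on
# every invocation instead of capturing a resource up front.
rmm.reinitialize(pool_allocator=True)
x = torch.tensor([1, 2]).cuda()  # served by the pool resource
```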



def register_reinitialize_hook(func, *args, **kwargs):
"""
Add a function to the list of functions ("hooks") that will be
53 changes: 43 additions & 10 deletions python/rmm/tests/test_rmm.py
@@ -604,20 +604,21 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc):
array_tester("u1", 2 * nelem, alloc) # should trigger release


def test_statistics_resource_adaptor():

cuda_mr = rmm.mr.CudaMemoryResource()
@pytest.fixture
def stats_mr():
Contributor:

Should this be set up as a yield fixture and reset the current device resource afterward?

Contributor Author:

We already have an autouse, function-scoped fixture that does that: https://github.com/rapidsai/rmm/blob/branch-23.02/python/rmm/tests/test_rmm.py#L46. I'm guessing that should just work as expected?
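
For reference, that kind of autouse fixture looks roughly like this (an illustrative sketch; the real one lives at the link above and may differ, and the fixture name here is made up):

```python
import pytest
import rmm

@pytest.fixture(autouse=True)
def reset_rmm_state():
    # Let the test run, then restore RMM's default resource so per-test
    # resource changes don't leak into other tests.
    yield
    rmm.reinitialize()
```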

mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
return mr

mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr)

rmm.mr.set_current_device_resource(mr)
def test_statistics_resource_adaptor(stats_mr):

buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]

for i in range(9, 0, -2):
del buffers[i]

assert mr.allocation_counts == {
assert stats_mr.allocation_counts == {
"current_bytes": 5000,
"current_count": 5,
"peak_bytes": 10000,
@@ -627,7 +628,7 @@ def test_statistics_resource_adaptor():
}

# Push a new Tracking adaptor
mr2 = rmm.mr.StatisticsResourceAdaptor(mr)
mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr)
rmm.mr.set_current_device_resource(mr2)

for _ in range(2):
@@ -641,7 +642,7 @@ def test_statistics_resource_adaptor():
"total_bytes": 2000,
"total_count": 2,
}
assert mr.allocation_counts == {
assert stats_mr.allocation_counts == {
"current_bytes": 7000,
"current_count": 7,
"peak_bytes": 10000,
@@ -661,18 +662,18 @@ def test_statistics_resource_adaptor():
"total_bytes": 2000,
"total_count": 2,
}
assert mr.allocation_counts == {
assert stats_mr.allocation_counts == {
"current_bytes": 0,
"current_count": 0,
"peak_bytes": 10000,
"peak_count": 10,
"total_bytes": 12000,
"total_count": 12,
}
gc.collect()


def test_tracking_resource_adaptor():

cuda_mr = rmm.mr.CudaMemoryResource()

mr = rmm.mr.TrackingResourceAdaptor(cuda_mr, capture_stacks=True)
@@ -914,3 +915,35 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy):
result = db_copy.copy_to_host()

np.testing.assert_equal(expected, result)


@pytest.fixture(scope="session")
def torch_allocator():
try:
from torch.cuda.memory import change_current_allocator
except ImportError:
pytest.skip("pytorch pluggable allocator not available")
change_current_allocator(rmm.rmm_torch_allocator)


def test_rmm_torch_allocator(torch_allocator, stats_mr):
import torch

assert stats_mr.allocation_counts["current_bytes"] == 0
x = torch.tensor([1, 2]).cuda()
assert stats_mr.allocation_counts["current_bytes"] > 0
del x
assert stats_mr.allocation_counts["current_bytes"] == 0


def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr):
import torch

assert stats_mr.allocation_counts["current_bytes"] == 0
s = torch.cuda.Stream()
with torch.cuda.stream(s):
x = torch.tensor([1, 2]).cuda()
torch.cuda.current_stream().wait_stream(s)
assert stats_mr.allocation_counts["current_bytes"] > 0
del x
assert stats_mr.allocation_counts["current_bytes"] == 0