From f1e9413957d495dee2b4c3cff8cecd9529d207ce Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 07:35:37 -0500 Subject: [PATCH 01/15] Add RMM PyTorch allocator --- python/rmm/_lib/CMakeLists.txt | 3 ++- python/rmm/_lib/memory_resource.pxd | 3 +++ python/rmm/_lib/torch_allocator.pyx | 18 ++++++++++++++++ python/rmm/rmm.py | 33 +++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 python/rmm/_lib/torch_allocator.pyx diff --git a/python/rmm/_lib/CMakeLists.txt b/python/rmm/_lib/CMakeLists.txt index 44f4513b2..9e90d7e99 100644 --- a/python/rmm/_lib/CMakeLists.txt +++ b/python/rmm/_lib/CMakeLists.txt @@ -12,7 +12,8 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx) +set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx + torch_allocator.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/_lib/memory_resource.pxd index 387d39866..6f98ed644 100644 --- a/python/rmm/_lib/memory_resource.pxd +++ b/python/rmm/_lib/memory_resource.pxd @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from cuda.ccudart cimport cudaStream_t from libc.stdint cimport int8_t from libcpp.memory cimport shared_ptr from libcpp.string cimport string @@ -22,7 +23,9 @@ cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass device_memory_resource: void* allocate(size_t bytes) except + + void* allocate(size_t bytes, cudaStream_t stream) except + void deallocate(void* ptr, size_t bytes) except + + void deallocate(void* ptr, size_t bytes, cudaStream_t stream) except + cdef class DeviceMemoryResource: cdef shared_ptr[device_memory_resource] c_obj diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx new file mode 100644 index 000000000..548bb7b19 --- /dev/null +++ b/python/rmm/_lib/torch_allocator.pyx @@ -0,0 +1,18 @@ +from cuda.ccudart cimport cudaStream_t +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from rmm._lib.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef device_memory_resource* get_current_device_resource \ + "rmm::mr::get_current_device_resource" () + +cdef public void* allocate(ssize_t size, int device, void* stream) except *: + cdef device_memory_resource* mr = get_current_device_resource() + return mr[0].allocate(size, stream) + +cdef public void deallocate(void* ptr, ssize_t size, void* stream) except *: + cdef device_memory_resource* mr = get_current_device_resource() + mr[0].deallocate(ptr, size, stream) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 398d83de3..9aeaf5181 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import ctypes +import warnings from cuda.cuda import CUdeviceptr, cuIpcGetMemHandle from numba import config, cuda @@ -237,6 +238,38 @@ def rmm_cupy_allocator(nbytes): return ptr +def _set_pytorch_allocator(): + try: + from torch.cuda.memory import ( + CUDAPluggableAllocator, + change_current_allocator, + ) + except ImportError: + return + else: + import rmm._lib.torch_allocator + + alloc_free_lib_path = rmm._lib.torch_allocator.__file__ + + rmm_torch_allocator = CUDAPluggableAllocator( + alloc_free_lib_path, + alloc_fn_name="allocate", + free_fn_name="deallocate", + ) + + try: + change_current_allocator(rmm_torch_allocator) + except RuntimeError as e: + warnings.warn( + "RMM could not change the PyTorch CUDA allocator " + "because another allocator is already in use.", + RuntimeWarning, + ) + + +_set_pytorch_allocator() + + def register_reinitialize_hook(func, *args, **kwargs): """ Add a function to the list of functions ("hooks") that will be From 53a18ab7fa654e001df924c10ae3175ef557af08 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 07:46:39 -0500 Subject: [PATCH 02/15] Add `with gil` to enable calling Python code in allocate/deallocate --- python/rmm/_lib/torch_allocator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 548bb7b19..743ae6cdd 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -9,10 +9,10 @@ cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: cdef device_memory_resource* get_current_device_resource \ "rmm::mr::get_current_device_resource" () -cdef public void* allocate(ssize_t size, int device, void* stream) except *: +cdef public void* allocate(ssize_t size, int device, void* stream) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() return mr[0].allocate(size, stream) -cdef public void deallocate(void* ptr, ssize_t size, void* stream) except *: +cdef public void deallocate(void* ptr, ssize_t size, void* stream) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() mr[0].deallocate(ptr, size, stream) From aab7e0733453df56e6d8cbd67136984e383e8ebe Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 13:10:06 -0500 Subject: [PATCH 03/15] Don't set pytorch allocator by default --- python/rmm/__init__.py | 1 + python/rmm/rmm.py | 43 +++++++++++++++--------------------------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/python/rmm/__init__.py b/python/rmm/__init__.py index acdeb93a8..9fb13fe73 100644 --- a/python/rmm/__init__.py +++ b/python/rmm/__init__.py @@ -25,6 +25,7 @@ register_reinitialize_hook, reinitialize, rmm_cupy_allocator, + rmm_torch_allocator, unregister_reinitialize_hook, ) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 9aeaf5181..b4dd20b0b 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -238,36 +238,23 @@ def rmm_cupy_allocator(nbytes): return ptr -def _set_pytorch_allocator(): - try: - from torch.cuda.memory import ( - CUDAPluggableAllocator, - change_current_allocator, - ) - except ImportError: - return - else: - import rmm._lib.torch_allocator - - alloc_free_lib_path = rmm._lib.torch_allocator.__file__ - - rmm_torch_allocator = CUDAPluggableAllocator( - alloc_free_lib_path, - alloc_fn_name="allocate", - free_fn_name="deallocate", - ) - - try: - change_current_allocator(rmm_torch_allocator) - except RuntimeError as e: - warnings.warn( - "RMM could not 
change the PyTorch CUDA allocator " - "because another allocator is already in use.", - RuntimeWarning, - ) +try: + from torch.cuda.memory import ( + CUDAPluggableAllocator, + change_current_allocator, + ) +except ImportError: + rmm_torch_allocator = None +else: + import rmm._lib.torch_allocator + _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ -_set_pytorch_allocator() + rmm_torch_allocator = CUDAPluggableAllocator( + _alloc_free_lib_path, + alloc_fn_name="allocate", + free_fn_name="deallocate", + ) def register_reinitialize_hook(func, *args, **kwargs): From 63f2692d3ff410687ea8d67ade4af6c3835fb5e9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Nov 2022 15:08:13 -0500 Subject: [PATCH 04/15] Add tests --- python/rmm/tests/test_rmm.py | 57 +++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 931ff5336..2eca417c7 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -604,20 +604,21 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release -def test_statistics_resource_adaptor(): - - cuda_mr = rmm.mr.CudaMemoryResource() +@pytest.fixture +def stats_mr(): + mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) + rmm.mr.set_current_device_resource(mr) + return mr - mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) - rmm.mr.set_current_device_resource(mr) +def test_statistics_resource_adaptor(stats_mr): buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] for i in range(9, 0, -2): del buffers[i] - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 5000, "current_count": 5, "peak_bytes": 10000, @@ -627,7 +628,7 @@ def test_statistics_resource_adaptor(): } # Push a new Tracking adaptor - mr2 = rmm.mr.StatisticsResourceAdaptor(mr) + mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr) rmm.mr.set_current_device_resource(mr2) for _ in range(2): @@ -641,7 +642,7 @@ def test_statistics_resource_adaptor(): "total_bytes": 2000, "total_count": 2, } - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 7000, "current_count": 7, "peak_bytes": 10000, @@ -661,7 +662,7 @@ def test_statistics_resource_adaptor(): "total_bytes": 2000, "total_count": 2, } - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, "peak_bytes": 10000, @@ -669,10 +670,10 @@ def test_statistics_resource_adaptor(): "total_bytes": 12000, "total_count": 12, } + gc.collect() def test_tracking_resource_adaptor(): - cuda_mr = rmm.mr.CudaMemoryResource() mr = rmm.mr.TrackingResourceAdaptor(cuda_mr, capture_stacks=True) @@ -914,3 +915,39 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): result = db_copy.copy_to_host() np.testing.assert_equal(expected, result) + + +@pytest.fixture +def torch_allocator(): + try: + from torch.cuda.memory import change_current_allocator + except ImportError: + pytest.skip("pytorch pluggable allocator not available") + + try: + change_current_allocator(rmm.rmm_torch_allocator) + except RuntimeError: + pass + + +def test_rmm_torch_allocator(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + x = torch.tensor([1, 2]).cuda() + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 + + +def 
test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + s = torch.cuda.Stream() + with torch.cuda.stream(s): + x = torch.tensor([1, 2]).cuda() + torch.cuda.current_stream().wait_stream(s) + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 From 6404dac684dce9d8182b538680a385f004759694 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Nov 2022 16:03:47 -0500 Subject: [PATCH 05/15] Styles --- python/rmm/_lib/torch_allocator.pyx | 8 ++++++-- python/rmm/rmm.py | 9 +-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 743ae6cdd..da4ac7170 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -9,10 +9,14 @@ cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: cdef device_memory_resource* get_current_device_resource \ "rmm::mr::get_current_device_resource" () -cdef public void* allocate(ssize_t size, int device, void* stream) except * with gil: +cdef public void* allocate( + ssize_t size, int device, void* stream +) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() return mr[0].allocate(size, stream) -cdef public void deallocate(void* ptr, ssize_t size, void* stream) except * with gil: +cdef public void deallocate( + void* ptr, ssize_t size, void* stream +) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() mr[0].deallocate(ptr, size, stream) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index b4dd20b0b..313b029bc 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import ctypes -import warnings from cuda.cuda import CUdeviceptr, cuIpcGetMemHandle from numba import config, cuda @@ -239,17 +238,11 @@ def rmm_cupy_allocator(nbytes): try: - from torch.cuda.memory import ( - CUDAPluggableAllocator, - change_current_allocator, - ) + from torch.cuda.memory import CUDAPluggableAllocator except ImportError: rmm_torch_allocator = None else: - import rmm._lib.torch_allocator - _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ - rmm_torch_allocator = CUDAPluggableAllocator( _alloc_free_lib_path, alloc_fn_name="allocate", From 73e08461e7ea1755020ef371e0c8d276eac32289 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 15:17:47 -0500 Subject: [PATCH 06/15] Don't redeclare get_current_device_resource --- python/rmm/_lib/memory_resource.pyx | 29 +++++-------------------- python/rmm/_lib/per_device_resource.pxd | 23 ++++++++++++++++++++ python/rmm/_lib/torch_allocator.pyx | 5 +---- 3 files changed, 29 insertions(+), 28 deletions(-) create mode 100644 python/rmm/_lib/per_device_resource.pxd diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx index a20d481e0..b2477d554 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/_lib/memory_resource.pyx @@ -35,6 +35,10 @@ from rmm._cuda.gpu import ( ) from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.per_device_resource cimport ( + cuda_device_id, + set_per_device_resource as cpp_set_per_device_resource, +) # Transparent handle of a C++ exception ctypedef pair[int, string] CppExcept @@ -212,29 +216,6 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ ) except + -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - - cdef cppclass cuda_device_id: - ctypedef int value_type - - cuda_device_id(value_type id) - - value_type value() - - cdef device_memory_resource* _set_current_device_resource \ - "rmm::mr::set_current_device_resource" (device_memory_resource* new_mr) - cdef device_memory_resource* _get_current_device_resource \ - "rmm::mr::get_current_device_resource" () - - cdef device_memory_resource* _set_per_device_resource \ - "rmm::mr::set_per_device_resource" ( - cuda_device_id id, - device_memory_resource* new_mr - ) - cdef device_memory_resource* _get_per_device_resource \ - "rmm::mr::get_per_device_resource"(cuda_device_id id) - - cdef class DeviceMemoryResource: cdef device_memory_resource* get_mr(self): @@ -973,7 +954,7 @@ cpdef set_per_device_resource(int device, DeviceMemoryResource mr): cdef unique_ptr[cuda_device_id] device_id = \ make_unique[cuda_device_id](device) - _set_per_device_resource(deref(device_id), mr.get_mr()) + cpp_set_per_device_resource(deref(device_id), mr.get_mr()) cpdef set_current_device_resource(DeviceMemoryResource mr): diff --git a/python/rmm/_lib/per_device_resource.pxd b/python/rmm/_lib/per_device_resource.pxd new file mode 100644 index 000000000..c33217622 --- /dev/null +++ b/python/rmm/_lib/per_device_resource.pxd @@ -0,0 +1,23 @@ +from rmm._lib.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef 
device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index da4ac7170..f34466b70 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -3,12 +3,9 @@ from libc.stdint cimport uintptr_t from libc.stdio cimport printf from rmm._lib.memory_resource cimport device_memory_resource +from rmm._lib.per_device_resource cimport get_current_device_resource -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef device_memory_resource* get_current_device_resource \ - "rmm::mr::get_current_device_resource" () - cdef public void* allocate( ssize_t size, int device, void* stream ) except * with gil: From 9c27e851724c643604fa6837b2f7de9002b3642e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 16:28:27 -0500 Subject: [PATCH 07/15] use a session-scoped fixture instead --- python/rmm/rmm.py | 2 ++ python/rmm/tests/test_rmm.py | 8 ++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 313b029bc..cae9971dc 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -242,6 +242,8 @@ def rmm_cupy_allocator(nbytes): except ImportError: rmm_torch_allocator = None else: + import rmm._lib.torch_allocator + _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ rmm_torch_allocator = CUDAPluggableAllocator( _alloc_free_lib_path, diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 2eca417c7..d6cae9a27 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -917,17 +917,13 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): np.testing.assert_equal(expected, result) -@pytest.fixture +@pytest.fixture(scope="session") def torch_allocator(): try: from torch.cuda.memory import change_current_allocator except ImportError: pytest.skip("pytorch pluggable allocator not available") - - try: - change_current_allocator(rmm.rmm_torch_allocator) - except RuntimeError: - pass + change_current_allocator(rmm.rmm_torch_allocator) def test_rmm_torch_allocator(torch_allocator, stats_mr): From 39251dbb06b9ed01eb6e2fd7b89177b51a960096 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 16:52:45 -0500 Subject: [PATCH 08/15] Add a note on how to use RMM + PyTorch --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 91c5c577f..197e9a5cd 100644 --- a/README.md +++ b/README.md @@ -732,3 +732,18 @@ This can be done in two ways: **Note:** This only configures Numba to use the current RMM resource for allocations. It does not initialize nor change the current resource, e.g., enabling a memory pool. See [here](#memoryresource-objects) for more information on changing the current memory resource. + +### Using RMM with PyTorch + +[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM for memory allocation. +For example, to configure PyTorch to use an RMM-managed pool, you can do the following: + +```python +import rmm +import torch + +rmm.reinitialize(pool_allocator=True) +torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) +``` + +PyTorch and RMM will now share the same memory pool. 
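
If this configuration happens in library code rather than at an application entry point, a little defensive handling helps. A sketch, based on the fallbacks used earlier in this series (`rmm.rmm_torch_allocator` is `None` when a suitable PyTorch is not installed, and `change_current_allocator` raises `RuntimeError` once another allocator is already in use); the warning messages below are illustrative only:

```python
import warnings

import rmm

if rmm.rmm_torch_allocator is None:
    # PyTorch is missing, or this PyTorch build has no CUDAPluggableAllocator;
    # rmm.rmm_torch_allocator is defined as None in that case.
    warnings.warn(
        "rmm_torch_allocator unavailable; keeping PyTorch's default allocator"
    )
else:
    import torch

    try:
        torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
    except RuntimeError:
        # change_current_allocator refuses to switch once another allocator
        # is already in use (e.g. a CUDA tensor was created earlier in the
        # process).
        warnings.warn("could not switch PyTorch's CUDA allocator to RMM")
```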
From 67133619d63fbc23b18152147ba3a9d3da600bbc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 14 Dec 2022 10:07:10 -0500 Subject: [PATCH 09/15] More doc --- README.md | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 197e9a5cd..32ae806c5 100644 --- a/README.md +++ b/README.md @@ -735,8 +735,9 @@ See [here](#memoryresource-objects) for more information on changing the current ### Using RMM with PyTorch -[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM for memory allocation. -For example, to configure PyTorch to use an RMM-managed pool, you can do the following: +[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM +for memory allocation. For example, to configure PyTorch to use an +RMM-managed pool: ```python import rmm @@ -747,3 +748,31 @@ torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) ``` PyTorch and RMM will now share the same memory pool. + +You can, of course, use a custom memory resource with PyTorch as well: + +```python +import rmm +import torch + +# configure RMM to use a managed memory resource, wrapped with a +# statistics resource adaptor that can report information about the +# amount of memory allocated: +mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.ManagedMemoryResource()) +rmm.mr.set_current_device_resource(mr) + +# configure PyTorch to use RMM for allocations: +torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) + +x = torch.tensor([1, 2]).cuda() + +# the memory resource reports information about PyTorch allocations: +mr.allocation_counts +Out[6]: +{'current_bytes': 16, + 'current_count': 1, + 'peak_bytes': 16, + 'peak_count': 1, + 'total_bytes': 16, + 'total_count': 1} +``` From a53ce953d80610ef0cb78f6a8f3ba4462f6f75f0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 15 Dec 2022 11:48:43 -0500 Subject: [PATCH 10/15] Add new test_rmm_pytorch.py. Move fixtures to conftest.py. 
--- python/rmm/tests/__init__.py | 0 python/rmm/tests/conftest.py | 21 ++++++++++++ python/rmm/tests/test_rmm.py | 50 ---------------------------- python/rmm/tests/test_rmm_pytorch.py | 37 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 50 deletions(-) create mode 100644 python/rmm/tests/__init__.py create mode 100644 python/rmm/tests/conftest.py create mode 100644 python/rmm/tests/test_rmm_pytorch.py diff --git a/python/rmm/tests/__init__.py b/python/rmm/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/rmm/tests/conftest.py b/python/rmm/tests/conftest.py new file mode 100644 index 000000000..5fad81c79 --- /dev/null +++ b/python/rmm/tests/conftest.py @@ -0,0 +1,21 @@ +import pytest + +import rmm + + +@pytest.fixture(scope="function", autouse=True) +def rmm_auto_reinitialize(): + # Run the test + yield + + # Automatically reinitialize the current memory resource after running each + # test + + rmm.reinitialize() + + +@pytest.fixture +def stats_mr(): + mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) + rmm.mr.set_current_device_resource(mr) + return mr diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index d6cae9a27..93ef89fb6 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -42,17 +42,6 @@ ) -@pytest.fixture(scope="function", autouse=True) -def rmm_auto_reinitialize(): - - # Run the test - yield - - # Automatically reinitialize the current memory resource after running each - # test - rmm.reinitialize() - - def array_tester(dtype, nelem, alloc): # data h_in = np.full(nelem, 3.2, dtype) @@ -604,13 +593,6 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release -@pytest.fixture -def stats_mr(): - mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) - rmm.mr.set_current_device_resource(mr) - return mr - - def test_statistics_resource_adaptor(stats_mr): buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] @@ -915,35 +897,3 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): result = db_copy.copy_to_host() np.testing.assert_equal(expected, result) - - -@pytest.fixture(scope="session") -def torch_allocator(): - try: - from torch.cuda.memory import change_current_allocator - except ImportError: - pytest.skip("pytorch pluggable allocator not available") - change_current_allocator(rmm.rmm_torch_allocator) - - -def test_rmm_torch_allocator(torch_allocator, stats_mr): - import torch - - assert stats_mr.allocation_counts["current_bytes"] == 0 - x = torch.tensor([1, 2]).cuda() - assert stats_mr.allocation_counts["current_bytes"] > 0 - del x - assert stats_mr.allocation_counts["current_bytes"] == 0 - - -def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): - import torch - - assert stats_mr.allocation_counts["current_bytes"] == 0 - s = torch.cuda.Stream() - with torch.cuda.stream(s): - x = torch.tensor([1, 2]).cuda() - torch.cuda.current_stream().wait_stream(s) - assert stats_mr.allocation_counts["current_bytes"] > 0 - del x - assert stats_mr.allocation_counts["current_bytes"] == 0 diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/tests/test_rmm_pytorch.py new file mode 100644 index 000000000..9471af346 --- /dev/null +++ b/python/rmm/tests/test_rmm_pytorch.py @@ -0,0 +1,37 @@ +import pytest + +import rmm + +torch = pytest.importorskip("torch") + + +@pytest.fixture(scope="session") +def torch_allocator(): + try: + from torch.cuda.memory import change_current_allocator 
+ except ImportError: + pytest.skip("pytorch pluggable allocator not available") + change_current_allocator(rmm.rmm_torch_allocator) + + +def test_rmm_torch_allocator(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + x = torch.tensor([1, 2]).cuda() + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 + + +def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + s = torch.cuda.Stream() + with torch.cuda.stream(s): + x = torch.tensor([1, 2]).cuda() + torch.cuda.current_stream().wait_stream(s) + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 From 741a1df0f8a6a942c6ad8852e27b1ee33cb7c591 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:57:13 -0500 Subject: [PATCH 11/15] Ensure `UpstreamResourceAdaptor` is not cleared by the Python GC (#1170) Closes #1169. Essentially, we are running into the situation described in https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#disabling-cycle-breaking-tp-clear with `UpstreamResourceAdaptor`. The solution is to prevent clearing of `UpstreamResourceAdaptor` objects by decorating them with `no_gc_clear`. Cython calls out the following: > If you use no_gc_clear, it is important that any given reference cycle contains at least one object without no_gc_clear. Otherwise, the cycle cannot be broken, which is a memory leak. The other object in RMM that we mark `@no_gc_clear` is `DeviceBuffer`, and a `DeviceBuffer` can keep a reference to an `UpstreamResourceAdaptor`. But, an `UpstreamResourceAdaptor` cannot keep a reference to a `DeviceBuffer`, so instances of the two cannot form a reference cycle AFAICT. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1170 --- python/rmm/_lib/memory_resource.pyx | 3 +++ python/rmm/tests/test_rmm.py | 29 +++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx index b2477d554..501cf51cf 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/_lib/memory_resource.pyx @@ -16,6 +16,7 @@ import os import warnings from collections import defaultdict +cimport cython from cython.operator cimport dereference as deref from libc.stdint cimport int8_t, int64_t, uintptr_t from libcpp cimport bool @@ -228,6 +229,8 @@ cdef class DeviceMemoryResource: self.c_obj.get().deallocate((ptr), nbytes) +# See the note about `no_gc_clear` in `device_buffer.pyx`. 
+@cython.no_gc_clear cdef class UpstreamResourceAdaptor(DeviceMemoryResource): def __cinit__(self, DeviceMemoryResource upstream_mr, *args, **kwargs): diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 93ef89fb6..f79c60b43 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -725,6 +725,13 @@ def callback(nbytes: int) -> bool: def test_dev_buf_circle_ref_dealloc(): + # This test creates a reference cycle containing a `DeviceBuffer` + # and ensures that the garbage collector does not clear it, i.e., + # that the GC does not remove all references to other Python + # objects from it. The `DeviceBuffer` needs to keep its reference + # to the `DeviceMemoryResource` that was used to create it in + # order to be cleaned up properly. See GH #931. + rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) dbuf1 = rmm.DeviceBuffer(size=1_000_000) @@ -734,17 +741,27 @@ def test_dev_buf_circle_ref_dealloc(): l1.append(l1) # due to the reference cycle, the device buffer doesn't actually get - # cleaned up until later, when we invoke `gc.collect()`: + # cleaned up until after `gc.collect()` is called. del dbuf1, l1 rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) - # by now, the only remaining reference to the *original* memory - # resource should be in `dbuf1`. However, the cyclic garbage collector - # will eliminate that reference when it clears the object via its - # `tp_clear` method. Later, when `tp_dealloc` attemps to actually - # deallocate `dbuf1` (which needs the MR alive), a segfault occurs. + # test that after the call to `gc.collect()`, the `DeviceBuffer` + # is deallocated successfully (i.e., without a segfault). + gc.collect() + + +def test_upstream_mr_circle_ref_dealloc(): + # This test is just like the one above, except it tests that + # instances of `UpstreamResourceAdaptor` (such as + # `PoolMemoryResource`) are not cleared by the GC. 
+ rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) + mr = rmm.mr.PoolMemoryResource(rmm.mr.get_current_device_resource()) + l1 = [mr] + l1.append(l1) + del mr, l1 + rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) gc.collect() From 4534bcaa21796244227ae3868f643f548d612687 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 19 Dec 2022 19:23:25 -0500 Subject: [PATCH 12/15] Delete __init__.py in tests/ --- python/rmm/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/rmm/tests/__init__.py diff --git a/python/rmm/tests/__init__.py b/python/rmm/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 From db5fc5265ce853c195cea7f1451eb9261bb516dd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 21 Dec 2022 12:55:30 -0500 Subject: [PATCH 13/15] styles --- python/rmm/_lib/torch_allocator.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index f34466b70..3c177760f 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -1,6 +1,4 @@ from cuda.ccudart cimport cudaStream_t -from libc.stdint cimport uintptr_t -from libc.stdio cimport printf from rmm._lib.memory_resource cimport device_memory_resource from rmm._lib.per_device_resource cimport get_current_device_resource From 2b343c2dbea27170d0dd565acc5e5a610c83117b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 5 Jan 2023 09:17:46 -0500 Subject: [PATCH 14/15] Correctly type stream parameter for allocate/deallocate --- python/rmm/_lib/memory_resource.pxd | 11 ++++++++--- python/rmm/_lib/torch_allocator.pyx | 11 +++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/_lib/memory_resource.pxd index 6f98ed644..5bb3746bc 100644 --- a/python/rmm/_lib/memory_resource.pxd +++ b/python/rmm/_lib/memory_resource.pxd @@ -12,20 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cuda.ccudart cimport cudaStream_t from libc.stdint cimport int8_t from libcpp.memory cimport shared_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from rmm._lib.cuda_stream_view cimport cuda_stream_view + cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass device_memory_resource: void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cudaStream_t stream) except + + void* allocate(size_t bytes, cuda_stream_view stream) except + void deallocate(void* ptr, size_t bytes) except + - void deallocate(void* ptr, size_t bytes, cudaStream_t stream) except + + void deallocate( + void* ptr, + size_t bytes, + cuda_stream_view stream + ) except + cdef class DeviceMemoryResource: cdef shared_ptr[device_memory_resource] c_obj diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 3c177760f..12dc9fe11 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -1,5 +1,6 @@ from cuda.ccudart cimport cudaStream_t +from rmm._lib.cuda_stream_view cimport cuda_stream_view from rmm._lib.memory_resource cimport device_memory_resource from rmm._lib.per_device_resource cimport get_current_device_resource @@ -8,10 +9,16 @@ cdef public void* allocate( ssize_t size, int device, void* stream ) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() - return mr[0].allocate(size, stream) + cdef cuda_stream_view stream_view = cuda_stream_view( + (stream) + ) + return mr[0].allocate(size, stream_view) cdef public void deallocate( void* ptr, ssize_t size, void* stream ) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() - mr[0].deallocate(ptr, size, stream) + cdef cuda_stream_view stream_view = cuda_stream_view( + (stream) + ) + mr[0].deallocate(ptr, size, stream_view) From 232fbd089519563bbf151074f4de32d64a129bd6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 5 Jan 2023 10:06:57 -0500 Subject: [PATCH 15/15] Address reviews --- README.md | 9 ++++++--- python/rmm/tests/test_rmm_pytorch.py | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 32ae806c5..a49430150 100644 --- a/README.md +++ b/README.md @@ -755,15 +755,18 @@ You can, of course, use a custom memory resource with PyTorch as well: import rmm import torch +# note that you can configure PyTorch to use RMM either before or +# after changing RMM's memory resource. PyTorch will use whatever +# memory resource is configured to be the "current" memory resource at +# the time of allocation. 
+torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) + # configure RMM to use a managed memory resource, wrapped with a # statistics resource adaptor that can report information about the # amount of memory allocated: mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.ManagedMemoryResource()) rmm.mr.set_current_device_resource(mr) -# configure PyTorch to use RMM for allocations: -torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) - x = torch.tensor([1, 2]).cuda() # the memory resource reports information about PyTorch allocations: diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/tests/test_rmm_pytorch.py index 9471af346..eaa40c0ed 100644 --- a/python/rmm/tests/test_rmm_pytorch.py +++ b/python/rmm/tests/test_rmm_pytorch.py @@ -1,3 +1,5 @@ +import gc + import pytest import rmm @@ -15,18 +17,15 @@ def torch_allocator(): def test_rmm_torch_allocator(torch_allocator, stats_mr): - import torch - assert stats_mr.allocation_counts["current_bytes"] == 0 x = torch.tensor([1, 2]).cuda() assert stats_mr.allocation_counts["current_bytes"] > 0 del x + gc.collect() assert stats_mr.allocation_counts["current_bytes"] == 0 def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): - import torch - assert stats_mr.allocation_counts["current_bytes"] == 0 s = torch.cuda.Stream() with torch.cuda.stream(s): @@ -34,4 +33,5 @@ def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): torch.cuda.current_stream().wait_stream(s) assert stats_mr.allocation_counts["current_bytes"] > 0 del x + gc.collect() assert stats_mr.allocation_counts["current_bytes"] == 0
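
To exercise the new allocator by hand outside the test suite, a minimal sketch mirroring the tests above (it assumes a fresh Python session, a CUDA-enabled PyTorch with pluggable-allocator support, and an RMM build that includes these patches):

```python
import gc

import rmm
import torch

# Route PyTorch's CUDA allocations through RMM; this must happen before the
# first CUDA tensor is created in this process.
torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

# Wrap the current device resource so allocations can be observed.
stats_mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(stats_mr)

x = torch.tensor([1, 2]).cuda()
print(stats_mr.allocation_counts["current_bytes"])  # > 0: tensor served by RMM

del x
gc.collect()  # as in the tests, collect before checking the counter again
print(stats_mr.allocation_counts["current_bytes"])  # back to 0
```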