Add RMM PyTorch allocator #1168
---

Should this use

```
from rmm._lib.memory_resource cimport get_current_device_resource
```

like in `device_buffer.pyx`?
---

Ah yeah, I think we should have a single declaration in `memory_resource.pxd`.
---

I added a new `per_device_resource.pxd` file from which we can `cimport` the function. Note that we cannot get away with declaring `get_current_device_resource` in `memory_resource.pxd`, because `memory_resource` exports a `cpdef` function also named `get_current_device_resource` that is a wrapper around the C++ function.
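For context, such a `cimport`-able declaration might look roughly like this (a hypothetical sketch of `per_device_resource.pxd`, not the PR's actual file; the header path follows RMM's C++ layout):

```cython
# Hypothetical sketch: declare the C++ function so other Cython modules can
# `cimport` it without clashing with the `cpdef` Python wrapper of the same
# name in memory_resource.
cdef extern from "rmm/mr/device/per_device_resource.hpp" \
        namespace "rmm::mr" nogil:
    cdef cppclass device_memory_resource:
        pass
    device_memory_resource* get_current_device_resource()
```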
---

Q: `device` is ignored by design? (I was reviewing cupy/cupy#7210 and noticed this.)
---

Ah, great catch! This brings out a subtle problem. In RMM, each device has its own memory resource. Thus, to do the allocation on a specified `device` with RMM, I would write the torch `allocate` function to look up that device's resource. Unfortunately, the deallocation function does not accept a `device` argument, so we cannot retrieve the memory resource that was used for allocation. I don't really see a way around this other than for the `deallocate` signature to include the `device` argument. cc: @emcastillo
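The pair of hooks being described might look like this (a hedged reconstruction, not the PR's actual code; `get_per_device_resource`, `cuda_device_id`, and `cuda_stream_view` follow RMM's C++ API):

```cython
# Hypothetical allocate hook: honor `device` by using that device's resource.
cdef public void* allocate(ssize_t size, int device, void* stream) \
        except * with gil:
    cdef device_memory_resource* mr = \
        get_per_device_resource(cuda_device_id(device))
    return mr.allocate(size, <cuda_stream_view>stream)

# The deallocate hook receives no `device`, so it can only consult the
# *current* device's resource -- which may not be the resource that
# allocated `ptr`. This is the subtle problem described above.
cdef public void deallocate(void* ptr, ssize_t size, void* stream) \
        except * with gil:
    cdef device_memory_resource* mr = get_current_device_resource()
    mr.deallocate(ptr, size, <cuda_stream_view>stream)
```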
---

I would love to know too. TBH I am puzzled by PyTorch's (long-time) behavior of asking for `device`. It should just honor the current device...
---

+1 to that
---

I'll submit a follow-up PR adding support for `device` once pytorch/pytorch#91398 is merged.
---

Why are these GIL-requiring functions? It seems like it's all pure C code here, no Python objects, etc.
---

To be clear, if they're getting called from GIL-requiring code in PyTorch, so be it. I just don't see a reason that these functions need to explicitly acquire the GIL. If PyTorch can call these in a nogil context, is there a reason for us not to allow that?
---

Because the `allocate()` and `deallocate()` methods can involve Python operations on Python objects, e.g., in `CallbackMemoryResource` or `FailureCallbackResourceAdaptor`.
---

I was thinking that would be handled automatically when those callbacks are invoked: the callbacks are stored as Python objects in the class, so any interaction with them should reacquire the GIL already, right? I guess the potential issue is that we cast these to `void *` pointers before passing them to the C++ classes, so at the point of the call we've lost Cython's safety net. Is that right? If so, we should consider (out of scope for this PR, of course) inserting the necessary Python C API calls into the relevant RMM C++ classes, i.e. in `failure_callback_resource_adaptor::do_allocate`.
---

That was my expectation as well. If the callback touches Python objects, shouldn't it be the responsibility of the callback to acquire/release the GIL?
---

My proposal above was wrong: there's no reason to embed this information in librmm, where the callbacks could be anything (not necessarily Python objects). However, it should be the responsibility of the callbacks in RMM's Cython code to acquire the GIL as needed, and we do appear to do this correctly already. The `_oom_callback_function` used by the `FailureCallbackResourceAdaptor` acquires the GIL before calling the user-provided callback, as do both the allocate and deallocate callbacks used by the `CallbackMemoryResource`.
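The pattern being described, roughly (an illustrative sketch, not the exact RMM code): a C-callable, `nogil` wrapper reacquires the GIL before touching the Python callable that was smuggled through a `void*` context pointer.

```cython
from libcpp cimport bool

# Illustrative: a nogil, C-callable wrapper around a user-provided Python
# callback that was stored as a void* context pointer.
cdef bool oom_callback_wrapper(size_t nbytes, void* ctx) noexcept nogil:
    with gil:
        callback = <object>ctx   # cast back to the stored Python object
        return callback(nbytes)  # safe to call: we hold the GIL here
```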
---

Yup -- the GIL should neither be released in C++, nor can it be released in Python. The Cython "wrapper" functions are what need to take on the responsibility of handling the GIL.
---

Vyas and I did a bit more exploration of exactly why we need a `with gil` here and ended up quite deep in the CPython and Cython internals (still without a clear answer, though). The symptom, though, is clear: take the example I have in the PR description, raise an error in `allocate_func`, and remove the `with gil` -- you'll see that the error is uncaught and eventually this segfaults.
---

Could this bite a user who accesses this attribute and passes it around thinking that it's fine when it's really just None? It might be safer to override `__getattr__` for the module and have it raise an error to prevent the user from accessing this attribute when `CUDAPluggableAllocator` failed to import.
---

Could we alternatively `pass` on `ImportError` to achieve the same effect as defining that module `__getattr__`?
---

I think you'd get close to the same effect, just a slightly less user-friendly version. With a `__getattr__` override you could provide a friendlier error message indicating that this happened because the torch allocator failed to import, whereas if you just avoid defining the attribute, the user will see an `AttributeError` without any additional diagnostics and may think it's a bug in RMM. It's a very minor point, though; I'm fine leaving this as is for now and only revisiting it in the future if we get a lot of user questions about why the allocator is None.
---

This is neat!
---

Q: Would this honor `rmm.reinitialize()` if a user changes the MR?
---

`rmm.reinitialize()` resets the default memory resource used by RMM. Each call to `allocate()` and `deallocate()` queries the default memory resource via a call to `get_current_device_resource()`, so -- yes.
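A toy model (not RMM code) of why this works: the resource is resolved per call rather than captured once, so swapping it later takes effect immediately.

```python
# Stand-in for RMM's per-device "current resource" state.
_registry = {"current": "cuda_mr"}

def get_current_device_resource():
    return _registry["current"]

def allocate(nbytes):
    # The resource is looked up at call time, not cached at setup time.
    return (get_current_device_resource(), nbytes)

def reinitialize(new_mr):
    # Stand-in for rmm.reinitialize(): replaces the current resource.
    _registry["current"] = new_mr
```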
---

Should this be set up as a `yield` fixture that resets the current device resource afterward?
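A yield-style fixture of the kind being suggested might look like this sketch; the resource getters here are stand-ins for `rmm.mr.get_current_device_resource` / `set_current_device_resource`, and in a real suite the generator would be decorated with `@pytest.fixture(autouse=True)`:

```python
# Stand-ins for the rmm.mr current-resource APIs (assumption for this sketch).
_state = {"mr": "default_mr"}

def get_current_device_resource():
    return _state["mr"]

def set_current_device_resource(mr):
    _state["mr"] = mr

def reset_current_mr():
    # Body of a yield fixture: code before `yield` runs before the test,
    # code after `yield` runs as teardown, restoring the saved resource.
    saved = get_current_device_resource()
    yield
    set_current_device_resource(saved)
```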
---

We already have an `autouse`, function-scoped fixture that does that: https://github.com/rapidsai/rmm/blob/branch-23.02/python/rmm/tests/test_rmm.py#L46. I'm guessing that should just work as expected?
---

What's the except/pass here for? Can we call `change_current_allocator` in an `else:` block from the first `try:`? Alternatively, can we use `pytest.importorskip`?
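For reference, `pytest.importorskip` returns the module when it imports and skips the requesting test otherwise, replacing a try/except `ImportError` guard. A stdlib module stands in for `torch` here so the example always runs:

```python
import pytest

# In the test suite this would be pytest.importorskip("torch"); tests in the
# module are skipped automatically if the import fails.
json = pytest.importorskip("json")
data = json.loads('{"allocator": "rmm"}')
```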
---

Ah, I think it's because a `RuntimeError` is raised if you set the torch allocator twice in the same session. Maybe we should use a session-scoped fixture instead?