From f1e9413957d495dee2b4c3cff8cecd9529d207ce Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 07:35:37 -0500 Subject: [PATCH 01/15] Add RMM PyTorch allocator --- python/rmm/_lib/CMakeLists.txt | 3 ++- python/rmm/_lib/memory_resource.pxd | 3 +++ python/rmm/_lib/torch_allocator.pyx | 18 ++++++++++++++++ python/rmm/rmm.py | 33 +++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 python/rmm/_lib/torch_allocator.pyx diff --git a/python/rmm/_lib/CMakeLists.txt b/python/rmm/_lib/CMakeLists.txt index 44f4513b2..9e90d7e99 100644 --- a/python/rmm/_lib/CMakeLists.txt +++ b/python/rmm/_lib/CMakeLists.txt @@ -12,7 +12,8 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx) +set(cython_sources device_buffer.pyx lib.pyx memory_resource.pyx cuda_stream.pyx + torch_allocator.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/_lib/memory_resource.pxd index 387d39866..6f98ed644 100644 --- a/python/rmm/_lib/memory_resource.pxd +++ b/python/rmm/_lib/memory_resource.pxd @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from cuda.ccudart cimport cudaStream_t from libc.stdint cimport int8_t from libcpp.memory cimport shared_ptr from libcpp.string cimport string @@ -22,7 +23,9 @@ cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass device_memory_resource: void* allocate(size_t bytes) except + + void* allocate(size_t bytes, cudaStream_t stream) except + void deallocate(void* ptr, size_t bytes) except + + void deallocate(void* ptr, size_t bytes, cudaStream_t stream) except + cdef class DeviceMemoryResource: cdef shared_ptr[device_memory_resource] c_obj diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx new file mode 100644 index 000000000..548bb7b19 --- /dev/null +++ b/python/rmm/_lib/torch_allocator.pyx @@ -0,0 +1,18 @@ +from cuda.ccudart cimport cudaStream_t +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from rmm._lib.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef device_memory_resource* get_current_device_resource \ + "rmm::mr::get_current_device_resource" () + +cdef public void* allocate(ssize_t size, int device, void* stream) except *: + cdef device_memory_resource* mr = get_current_device_resource() + return mr[0].allocate(size, stream) + +cdef public void deallocate(void* ptr, ssize_t size, void* stream) except *: + cdef device_memory_resource* mr = get_current_device_resource() + mr[0].deallocate(ptr, size, stream) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 398d83de3..9aeaf5181 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import ctypes +import warnings from cuda.cuda import CUdeviceptr, cuIpcGetMemHandle from numba import config, cuda @@ -237,6 +238,38 @@ def rmm_cupy_allocator(nbytes): return ptr +def _set_pytorch_allocator(): + try: + from torch.cuda.memory import ( + CUDAPluggableAllocator, + change_current_allocator, + ) + except ImportError: + return + else: + import rmm._lib.torch_allocator + + alloc_free_lib_path = rmm._lib.torch_allocator.__file__ + + rmm_torch_allocator = CUDAPluggableAllocator( + alloc_free_lib_path, + alloc_fn_name="allocate", + free_fn_name="deallocate", + ) + + try: + change_current_allocator(rmm_torch_allocator) + except RuntimeError as e: + warnings.warn( + "RMM could not change the PyTorch CUDA allocator " + "because another allocator is already in use.", + RuntimeWarning, + ) + + +_set_pytorch_allocator() + + def register_reinitialize_hook(func, *args, **kwargs): """ Add a function to the list of functions ("hooks") that will be From 53a18ab7fa654e001df924c10ae3175ef557af08 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 07:46:39 -0500 Subject: [PATCH 02/15] Add `with gil` to enable calling Python code in allocate/deallocate --- python/rmm/_lib/torch_allocator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 548bb7b19..743ae6cdd 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -9,10 +9,10 @@ cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: cdef device_memory_resource* get_current_device_resource \ "rmm::mr::get_current_device_resource" () -cdef public void* allocate(ssize_t size, int device, void* stream) except *: +cdef public void* allocate(ssize_t size, int device, void* stream) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() return mr[0].allocate(size, stream) -cdef public void deallocate(void* ptr, ssize_t size, void* stream) except *: +cdef public void deallocate(void* ptr, ssize_t size, void* stream) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() mr[0].deallocate(ptr, size, stream) From aab7e0733453df56e6d8cbd67136984e383e8ebe Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 29 Nov 2022 13:10:06 -0500 Subject: [PATCH 03/15] Don't set pytorch allocator by default --- python/rmm/__init__.py | 1 + python/rmm/rmm.py | 43 +++++++++++++++--------------------------- 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/python/rmm/__init__.py b/python/rmm/__init__.py index acdeb93a8..9fb13fe73 100644 --- a/python/rmm/__init__.py +++ b/python/rmm/__init__.py @@ -25,6 +25,7 @@ register_reinitialize_hook, reinitialize, rmm_cupy_allocator, + rmm_torch_allocator, unregister_reinitialize_hook, ) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 9aeaf5181..b4dd20b0b 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -238,36 +238,23 @@ def rmm_cupy_allocator(nbytes): return ptr -def _set_pytorch_allocator(): - try: - from torch.cuda.memory import ( - CUDAPluggableAllocator, - change_current_allocator, - ) - except ImportError: - return - else: - import rmm._lib.torch_allocator - - alloc_free_lib_path = rmm._lib.torch_allocator.__file__ - - rmm_torch_allocator = CUDAPluggableAllocator( - alloc_free_lib_path, - alloc_fn_name="allocate", - free_fn_name="deallocate", - ) - - try: - change_current_allocator(rmm_torch_allocator) - except RuntimeError as e: - warnings.warn( - "RMM could not 
change the PyTorch CUDA allocator " - "because another allocator is already in use.", - RuntimeWarning, - ) +try: + from torch.cuda.memory import ( + CUDAPluggableAllocator, + change_current_allocator, + ) +except ImportError: + rmm_torch_allocator = None +else: + import rmm._lib.torch_allocator + _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ -_set_pytorch_allocator() + rmm_torch_allocator = CUDAPluggableAllocator( + _alloc_free_lib_path, + alloc_fn_name="allocate", + free_fn_name="deallocate", + ) def register_reinitialize_hook(func, *args, **kwargs): From 63f2692d3ff410687ea8d67ade4af6c3835fb5e9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Nov 2022 15:08:13 -0500 Subject: [PATCH 04/15] Add tests --- python/rmm/tests/test_rmm.py | 57 +++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 931ff5336..2eca417c7 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -604,20 +604,21 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release -def test_statistics_resource_adaptor(): - - cuda_mr = rmm.mr.CudaMemoryResource() +@pytest.fixture +def stats_mr(): + mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) + rmm.mr.set_current_device_resource(mr) + return mr - mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) - rmm.mr.set_current_device_resource(mr) +def test_statistics_resource_adaptor(stats_mr): buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] for i in range(9, 0, -2): del buffers[i] - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 5000, "current_count": 5, "peak_bytes": 10000, @@ -627,7 +628,7 @@ def test_statistics_resource_adaptor(): } # Push a new Tracking adaptor - mr2 = rmm.mr.StatisticsResourceAdaptor(mr) + mr2 = rmm.mr.StatisticsResourceAdaptor(stats_mr) rmm.mr.set_current_device_resource(mr2) for _ in range(2): @@ -641,7 +642,7 @@ def test_statistics_resource_adaptor(): "total_bytes": 2000, "total_count": 2, } - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 7000, "current_count": 7, "peak_bytes": 10000, @@ -661,7 +662,7 @@ def test_statistics_resource_adaptor(): "total_bytes": 2000, "total_count": 2, } - assert mr.allocation_counts == { + assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, "peak_bytes": 10000, @@ -669,10 +670,10 @@ def test_statistics_resource_adaptor(): "total_bytes": 12000, "total_count": 12, } + gc.collect() def test_tracking_resource_adaptor(): - cuda_mr = rmm.mr.CudaMemoryResource() mr = rmm.mr.TrackingResourceAdaptor(cuda_mr, capture_stacks=True) @@ -914,3 +915,39 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): result = db_copy.copy_to_host() np.testing.assert_equal(expected, result) + + +@pytest.fixture +def torch_allocator(): + try: + from torch.cuda.memory import change_current_allocator + except ImportError: + pytest.skip("pytorch pluggable allocator not available") + + try: + change_current_allocator(rmm.rmm_torch_allocator) + except RuntimeError: + pass + + +def test_rmm_torch_allocator(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + x = torch.tensor([1, 2]).cuda() + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 + + +def 
test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + s = torch.cuda.Stream() + with torch.cuda.stream(s): + x = torch.tensor([1, 2]).cuda() + torch.cuda.current_stream().wait_stream(s) + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 From 6404dac684dce9d8182b538680a385f004759694 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Nov 2022 16:03:47 -0500 Subject: [PATCH 05/15] Styles --- python/rmm/_lib/torch_allocator.pyx | 8 ++++++-- python/rmm/rmm.py | 9 +-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 743ae6cdd..da4ac7170 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -9,10 +9,14 @@ cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: cdef device_memory_resource* get_current_device_resource \ "rmm::mr::get_current_device_resource" () -cdef public void* allocate(ssize_t size, int device, void* stream) except * with gil: +cdef public void* allocate( + ssize_t size, int device, void* stream +) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() return mr[0].allocate(size, stream) -cdef public void deallocate(void* ptr, ssize_t size, void* stream) except * with gil: +cdef public void deallocate( + void* ptr, ssize_t size, void* stream +) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() mr[0].deallocate(ptr, size, stream) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index b4dd20b0b..313b029bc 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import ctypes -import warnings from cuda.cuda import CUdeviceptr, cuIpcGetMemHandle from numba import config, cuda @@ -239,17 +238,11 @@ def rmm_cupy_allocator(nbytes): try: - from torch.cuda.memory import ( - CUDAPluggableAllocator, - change_current_allocator, - ) + from torch.cuda.memory import CUDAPluggableAllocator except ImportError: rmm_torch_allocator = None else: - import rmm._lib.torch_allocator - _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ - rmm_torch_allocator = CUDAPluggableAllocator( _alloc_free_lib_path, alloc_fn_name="allocate", From 73e08461e7ea1755020ef371e0c8d276eac32289 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 15:17:47 -0500 Subject: [PATCH 06/15] Don't redeclare get_current_device_resource --- python/rmm/_lib/memory_resource.pyx | 29 +++++-------------------- python/rmm/_lib/per_device_resource.pxd | 23 ++++++++++++++++++++ python/rmm/_lib/torch_allocator.pyx | 5 +---- 3 files changed, 29 insertions(+), 28 deletions(-) create mode 100644 python/rmm/_lib/per_device_resource.pxd diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx index a20d481e0..b2477d554 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/_lib/memory_resource.pyx @@ -35,6 +35,10 @@ from rmm._cuda.gpu import ( ) from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm._lib.per_device_resource cimport ( + cuda_device_id, + set_per_device_resource as cpp_set_per_device_resource, +) # Transparent handle of a C++ exception ctypedef pair[int, string] CppExcept @@ -212,29 +216,6 @@ cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ ) except + -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - - cdef cppclass cuda_device_id: - ctypedef int value_type - - cuda_device_id(value_type id) - - value_type value() - - cdef device_memory_resource* _set_current_device_resource \ - "rmm::mr::set_current_device_resource" (device_memory_resource* new_mr) - cdef device_memory_resource* _get_current_device_resource \ - "rmm::mr::get_current_device_resource" () - - cdef device_memory_resource* _set_per_device_resource \ - "rmm::mr::set_per_device_resource" ( - cuda_device_id id, - device_memory_resource* new_mr - ) - cdef device_memory_resource* _get_per_device_resource \ - "rmm::mr::get_per_device_resource"(cuda_device_id id) - - cdef class DeviceMemoryResource: cdef device_memory_resource* get_mr(self): @@ -973,7 +954,7 @@ cpdef set_per_device_resource(int device, DeviceMemoryResource mr): cdef unique_ptr[cuda_device_id] device_id = \ make_unique[cuda_device_id](device) - _set_per_device_resource(deref(device_id), mr.get_mr()) + cpp_set_per_device_resource(deref(device_id), mr.get_mr()) cpdef set_current_device_resource(DeviceMemoryResource mr): diff --git a/python/rmm/_lib/per_device_resource.pxd b/python/rmm/_lib/per_device_resource.pxd new file mode 100644 index 000000000..c33217622 --- /dev/null +++ b/python/rmm/_lib/per_device_resource.pxd @@ -0,0 +1,23 @@ +from rmm._lib.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef 
device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index da4ac7170..f34466b70 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -3,12 +3,9 @@ from libc.stdint cimport uintptr_t from libc.stdio cimport printf from rmm._lib.memory_resource cimport device_memory_resource +from rmm._lib.per_device_resource cimport get_current_device_resource -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef device_memory_resource* get_current_device_resource \ - "rmm::mr::get_current_device_resource" () - cdef public void* allocate( ssize_t size, int device, void* stream ) except * with gil: From 9c27e851724c643604fa6837b2f7de9002b3642e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 16:28:27 -0500 Subject: [PATCH 07/15] use a session-scoped fixture instead --- python/rmm/rmm.py | 2 ++ python/rmm/tests/test_rmm.py | 8 ++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/rmm/rmm.py b/python/rmm/rmm.py index 313b029bc..cae9971dc 100644 --- a/python/rmm/rmm.py +++ b/python/rmm/rmm.py @@ -242,6 +242,8 @@ def rmm_cupy_allocator(nbytes): except ImportError: rmm_torch_allocator = None else: + import rmm._lib.torch_allocator + _alloc_free_lib_path = rmm._lib.torch_allocator.__file__ rmm_torch_allocator = CUDAPluggableAllocator( _alloc_free_lib_path, diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 2eca417c7..d6cae9a27 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -917,17 +917,13 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): np.testing.assert_equal(expected, result) -@pytest.fixture +@pytest.fixture(scope="session") def torch_allocator(): try: from torch.cuda.memory import change_current_allocator except ImportError: pytest.skip("pytorch pluggable allocator not available") - - try: - change_current_allocator(rmm.rmm_torch_allocator) - except RuntimeError: - pass + change_current_allocator(rmm.rmm_torch_allocator) def test_rmm_torch_allocator(torch_allocator, stats_mr): From 39251dbb06b9ed01eb6e2fd7b89177b51a960096 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Dec 2022 16:52:45 -0500 Subject: [PATCH 08/15] Add a note on how to use RMM + PyTorch --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 91c5c577f..197e9a5cd 100644 --- a/README.md +++ b/README.md @@ -732,3 +732,18 @@ This can be done in two ways: **Note:** This only configures Numba to use the current RMM resource for allocations. It does not initialize nor change the current resource, e.g., enabling a memory pool. See [here](#memoryresource-objects) for more information on changing the current memory resource. + +### Using RMM with PyTorch + +[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM for memory allocation. +For example, to configure PyTorch to use an RMM-managed pool, you can do the following: + +```python +import rmm +import torch + +rmm.reinitialize(pool_allocator=True) +torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) +``` + +PyTorch and RMM will now share the same memory pool. 
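
If this configuration happens in library code rather than at an application entry point, a little defensive handling helps. A sketch, based on the fallbacks used earlier in this series (`rmm.rmm_torch_allocator` is `None` when a suitable PyTorch is not installed, and `change_current_allocator` raises `RuntimeError` once another allocator is already in use); the warning messages below are illustrative only:

```python
import warnings

import rmm

if rmm.rmm_torch_allocator is None:
    # PyTorch is missing, or this PyTorch build has no CUDAPluggableAllocator;
    # rmm.rmm_torch_allocator is defined as None in that case.
    warnings.warn(
        "rmm_torch_allocator unavailable; keeping PyTorch's default allocator"
    )
else:
    import torch

    try:
        torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
    except RuntimeError:
        # change_current_allocator refuses to switch once another allocator
        # is already in use (e.g. a CUDA tensor was created earlier in the
        # process).
        warnings.warn("could not switch PyTorch's CUDA allocator to RMM")
```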
From 67133619d63fbc23b18152147ba3a9d3da600bbc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 14 Dec 2022 10:07:10 -0500 Subject: [PATCH 09/15] More doc --- README.md | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 197e9a5cd..32ae806c5 100644 --- a/README.md +++ b/README.md @@ -735,8 +735,9 @@ See [here](#memoryresource-objects) for more information on changing the current ### Using RMM with PyTorch -[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM for memory allocation. -For example, to configure PyTorch to use an RMM-managed pool, you can do the following: +[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) can use RMM +for memory allocation. For example, to configure PyTorch to use an +RMM-managed pool: ```python import rmm @@ -747,3 +748,31 @@ torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) ``` PyTorch and RMM will now share the same memory pool. + +You can, of course, use a custom memory resource with PyTorch as well: + +```python +import rmm +import torch + +# configure RMM to use a managed memory resource, wrapped with a +# statistics resource adaptor that can report information about the +# amount of memory allocated: +mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.ManagedMemoryResource()) +rmm.mr.set_current_device_resource(mr) + +# configure PyTorch to use RMM for allocations: +torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) + +x = torch.tensor([1, 2]).cuda() + +# the memory resource reports information about PyTorch allocations: +mr.allocation_counts +Out[6]: +{'current_bytes': 16, + 'current_count': 1, + 'peak_bytes': 16, + 'peak_count': 1, + 'total_bytes': 16, + 'total_count': 1} +``` From a53ce953d80610ef0cb78f6a8f3ba4462f6f75f0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 15 Dec 2022 11:48:43 -0500 Subject: [PATCH 10/15] Add new test_rmm_pytorch.py. Move fixtures to conftest.py. 
--- python/rmm/tests/__init__.py | 0 python/rmm/tests/conftest.py | 21 ++++++++++++ python/rmm/tests/test_rmm.py | 50 ---------------------------- python/rmm/tests/test_rmm_pytorch.py | 37 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 50 deletions(-) create mode 100644 python/rmm/tests/__init__.py create mode 100644 python/rmm/tests/conftest.py create mode 100644 python/rmm/tests/test_rmm_pytorch.py diff --git a/python/rmm/tests/__init__.py b/python/rmm/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/rmm/tests/conftest.py b/python/rmm/tests/conftest.py new file mode 100644 index 000000000..5fad81c79 --- /dev/null +++ b/python/rmm/tests/conftest.py @@ -0,0 +1,21 @@ +import pytest + +import rmm + + +@pytest.fixture(scope="function", autouse=True) +def rmm_auto_reinitialize(): + # Run the test + yield + + # Automatically reinitialize the current memory resource after running each + # test + + rmm.reinitialize() + + +@pytest.fixture +def stats_mr(): + mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) + rmm.mr.set_current_device_resource(mr) + return mr diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index d6cae9a27..93ef89fb6 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -42,17 +42,6 @@ ) -@pytest.fixture(scope="function", autouse=True) -def rmm_auto_reinitialize(): - - # Run the test - yield - - # Automatically reinitialize the current memory resource after running each - # test - rmm.reinitialize() - - def array_tester(dtype, nelem, alloc): # data h_in = np.full(nelem, 3.2, dtype) @@ -604,13 +593,6 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): array_tester("u1", 2 * nelem, alloc) # should trigger release -@pytest.fixture -def stats_mr(): - mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource()) - rmm.mr.set_current_device_resource(mr) - return mr - - def test_statistics_resource_adaptor(stats_mr): buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)] @@ -915,35 +897,3 @@ def test_rmm_device_buffer_copy(cuda_ary, make_copy): result = db_copy.copy_to_host() np.testing.assert_equal(expected, result) - - -@pytest.fixture(scope="session") -def torch_allocator(): - try: - from torch.cuda.memory import change_current_allocator - except ImportError: - pytest.skip("pytorch pluggable allocator not available") - change_current_allocator(rmm.rmm_torch_allocator) - - -def test_rmm_torch_allocator(torch_allocator, stats_mr): - import torch - - assert stats_mr.allocation_counts["current_bytes"] == 0 - x = torch.tensor([1, 2]).cuda() - assert stats_mr.allocation_counts["current_bytes"] > 0 - del x - assert stats_mr.allocation_counts["current_bytes"] == 0 - - -def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): - import torch - - assert stats_mr.allocation_counts["current_bytes"] == 0 - s = torch.cuda.Stream() - with torch.cuda.stream(s): - x = torch.tensor([1, 2]).cuda() - torch.cuda.current_stream().wait_stream(s) - assert stats_mr.allocation_counts["current_bytes"] > 0 - del x - assert stats_mr.allocation_counts["current_bytes"] == 0 diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/tests/test_rmm_pytorch.py new file mode 100644 index 000000000..9471af346 --- /dev/null +++ b/python/rmm/tests/test_rmm_pytorch.py @@ -0,0 +1,37 @@ +import pytest + +import rmm + +torch = pytest.importorskip("torch") + + +@pytest.fixture(scope="session") +def torch_allocator(): + try: + from torch.cuda.memory import change_current_allocator 
+ except ImportError: + pytest.skip("pytorch pluggable allocator not available") + change_current_allocator(rmm.rmm_torch_allocator) + + +def test_rmm_torch_allocator(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + x = torch.tensor([1, 2]).cuda() + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 + + +def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): + import torch + + assert stats_mr.allocation_counts["current_bytes"] == 0 + s = torch.cuda.Stream() + with torch.cuda.stream(s): + x = torch.tensor([1, 2]).cuda() + torch.cuda.current_stream().wait_stream(s) + assert stats_mr.allocation_counts["current_bytes"] > 0 + del x + assert stats_mr.allocation_counts["current_bytes"] == 0 From 741a1df0f8a6a942c6ad8852e27b1ee33cb7c591 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:57:13 -0500 Subject: [PATCH 11/15] Ensure `UpstreamResourceAdaptor` is not cleared by the Python GC (#1170) Closes #1169. Essentially, we are running into the situation described in https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#disabling-cycle-breaking-tp-clear with `UpstreamResourceAdaptor`. The solution is to prevent clearing of `UpstreamResourceAdaptor` objects by decorating them with `no_gc_clear`. Cython calls out the following: > If you use no_gc_clear, it is important that any given reference cycle contains at least one object without no_gc_clear. Otherwise, the cycle cannot be broken, which is a memory leak. The other object in RMM that we mark `@no_gc_clear` is `DeviceBuffer`, and a `DeviceBuffer` can keep a reference to an `UpstreamResourceAdaptor`. But, an `UpstreamResourceAdaptor` cannot keep a reference to a `DeviceBuffer`, so instances of the two cannot form a reference cycle AFAICT. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1170 --- python/rmm/_lib/memory_resource.pyx | 3 +++ python/rmm/tests/test_rmm.py | 29 +++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/python/rmm/_lib/memory_resource.pyx b/python/rmm/_lib/memory_resource.pyx index b2477d554..501cf51cf 100644 --- a/python/rmm/_lib/memory_resource.pyx +++ b/python/rmm/_lib/memory_resource.pyx @@ -16,6 +16,7 @@ import os import warnings from collections import defaultdict +cimport cython from cython.operator cimport dereference as deref from libc.stdint cimport int8_t, int64_t, uintptr_t from libcpp cimport bool @@ -228,6 +229,8 @@ cdef class DeviceMemoryResource: self.c_obj.get().deallocate((ptr), nbytes) +# See the note about `no_gc_clear` in `device_buffer.pyx`. 
+@cython.no_gc_clear cdef class UpstreamResourceAdaptor(DeviceMemoryResource): def __cinit__(self, DeviceMemoryResource upstream_mr, *args, **kwargs): diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 93ef89fb6..f79c60b43 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -725,6 +725,13 @@ def callback(nbytes: int) -> bool: def test_dev_buf_circle_ref_dealloc(): + # This test creates a reference cycle containing a `DeviceBuffer` + # and ensures that the garbage collector does not clear it, i.e., + # that the GC does not remove all references to other Python + # objects from it. The `DeviceBuffer` needs to keep its reference + # to the `DeviceMemoryResource` that was used to create it in + # order to be cleaned up properly. See GH #931. + rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) dbuf1 = rmm.DeviceBuffer(size=1_000_000) @@ -734,17 +741,27 @@ def test_dev_buf_circle_ref_dealloc(): l1.append(l1) # due to the reference cycle, the device buffer doesn't actually get - # cleaned up until later, when we invoke `gc.collect()`: + # cleaned up until after `gc.collect()` is called. del dbuf1, l1 rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) - # by now, the only remaining reference to the *original* memory - # resource should be in `dbuf1`. However, the cyclic garbage collector - # will eliminate that reference when it clears the object via its - # `tp_clear` method. Later, when `tp_dealloc` attemps to actually - # deallocate `dbuf1` (which needs the MR alive), a segfault occurs. + # test that after the call to `gc.collect()`, the `DeviceBuffer` + # is deallocated successfully (i.e., without a segfault). + gc.collect() + + +def test_upstream_mr_circle_ref_dealloc(): + # This test is just like the one above, except it tests that + # instances of `UpstreamResourceAdaptor` (such as + # `PoolMemoryResource`) are not cleared by the GC. 
+ rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) + mr = rmm.mr.PoolMemoryResource(rmm.mr.get_current_device_resource()) + l1 = [mr] + l1.append(l1) + del mr, l1 + rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) gc.collect() From 4534bcaa21796244227ae3868f643f548d612687 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 19 Dec 2022 19:23:25 -0500 Subject: [PATCH 12/15] Delete __init__.py in tests/ --- python/rmm/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/rmm/tests/__init__.py diff --git a/python/rmm/tests/__init__.py b/python/rmm/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 From db5fc5265ce853c195cea7f1451eb9261bb516dd Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 21 Dec 2022 12:55:30 -0500 Subject: [PATCH 13/15] styles --- python/rmm/_lib/torch_allocator.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index f34466b70..3c177760f 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -1,6 +1,4 @@ from cuda.ccudart cimport cudaStream_t -from libc.stdint cimport uintptr_t -from libc.stdio cimport printf from rmm._lib.memory_resource cimport device_memory_resource from rmm._lib.per_device_resource cimport get_current_device_resource From 2b343c2dbea27170d0dd565acc5e5a610c83117b Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 5 Jan 2023 09:17:46 -0500 Subject: [PATCH 14/15] Correctly type stream parameter for allocate/deallocate --- python/rmm/_lib/memory_resource.pxd | 11 ++++++++--- python/rmm/_lib/torch_allocator.pyx | 11 +++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/python/rmm/_lib/memory_resource.pxd b/python/rmm/_lib/memory_resource.pxd index 6f98ed644..5bb3746bc 100644 --- a/python/rmm/_lib/memory_resource.pxd +++ b/python/rmm/_lib/memory_resource.pxd @@ -12,20 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cuda.ccudart cimport cudaStream_t from libc.stdint cimport int8_t from libcpp.memory cimport shared_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from rmm._lib.cuda_stream_view cimport cuda_stream_view + cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass device_memory_resource: void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cudaStream_t stream) except + + void* allocate(size_t bytes, cuda_stream_view stream) except + void deallocate(void* ptr, size_t bytes) except + - void deallocate(void* ptr, size_t bytes, cudaStream_t stream) except + + void deallocate( + void* ptr, + size_t bytes, + cuda_stream_view stream + ) except + cdef class DeviceMemoryResource: cdef shared_ptr[device_memory_resource] c_obj diff --git a/python/rmm/_lib/torch_allocator.pyx b/python/rmm/_lib/torch_allocator.pyx index 3c177760f..12dc9fe11 100644 --- a/python/rmm/_lib/torch_allocator.pyx +++ b/python/rmm/_lib/torch_allocator.pyx @@ -1,5 +1,6 @@ from cuda.ccudart cimport cudaStream_t +from rmm._lib.cuda_stream_view cimport cuda_stream_view from rmm._lib.memory_resource cimport device_memory_resource from rmm._lib.per_device_resource cimport get_current_device_resource @@ -8,10 +9,16 @@ cdef public void* allocate( ssize_t size, int device, void* stream ) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() - return mr[0].allocate(size, stream) + cdef cuda_stream_view stream_view = cuda_stream_view( + (stream) + ) + return mr[0].allocate(size, stream_view) cdef public void deallocate( void* ptr, ssize_t size, void* stream ) except * with gil: cdef device_memory_resource* mr = get_current_device_resource() - mr[0].deallocate(ptr, size, stream) + cdef cuda_stream_view stream_view = cuda_stream_view( + (stream) + ) + mr[0].deallocate(ptr, size, stream_view) From 232fbd089519563bbf151074f4de32d64a129bd6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 5 Jan 2023 10:06:57 -0500 Subject: [PATCH 15/15] Address reviews --- README.md | 9 ++++++--- python/rmm/tests/test_rmm_pytorch.py | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 32ae806c5..a49430150 100644 --- a/README.md +++ b/README.md @@ -755,15 +755,18 @@ You can, of course, use a custom memory resource with PyTorch as well: import rmm import torch +# note that you can configure PyTorch to use RMM either before or +# after changing RMM's memory resource. PyTorch will use whatever +# memory resource is configured to be the "current" memory resource at +# the time of allocation. 
+torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) + # configure RMM to use a managed memory resource, wrapped with a # statistics resource adaptor that can report information about the # amount of memory allocated: mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.ManagedMemoryResource()) rmm.mr.set_current_device_resource(mr) -# configure PyTorch to use RMM for allocations: -torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) - x = torch.tensor([1, 2]).cuda() # the memory resource reports information about PyTorch allocations: diff --git a/python/rmm/tests/test_rmm_pytorch.py b/python/rmm/tests/test_rmm_pytorch.py index 9471af346..eaa40c0ed 100644 --- a/python/rmm/tests/test_rmm_pytorch.py +++ b/python/rmm/tests/test_rmm_pytorch.py @@ -1,3 +1,5 @@ +import gc + import pytest import rmm @@ -15,18 +17,15 @@ def torch_allocator(): def test_rmm_torch_allocator(torch_allocator, stats_mr): - import torch - assert stats_mr.allocation_counts["current_bytes"] == 0 x = torch.tensor([1, 2]).cuda() assert stats_mr.allocation_counts["current_bytes"] > 0 del x + gc.collect() assert stats_mr.allocation_counts["current_bytes"] == 0 def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): - import torch - assert stats_mr.allocation_counts["current_bytes"] == 0 s = torch.cuda.Stream() with torch.cuda.stream(s): @@ -34,4 +33,5 @@ def test_rmm_torch_allocator_using_stream(torch_allocator, stats_mr): torch.cuda.current_stream().wait_stream(s) assert stats_mr.allocation_counts["current_bytes"] > 0 del x + gc.collect() assert stats_mr.allocation_counts["current_bytes"] == 0
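
To exercise the new allocator by hand outside the test suite, a minimal sketch mirroring the tests above (it assumes a fresh Python session, a CUDA-enabled PyTorch with pluggable-allocator support, and an RMM build that includes these patches):

```python
import gc

import rmm
import torch

# Route PyTorch's CUDA allocations through RMM; this must happen before the
# first CUDA tensor is created in this process.
torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

# Wrap the current device resource so allocations can be observed.
stats_mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(stats_mr)

x = torch.tensor([1, 2]).cuda()
print(stats_mr.allocation_counts["current_bytes"])  # > 0: tensor served by RMM

del x
gc.collect()  # as in the tests, collect before checking the counter again
print(stats_mr.allocation_counts["current_bytes"])  # back to 0
```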