From 319a372b75b0530e7f4600bbdc34197db2bf420c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:17:00 -0800 Subject: [PATCH 1/9] merge with main for ruff --- cuda_core/cuda/core/experimental/_device.py | 12 +++++-- cuda_core/cuda/core/experimental/_memory.py | 37 ++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0c03c789..a5cd4bc7 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -62,7 +62,13 @@ def __new__(cls, device_id=None): for dev_id in range(total): dev = super().__new__(cls) dev._id = dev_id - dev._mr = _DefaultAsyncMempool(dev_id) + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the AsyncMemoryResource which does not use memory pools. 
+ if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: + dev._mr = _AsyncMemoryResource(dev_id) + else: + dev._mr = _DefaultAsyncMempool(dev_id) + dev._has_inited = False _tls.devices.append(dev) @@ -70,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 415b5151..50f8a260 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,7 +42,11 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ("_ptr", "_size", "_mr") + __slots__ = ( + "_ptr", + "_size", + "_mr", + ) def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr @@ -286,3 +290,34 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class _AsyncMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, dev_id): + self._handle = None + self._dev_id = dev_id + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + stream = default_stream() + ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self._dev_id From 
19e3a4f4b54a4b9562742d8575ad1f8ca7e6e0a7 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:04 -0800 Subject: [PATCH 2/9] fix tuple reformat --- cuda_core/cuda/core/experimental/_memory.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 50f8a260..26a9dd82 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -42,11 +42,7 @@ class Buffer: """ # TODO: handle ownership? (_mr could be None) - __slots__ = ( - "_ptr", - "_size", - "_mr", - ) + __slots__ = ("_ptr", "_size", "_mr") def __init__(self, ptr, size, mr: MemoryResource = None): self._ptr = ptr From 5e84da7cf888214ba940176a28089467f2afb055 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 29 Nov 2024 11:18:45 -0800 Subject: [PATCH 3/9] fix tuple reformat --- cuda_core/cuda/core/experimental/_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a5cd4bc7..a15eef36 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -76,7 +76,7 @@ def __new__(cls, device_id=None): def _check_context_initialized(self, *args, **kwargs): if not self._has_inited: - raise CUDAError("the device is not yet initialized, " "perhaps you forgot to call .set_current() first?") + raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?") @property def device_id(self) -> int: From 122d25c01f4b8bbc02239bb1c2e58005c4bdb506 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 2 Dec 2024 09:25:39 -0800 Subject: [PATCH 4/9] switch to sync alloc and free --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- cuda_core/cuda/core/experimental/_memory.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index a15eef36..889c20a0 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _AsyncMemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -63,9 +63,9 @@ def __new__(cls, device_id=None): dev = super().__new__(cls) dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, - # use the AsyncMemoryResource which does not use memory pools. + # use the SynchronousMemoryResource which does not use memory pools. 
if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: - dev._mr = _AsyncMemoryResource(dev_id) + dev._mr = _SynchronousMemoryResource(dev_id) else: dev._mr = _DefaultAsyncMempool(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 26a9dd82..16dd97d7 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -288,7 +288,7 @@ def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") -class _AsyncMemoryResource(MemoryResource): +class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) def __init__(self, dev_id): @@ -298,13 +298,13 @@ def __init__(self, dev_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: stream = default_stream() - ptr = handle_return(cuda.cuMemAllocAsync(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): if stream is None: stream = default_stream() - handle_return(cuda.cuMemFreeAsync(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr, stream._handle)) @property def is_device_accessible(self) -> bool: From 27ec6d3dabd76238da0974c945c65b7c81ae7c22 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:26:01 -0800 Subject: [PATCH 5/9] add release notes --- cuda_core/docs/source/release.md | 1 + cuda_core/docs/source/release/0.1.1-notes.md | 0 2 files changed, 1 insertion(+) create mode 100644 cuda_core/docs/source/release/0.1.1-notes.md diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md index 48e24786..55090b0b 100644 --- a/cuda_core/docs/source/release.md +++ b/cuda_core/docs/source/release.md @@ -5,5 +5,6 @@ maxdepth: 3 --- + 0.1.1 0.1.0 ``` diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md new file mode 100644 
index 00000000..e69de29b From 42c4b45241f2c5f08ca96c7560788fec769ef1c0 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 13:28:58 -0800 Subject: [PATCH 6/9] make true the default path --- cuda_core/cuda/core/experimental/_device.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 889c20a0..88676cf6 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,10 +64,10 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 0: - dev._mr = _SynchronousMemoryResource(dev_id) - else: + if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: dev._mr = _DefaultAsyncMempool(dev_id) + else: + dev._mr = _SynchronousMemoryResource(dev_id) dev._has_inited = False _tls.devices.append(dev) From 64b1f22e9fae282739c6cf9aaf4005a0f289914b Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 3 Dec 2024 15:38:41 -0800 Subject: [PATCH 7/9] minor rewording --- cuda_core/docs/source/release/0.1.1-notes.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index e69de29b..d80e6ef4 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -0,0 +1,7 @@ +# `cuda.core` Release notes + +Released on Dec X, 2024 + +## Highlights +- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools + From 1d80ca70f1ef4d0435d4aa49ee97d9ec8b254588 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Wed, 4 Dec 2024 08:48:38 -0800 Subject: [PATCH 8/9] fix some known issues before
colossus test --- cuda_core/cuda/core/experimental/_device.py | 6 +++++- cuda_core/cuda/core/experimental/_memory.py | 8 ++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 88676cf6..db5f57cf 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -64,7 +64,11 @@ def __new__(cls, device_id=None): dev._id = dev_id # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - if (handle_return(cudart.cudaGetDeviceProperties(dev_id))).memoryPoolsSupported == 1: + if ( + handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id) + ) + ) == 1: dev._mr = _DefaultAsyncMempool(dev_id) else: dev._mr = _SynchronousMemoryResource(dev_id) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index ac6a78fe..5ff00ba2 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -303,15 +303,11 @@ def __init__(self, dev_id): self._dev_id = dev_id def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - ptr = handle_return(cuda.cuMemAlloc(size, stream._handle)) + ptr = handle_return(cuda.cuMemAlloc(size)) return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): - if stream is None: - stream = default_stream() - handle_return(cuda.cuMemFree(ptr, stream._handle)) + handle_return(cuda.cuMemFree(ptr)) @property def is_device_accessible(self) -> bool: From b6d73c8ef4efb9ceb97c52f73cc4fb0a60d910c6 Mon Sep 17 00:00:00 2001 From: Keenan Simpson Date: Fri, 6 Dec 2024 11:13:14 -0800 Subject: [PATCH 9/9] Update cuda_core/cuda/core/experimental/_memory.py Co-authored-by: Leo Fang --- cuda_core/cuda/core/experimental/_memory.py |
3 +++ 1 file changed, 3 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 5ff00ba2..12fafb39 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -307,6 +307,9 @@ def allocate(self, size, stream=None) -> Buffer: return Buffer(ptr, size, self) def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + stream.sync() handle_return(cuda.cuMemFree(ptr)) @property