Merge pull request #257 from NVIDIA/ksimpson/tcc_memory_resource
Add fallback memory resource for TCC devices
leofang authored Dec 6, 2024
2 parents 1c86afa + 57f7003 commit c6a1a94
Showing 3 changed files with 43 additions and 3 deletions.
14 changes: 12 additions & 2 deletions cuda_core/cuda/core/experimental/_device.py
@@ -7,7 +7,7 @@
 
 from cuda import cuda, cudart
 from cuda.core.experimental._context import Context, ContextOptions
-from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool
+from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
@@ -62,7 +62,17 @@ def __new__(cls, device_id=None):
         for dev_id in range(total):
             dev = super().__new__(cls)
             dev._id = dev_id
-            dev._mr = _DefaultAsyncMempool(dev_id)
+            # If the device is in TCC mode, or does not support memory pools for some other reason,
+            # use _SynchronousMemoryResource, which does not use memory pools.
+            if (
+                handle_return(
+                    cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id)
+                )
+            ) == 1:
+                dev._mr = _DefaultAsyncMempool(dev_id)
+            else:
+                dev._mr = _SynchronousMemoryResource(dev_id)
+
             dev._has_inited = False
             _tls.devices.append(dev)
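For context, the gate above boils down to a single runtime attribute query per device. Below is a minimal standalone sketch of that check (an illustration, not part of the commit), using only the `cuda.cudart` binding already imported in the diff; the explicit error check stands in for the repo's `handle_return` helper:

```python
from cuda import cudart


def supports_memory_pools(dev_id: int) -> bool:
    """Report whether device `dev_id` supports stream-ordered memory pools.

    TCC-mode devices (and some other configurations) report 0 here, which
    is what triggers the _SynchronousMemoryResource fallback in this commit.
    """
    # cuda-python's low-level bindings return (error_code, result) tuples.
    err, value = cudart.cudaDeviceGetAttribute(
        cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id
    )
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"cudaDeviceGetAttribute failed: {err}")
    return value == 1
```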
30 changes: 30 additions & 0 deletions cuda_core/cuda/core/experimental/_memory.py
@@ -293,3 +293,33 @@ def is_host_accessible(self) -> bool:
     @property
     def device_id(self) -> int:
         raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+
+class _SynchronousMemoryResource(MemoryResource):
+    __slots__ = ("_dev_id",)
+
+    def __init__(self, dev_id):
+        self._handle = None
+        self._dev_id = dev_id
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(cuda.cuMemAlloc(size))
+        return Buffer(ptr, size, self)
+
+    def deallocate(self, ptr, size, stream=None):
+        if stream is None:
+            stream = default_stream()
+        stream.sync()
+        handle_return(cuda.cuMemFree(ptr))
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return False
+
+    @property
+    def device_id(self) -> int:
+        return self._dev_id
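A short usage sketch of the fallback path through the public `Device` API (an example, not part of the commit; it assumes a CUDA-capable machine and the `cuda.core` 0.1.1 API shown in this diff — on devices that do support memory pools, allocation goes through the default mempool resource instead, with the same `Buffer` interface):

```python
from cuda.core.experimental import Device

dev = Device(0)
dev.set_current()

# On TCC devices (cudaDevAttrMemoryPoolsSupported == 0), allocations go
# through _SynchronousMemoryResource, i.e. plain cuMemAlloc/cuMemFree.
buf = dev.allocate(1 << 20)  # 1 MiB device buffer
assert buf.is_device_accessible
assert not buf.is_host_accessible

# deallocate() synchronizes the stream before calling cuMemFree, because
# cuMemFree, unlike cuMemFreeAsync, is not stream-ordered.
buf.close()
```

Synchronizing before `cuMemFree` trades the performance of stream-ordered allocation for correctness on devices without mempool support, which is exactly the trade-off named in the release note below.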
2 changes: 1 addition & 1 deletion cuda_core/docs/source/release/0.1.1-notes.md
@@ -5,7 +5,7 @@ Released on Dec XX, 2024
 ## Highlights
 - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
   implementation of DLPack & CUDA Array Interface support.
-
+- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools
 
 ## Limitations
 
