From c09e947abbc0b92c479dfe4f1b8cdce80efdde9e Mon Sep 17 00:00:00 2001
From: Akshay Venkatesh
Date: Thu, 3 Oct 2024 21:02:58 +0000
Subject: [PATCH] opal/cuda: Handle stream-ordered allocations and assign
 primary device context

Signed-off-by: Akshay Venkatesh
(cherry picked from commit 5328616e8ece2d3c3de201df1a9dd1ebe9f1299b)
---
 opal/mca/accelerator/cuda/accelerator_cuda.c | 115 +++++++++++++++++--
 1 file changed, 107 insertions(+), 8 deletions(-)

diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
index 1c7d76d1bbf..89056e09299 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -159,12 +159,96 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
     return dev_id;
 }
 
+static int accelerator_cuda_check_mpool(CUdeviceptr dbuf, CUmemorytype *mem_type,
+                                        int *dev_id)
+{
+#if OPAL_CUDA_VMM_SUPPORT
+    static int device_count = -1;
+    static int mpool_supported = -1;
+    CUresult result;
+    CUmemoryPool mpool;
+    CUmemAccess_flags flags;
+    CUmemLocation location;
+
+    if (mpool_supported <= 0) {
+        if (mpool_supported == -1) {
+            if (device_count == -1) {
+                result = cuDeviceGetCount(&device_count);
+                if (result != CUDA_SUCCESS || (0 == device_count)) {
+                    mpool_supported = 0; /* never check again */
+                    device_count = 0;
+                    return 0;
+                }
+            }
+
+            /* assume uniformity of devices */
+            result = cuDeviceGetAttribute(&mpool_supported,
+                                          CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, 0);
+            if (result != CUDA_SUCCESS) {
+                mpool_supported = 0;
+            }
+        }
+        if (0 == mpool_supported) {
+            return 0;
+        }
+    }
+
+    result = cuPointerGetAttribute(&mpool, CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE,
+                                   dbuf);
+    if (CUDA_SUCCESS != result) {
+        return 0;
+    }
+
+    /* check if device has access */
+    for (int i = 0; i < device_count; i++) {
+        location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        location.id = i;
+        result = cuMemPoolGetAccess(&flags, mpool, &location);
+        if ((CUDA_SUCCESS == result) &&
+            (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) {
+            *mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = i;
+            return 1;
+        }
+    }
+
+    /* host must have access as device access possibility is exhausted */
+    *mem_type = CU_MEMORYTYPE_HOST;
+    *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    return 0;
+#endif
+
+    return 0;
+}
+
+static int accelerator_cuda_get_primary_context(CUdevice dev_id, CUcontext *pctx)
+{
+    CUresult result;
+    unsigned int flags;
+    int active;
+
+    result = cuDevicePrimaryCtxGetState(dev_id, &flags, &active);
+    if (CUDA_SUCCESS != result) {
+        return OPAL_ERROR;
+    }
+
+    if (active) {
+        result = cuDevicePrimaryCtxRetain(pctx, dev_id);
+        return OPAL_SUCCESS;
+    }
+
+    return OPAL_ERROR;
+}
+
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
 {
     CUresult result;
     int is_vmm = 0;
+    int is_mpool_ptr = 0;
     int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    int mpool_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
     CUmemorytype vmm_mem_type = 0;
+    CUmemorytype mpool_mem_type = 0;
     CUmemorytype mem_type = 0;
     CUdeviceptr dbuf = (CUdeviceptr) addr;
     CUcontext ctx = NULL, mem_ctx = NULL;
@@ -177,6 +261,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     *flags = 0;
 
     is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
+    is_mpool_ptr = accelerator_cuda_check_mpool(dbuf, &mpool_mem_type, &mpool_dev_id);
 
 #if OPAL_CUDA_GET_ATTRIBUTES
     uint32_t is_managed = 0;
@@ -210,6 +295,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
         if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
             mem_type = CU_MEMORYTYPE_DEVICE;
             *dev_id = vmm_dev_id;
+        } else if (is_mpool_ptr && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = mpool_dev_id;
         } else {
             /* Host memory, nothing to do here */
             return 0;
@@ -220,6 +308,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else {
         if (is_vmm) {
             *dev_id = vmm_dev_id;
+        } else if (is_mpool_ptr) {
+            *dev_id = mpool_dev_id;
         } else {
             /* query the device from the context */
             *dev_id = accelerator_cuda_get_device_id(mem_ctx);
@@ -238,6 +328,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
         if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
             mem_type = CU_MEMORYTYPE_DEVICE;
             *dev_id = vmm_dev_id;
+        } else if (is_mpool_ptr && (mpool_mem_type == CU_MEMORYTYPE_DEVICE)) {
+            mem_type = CU_MEMORYTYPE_DEVICE;
+            *dev_id = mpool_dev_id;
         } else {
             /* Host memory, nothing to do here */
             return 0;
@@ -245,6 +338,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else {
         if (is_vmm) {
             *dev_id = vmm_dev_id;
+        } else if (is_mpool_ptr) {
+            *dev_id = mpool_dev_id;
         } else {
             result = cuPointerGetAttribute(&mem_ctx,
                                            CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
@@ -278,14 +373,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
         return OPAL_ERROR;
     }
 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
-    if (is_vmm) {
-        /* This function is expected to set context if pointer is device
-         * accessible but VMM allocations have NULL context associated
-         * which cannot be set against the calling thread */
-        opal_output(0,
-                    "CUDA: unable to set context with the given pointer"
-                    "ptr=%p aborting...", addr);
-        return OPAL_ERROR;
+    if (is_vmm || is_mpool_ptr) {
+        if (OPAL_SUCCESS ==
+            accelerator_cuda_get_primary_context(
+                is_vmm ? vmm_dev_id : mpool_dev_id, &mem_ctx)) {
+            /* As VMM/mempool allocations have no context associated
+             * with them, check if device primary context can be set */
+        } else {
+            opal_output(0,
+                        "CUDA: unable to set ctx with the given pointer"
+                        "ptr=%p aborting...", addr);
+            return OPAL_ERROR;
+        }
     }
 
     result = cuCtxSetCurrent(mem_ctx);