From 83a6b6501f6d65f5242cddd02711ef01ed4738e4 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 03:31:48 -0700
Subject: [PATCH 01/15] Implement get_context_from_pointer

---
 cpp/include/kvikio/file_handle.hpp |  4 +-
 cpp/include/kvikio/shim/cuda.hpp   |  6 +++
 cpp/include/kvikio/utils.hpp       | 79 ++++++++++++++++++++----------
 3 files changed, 60 insertions(+), 29 deletions(-)
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index c71d52e7e8..716dcfc66c 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -391,7 +391,7 @@ class FileHandle {
       return parallel_io(op, buf, size, file_offset, task_size, 0);
     }
 
-    CUcontext ctx = get_current_context(buf);
+    CUcontext ctx = get_context_from_pointer(buf);
     auto task     = [this, ctx](void* devPtr_base,
                             std::size_t size,
                             std::size_t file_offset,
@@ -437,7 +437,7 @@ class FileHandle {
       return parallel_io(op, buf, size, file_offset, task_size, 0);
     }
 
-    CUcontext ctx = get_current_context(buf);
+    CUcontext ctx = get_context_from_pointer(buf);
     auto op       = [this, ctx](const void* devPtr_base,
                           std::size_t size,
                           std::size_t file_offset,
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index 3ca0d5f3e9..7ea444524d 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -43,6 +43,9 @@ class cudaAPI {
   decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
   decltype(cuGetErrorName)* GetErrorName{nullptr};
   decltype(cuGetErrorString)* GetErrorString{nullptr};
+  decltype(cuDeviceGet)* DeviceGet{nullptr};
+  decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
+  decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};
 
  private:
   cudaAPI()
@@ -64,6 +67,9 @@ class cudaAPI {
     get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
     get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
     get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
+    get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
+    get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
+    get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
   }
 
  public:
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 1b9c81f190..f4eed2f50c 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -51,37 +51,62 @@ inline constexpr std::size_t page_size = 4096;
   return reinterpret_cast<CUdeviceptr>(devPtr);
 }
 
+[[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr)
+{
+  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
+  int ret;
+  CUDA_DRIVER_TRY(
+    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
+  return ret;
+}
+
 /**
- * @brief Get the current cuda context
- *
- * Previously, we got the cuda context from the provided device pointer by calling
- * `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't
- * work for stream ordered device memory allocations[1] so we now get the current
- * cuda context instead.
- * [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
- *
- * @param check_owning_devPtr If not NULL, a device memory pointer that must have
- * been allocated by, mapped by, or registered with the current context. If this
- * isn't the case, a CUfileException is thrown.
- *
- * @return The current cuda context
+ * @brief RAII wrapper for a CUDA primary context
  */
-[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr)
-{
-  if (check_owning_devPtr != nullptr) {
-    CUdeviceptr current_ctx_devPtr{};
-    CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr);
-
-    CUresult const err = cudaAPI::instance().PointerGetAttribute(
-      &current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
-    if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) {
-      throw CUfileException("The current CUDA context must own the given device memory");
+class CudaPrimaryContext {
+ public:
+  CUdevice dev;
+  CUcontext ctx;
+
+  CudaPrimaryContext(int device_ordinal)
+  {
+    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
+    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
+  }
+  CudaPrimaryContext(const CudaPrimaryContext&) = delete;
+  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
+  CudaPrimaryContext(CudaPrimaryContext&&)                 = delete;
+  CudaPrimaryContext&& operator=(CudaPrimaryContext&&) = delete;
+  ~CudaPrimaryContext()
+  {
+    try {
+      CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
+    } catch (const CUfileException& e) {
+      std::cerr << e.what() << std::endl;
     }
   }
+};
 
-  CUcontext ret{};
-  CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret));
-  return ret;
+[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
+{
+  static std::map<int, CudaPrimaryContext> _primary_contexts;
+  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
+
+  // First we try to get the current context and making sure it can access the device pointer.
+  {
+    CUcontext ctx;
+    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
+    if (ctx != nullptr) {
+      CUdeviceptr current_ctx_dev_ptr{};
+      CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
+        &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
+      if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
+    }
+  }
+  // If the current context isn't available, we return the primary context.
+  int ordinal = get_device_ordinal_from_pointer(devPtr);
+  _primary_contexts.try_emplace(ordinal, ordinal);
+  return _primary_contexts.at(ordinal).ctx;
 }
 
 /**
@@ -149,7 +174,7 @@ inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* de
   if (ctx != nullptr) {
     _ctx = *ctx;
   } else {
-    _ctx = get_current_context(devPtr);
+    _ctx = get_context_from_pointer(devPtr);
   }
   PushAndPopContext context(_ctx);
   CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));

From d92ecc85be1110a3cf23cc6719c6729c017fcf91 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 04:15:39 -0700
Subject: [PATCH 02/15] doc

---
 cpp/include/kvikio/utils.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index f4eed2f50c..7ccee1825d 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -19,6 +19,7 @@
 #include <cstring>
 #include <future>
 #include <iostream>
+#include <map>
 #include <tuple>
 
 #include <kvikio/error.hpp>
@@ -51,6 +52,12 @@ inline constexpr std::size_t page_size = 4096;
   return reinterpret_cast<CUdeviceptr>(devPtr);
 }
 
+/**
+ * @brief Return the device owning the pointer
+ *
+ * @param ptr Device pointer to query
+ * @return The device ordinal
+ */
 [[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr)
 {
   CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
@@ -87,6 +94,12 @@ class CudaPrimaryContext {
   }
 };
 
+/**
+ * @brief Return a CUDA that can be used with the given device pointer
+ *
+ * @param devPtr Device pointer to query
+ * @return Usable CUDA context
+ */
 [[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
 {
   static std::map<int, CudaPrimaryContext> _primary_contexts;

From 5e122bd21f159ced633883a06d8b28cab50ef6b5 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 04:39:34 -0700
Subject: [PATCH 03/15] get_context_from_pointer(): also checks
 CU_POINTER_ATTRIBUTE_CONTEXT

---
 cpp/include/kvikio/utils.hpp | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 7ccee1825d..29b8c734de 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -105,18 +105,27 @@ class CudaPrimaryContext {
   static std::map<int, CudaPrimaryContext> _primary_contexts;
   CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
 
-  // First we try to get the current context and making sure it can access the device pointer.
-  {
-    CUcontext ctx;
+  // First we check if a context has been associated with `devPtr`
+  CUcontext ctx;
+  const CUresult err =
+    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
+  if (err != CUDA_ERROR_INVALID_VALUE) {
+    // If this isn't the case, we grab the current context
     CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    if (ctx != nullptr) {
-      CUdeviceptr current_ctx_dev_ptr{};
-      CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
-        &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
-      if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
-    }
+  } else {
+    CUDA_DRIVER_TRY(err);  // Check for errors from previous asynchronous launches
   }
-  // If the current context isn't available, we return the primary context.
+
+  // If we found a context, we return it if it can access the `devPtr`.
+  if (ctx != nullptr) {
+    CUdeviceptr current_ctx_dev_ptr{};
+    CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
+      &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
+    if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
+  }
+
+  // If we didn't find any usable context, we return the primary context of the device owning
+  // `devPtr`.
   int ordinal = get_device_ordinal_from_pointer(devPtr);
   _primary_contexts.try_emplace(ordinal, ordinal);
   return _primary_contexts.at(ordinal).ctx;

From 09b67d0dd9e6816b8e09bec0e55aca1e13311656 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 04:41:19 -0700
Subject: [PATCH 04/15] doc

---
 cpp/include/kvikio/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 29b8c734de..91ea2e1f68 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -116,7 +116,7 @@ class CudaPrimaryContext {
     CUDA_DRIVER_TRY(err);  // Check for errors from previous asynchronous launches
   }
 
-  // If we found a context, we return it if it can access the `devPtr`.
+  // If we found a context and it can access `devPtr`, we return it.
   if (ctx != nullptr) {
     CUdeviceptr current_ctx_dev_ptr{};
     CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(

From 073360f61bb74b47c353984a6f3cc09602defe01 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 04:43:41 -0700
Subject: [PATCH 05/15] doc

---
 cpp/include/kvikio/utils.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 91ea2e1f68..da9a03f6b4 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -105,7 +105,9 @@ class CudaPrimaryContext {
   static std::map<int, CudaPrimaryContext> _primary_contexts;
   CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
 
-  // First we check if a context has been associated with `devPtr`
+  // First we check if a context has been associated with `devPtr`.
+  // Notice, this is not the case for stream ordered device memory allocations.
+  // See <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
   CUcontext ctx;
   const CUresult err =
     cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);

From ed409a24e689a4741482b80260daf81ecf29f6fe Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 16:37:10 +0200
Subject: [PATCH 06/15] doc

Co-authored-by: Lawrence Mitchell <wence@gmx.li>
---
 cpp/include/kvikio/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index da9a03f6b4..4b6a14abc0 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -95,7 +95,7 @@ class CudaPrimaryContext {
 };
 
 /**
- * @brief Return a CUDA that can be used with the given device pointer
+ * @brief Return a CUDA context that can be used with the given device pointer
  *
  * @param devPtr Device pointer to query
  * @return Usable CUDA context

From b21371f695713c4e9a9cc8033ff9506f71f7ce7c Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 07:38:44 -0700
Subject: [PATCH 07/15] fixing get_context_from_pointer checks

---
 cpp/include/kvikio/utils.hpp | 40 ++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index da9a03f6b4..663790529f 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -108,26 +108,30 @@ class CudaPrimaryContext {
   // First we check if a context has been associated with `devPtr`.
   // Notice, this is not the case for stream ordered device memory allocations.
   // See <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
-  CUcontext ctx;
-  const CUresult err =
-    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
-  if (err != CUDA_ERROR_INVALID_VALUE) {
-    // If this isn't the case, we grab the current context
-    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-  } else {
-    CUDA_DRIVER_TRY(err);  // Check for errors from previous asynchronous launches
+  {
+    CUcontext ctx;
+    const CUresult err =
+      cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
+    if (err != CUDA_ERROR_INVALID_VALUE) {
+      CUDA_DRIVER_TRY(err);  // Check for other errors
+      return ctx;
+    }
   }
-
-  // If we found a context and it can access `devPtr`, we return it.
-  if (ctx != nullptr) {
-    CUdeviceptr current_ctx_dev_ptr{};
-    CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
-      &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
-    if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
+  // If this isn't the case, we check the current context. If it exist and can access `devPtr`,
+  // we return it.
+  {
+    CUcontext ctx;
+    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
+    if (ctx != nullptr) {
+      CUdeviceptr current_ctx_dev_ptr{};
+      CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
+        &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
+      if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
+    }
   }
-
-  // If we didn't find any usable context, we return the primary context of the device owning
-  // `devPtr`.
+  // Finally, if we didn't find any usable context, we return the primary context of the
+  // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary
+  // context of each device.
   int ordinal = get_device_ordinal_from_pointer(devPtr);
   _primary_contexts.try_emplace(ordinal, ordinal);
   return _primary_contexts.at(ordinal).ctx;

From 2e4b8ad2c8219bdec61d3f18322354d446a1aa70 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 07:42:07 -0700
Subject: [PATCH 08/15] clean up

---
 cpp/include/kvikio/utils.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index a8edf2ae8c..1516c3d4d8 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -58,9 +58,8 @@ inline constexpr std::size_t page_size = 4096;
  * @param ptr Device pointer to query
  * @return The device ordinal
  */
-[[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr)
+[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
 {
-  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
   int ret;
   CUDA_DRIVER_TRY(
     cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
@@ -132,7 +131,7 @@ class CudaPrimaryContext {
   // Finally, if we didn't find any usable context, we return the primary context of the
   // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary
   // context of each device.
-  int ordinal = get_device_ordinal_from_pointer(devPtr);
+  int ordinal = get_device_ordinal_from_pointer(dev_ptr);
   _primary_contexts.try_emplace(ordinal, ordinal);
   return _primary_contexts.at(ordinal).ctx;
 }

From 5a585833e6077d6f62daf052d6ac83ec18be5054 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 18:07:12 +0200
Subject: [PATCH 09/15] typo

Co-authored-by: Lawrence Mitchell <wence@gmx.li>
---
 cpp/include/kvikio/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 1516c3d4d8..7747722ecd 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -116,7 +116,7 @@ class CudaPrimaryContext {
       return ctx;
     }
   }
-  // If this isn't the case, we check the current context. If it exist and can access `devPtr`,
+  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
   // we return it.
   {
     CUcontext ctx;

From 1720ffbc6fb2d66207dee9861b2b5b1080778132 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 09:23:47 -0700
Subject: [PATCH 10/15] CU_POINTER_ATTRIBUTE_CONTEXT returns ctx == null

---
 cpp/include/kvikio/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 1516c3d4d8..127e897ab9 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -111,7 +111,7 @@ class CudaPrimaryContext {
     CUcontext ctx;
     const CUresult err =
       cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
-    if (err != CUDA_ERROR_INVALID_VALUE) {
+    if (err != CUDA_ERROR_INVALID_VALUE && ctx != nullptr) {
       CUDA_DRIVER_TRY(err);  // Check for other errors
       return ctx;
     }

From 07e608547e5bf1adf284231a07d7046768c91410 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Wed, 29 Mar 2023 18:32:09 +0200
Subject: [PATCH 11/15] Update cpp/include/kvikio/utils.hpp

Co-authored-by: Lawrence Mitchell <wence@gmx.li>
---
 cpp/include/kvikio/utils.hpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index e446995014..1660733462 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -111,9 +111,12 @@ class CudaPrimaryContext {
     CUcontext ctx;
     const CUresult err =
       cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
-    if (err != CUDA_ERROR_INVALID_VALUE && ctx != nullptr) {
-      CUDA_DRIVER_TRY(err);  // Check for other errors
-      return ctx;
+    if (err == CUDA_SUCCESS && ctx != nullptr) {
+        return ctx;
+    } else if (err != CUDA_ERROR_INVALID) {
+        CUDA_DRIVER_TRY(err);
+    }
+    // either CUDA_ERROR_INVALID, or SUCCESS, but stream-ordered allocation
     }
   }
   // If this isn't the case, we check the current context. If it exists and can access `devPtr`,

From 1600827b669ae9371c115608555bcaf65c9e19ca Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 Mar 2023 12:29:15 +0200
Subject: [PATCH 12/15] rework

---
 cpp/CMakeLists.txt            |   1 +
 cpp/include/kvikio/error.hpp  |   7 +-
 cpp/include/kvikio/utils.hpp  | 154 ++++++++++++++++++++++------------
 python/tests/test_basic_io.py |  89 ++++++++++++++------
 4 files changed, 168 insertions(+), 83 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cd583c8323..0141806347 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -26,6 +26,7 @@ project(
   LANGUAGES CXX
 )
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")
 
 # Write the version header
diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp
index 07c09a73ab..e8e0e00641 100644
--- a/cpp/include/kvikio/error.hpp
+++ b/cpp/include/kvikio/error.hpp
@@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error {
                              KVIKIO_STRINGIFY(__LINE__) +                                      \
                              ": CUDA_ERROR_STUB_LIBRARY("                                      \
                              "The CUDA driver loaded is a stub library)"};                     \
-    } else if (error != CUDA_SUCCESS) {                                                        \
-      const char* err_name;                                                                    \
-      const char* err_str;                                                                     \
+    }                                                                                          \
+    if (error != CUDA_SUCCESS) {                                                               \
+      const char* err_name     = nullptr;                                                      \
+      const char* err_str      = nullptr;                                                      \
       CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name);           \
       CUresult err_str_status  = cudaAPI::instance().GetErrorString(error, &err_str);          \
       if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; }               \
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 1660733462..a271e3555a 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -20,6 +20,7 @@
 #include <future>
 #include <iostream>
 #include <map>
+#include <optional>
 #include <tuple>
 
 #include <kvikio/error.hpp>
@@ -52,6 +53,34 @@ inline constexpr std::size_t page_size = 4096;
   return reinterpret_cast<CUdeviceptr>(devPtr);
 }
 
+/**
+ * @brief Check if `ptr` points to host memory (as opposed to device memory)
+ *
+ * In this context, managed memory counts as device memory
+ *
+ * @param ptr Memory pointer to query
+ * @return The boolean answer
+ */
+inline bool is_host_memory(const void* ptr)
+{
+  CUpointer_attribute attrs[1] = {
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+  };
+  CUmemorytype memtype{};
+  void* data[1] = {&memtype};
+  CUresult result =
+    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
+
+  // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED
+  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
+  CUDA_DRIVER_TRY(result);
+
+  // Notice, querying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
+  // is unregistered host memory. This is undocumented but how the Runtime CUDA API
+  // does it to support `cudaMemoryTypeUnregistered`.
+  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
+}
+
 /**
  * @brief Return the device owning the pointer
  *
@@ -60,7 +89,7 @@ inline constexpr std::size_t page_size = 4096;
  */
 [[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
 {
-  int ret;
+  int ret = 0;
   CUDA_DRIVER_TRY(
     cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
   return ret;
@@ -71,8 +100,8 @@ inline constexpr std::size_t page_size = 4096;
  */
 class CudaPrimaryContext {
  public:
-  CUdevice dev;
-  CUcontext ctx;
+  CUdevice dev{};
+  CUcontext ctx{};
 
   CudaPrimaryContext(int device_ordinal)
   {
@@ -93,78 +122,93 @@ class CudaPrimaryContext {
   }
 };
 
+/**
+ * @brief Given a device ordinal, return the primary context of the device.
+ *
+ * This function cache the primary contexts retrieved until program exit
+ *
+ * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
+ * @return Primary CUDA context
+ */
+[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
+{
+  static std::map<int, CudaPrimaryContext> _primary_contexts;
+  _primary_contexts.try_emplace(ordinal, ordinal);
+  return _primary_contexts.at(ordinal).ctx;
+}
+
+/**
+ * @brief Return the CUDA context associated the given device pointer, if any.
+ *
+ * @param dev_ptr Device pointer to query
+ * @return Usable CUDA context, if one were found.
+ */
+[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
+{
+  CUcontext ctx = nullptr;
+  const CUresult err =
+    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
+  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return {};
+}
+
+/**
+ * @brief Check if the current CUDA context can access the given device pointer
+ *
+ * @param dev_ptr Device pointer to query
+ * @return The boolean answer
+ */
+[[nodiscard]] inline bool can_current_context_access_pointer(CUdeviceptr dev_ptr)
+{
+  CUdeviceptr current_ctx_dev_ptr{};
+  const CUresult err = cudaAPI::instance().PointerGetAttribute(
+    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
+  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return false;
+}
+
 /**
  * @brief Return a CUDA context that can be used with the given device pointer
  *
+ * For robustness, we look for an usabale context in the following order:
+ *   1) If a context has been associated with `devPtr`, it is returned
+ *   2) If the current context exist and can access `devPtr`, it is returned.
+ *   3) Return the primary context of the device that owns `devPtr`. We assume the
+ *      primary context can access `devPtr`.
  * @param devPtr Device pointer to query
  * @return Usable CUDA context
  */
 [[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
 {
-  static std::map<int, CudaPrimaryContext> _primary_contexts;
   CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
 
   // First we check if a context has been associated with `devPtr`.
-  // Notice, this is not the case for stream ordered device memory allocations.
-  // See <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
   {
-    CUcontext ctx;
-    const CUresult err =
-      cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
-    if (err == CUDA_SUCCESS && ctx != nullptr) {
-        return ctx;
-    } else if (err != CUDA_ERROR_INVALID) {
-        CUDA_DRIVER_TRY(err);
-    }
-    // either CUDA_ERROR_INVALID, or SUCCESS, but stream-ordered allocation
+    auto ctx = get_context_associated_pointer(dev_ptr);
+    if (ctx.has_value()) {
+      std::cout << "get_context_from_pointer() - context_associated" << std::endl;
+      return ctx.value();
     }
   }
-  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
-  // we return it.
+
+  // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we
+  // return the current context.
   {
-    CUcontext ctx;
+    CUcontext ctx = nullptr;
     CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    if (ctx != nullptr) {
-      CUdeviceptr current_ctx_dev_ptr{};
-      CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute(
-        &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr));
-      if (current_ctx_dev_ptr != dev_ptr) { return ctx; }
+    if (ctx != nullptr && can_current_context_access_pointer(dev_ptr)) {
+      std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl;
+      return ctx;
     }
   }
-  // Finally, if we didn't find any usable context, we return the primary context of the
-  // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary
-  // context of each device.
-  int ordinal = get_device_ordinal_from_pointer(dev_ptr);
-  _primary_contexts.try_emplace(ordinal, ordinal);
-  return _primary_contexts.at(ordinal).ctx;
-}
-
-/**
- * @brief Check if `ptr` points to host memory (as opposed to device memory)
- *
- * In this context, managed memory counts as device memory
- *
- * @param ptr Memory pointer to query
- * @return The boolean answer
- */
-inline bool is_host_memory(const void* ptr)
-{
-  CUpointer_attribute attrs[1] = {
-    CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-  };
-  CUmemorytype memtype{};
-  void* data[1] = {&memtype};
-  CUresult result =
-    cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr));
 
-  // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED
-  if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; }
-  CUDA_DRIVER_TRY(result);
+  std::cout << "get_context_from_pointer() - get_primary_cuda_context" << std::endl;
 
-  // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory
-  // is unregistered host memory. This is undocumented but how the Runtime CUDA API
-  // does it to support `cudaMemoryTypeUnregistered`.
-  return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
+  // Finally, if we didn't find any usable context, we return the primary context of the
+  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
+  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
 }
 
 /**
diff --git a/python/tests/test_basic_io.py b/python/tests/test_basic_io.py
index dd5b98fa0f..208226d992 100644
--- a/python/tests/test_basic_io.py
+++ b/python/tests/test_basic_io.py
@@ -3,6 +3,7 @@
 
 import os
 import random
+from contextlib import contextmanager
 
 import pytest
 
@@ -133,6 +134,69 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end):
             assert all(a == b)
 
 
+@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
+def test_raw_read_write(tmp_path, size):
+    """Test raw read/write"""
+    filename = tmp_path / "test-file"
+
+    a = cupy.arange(size)
+    with kvikio.CuFile(filename, "w") as f:
+        assert f.raw_write(a) == a.nbytes
+    with kvikio.CuFile(filename, "r") as f:
+        assert f.raw_read(a) == a.nbytes
+
+
+def test_raw_read_write_of_host_memory(tmp_path):
+    """Test raw read/write of host memory, which isn't supported"""
+    filename = tmp_path / "test-file"
+
+    a = numpy.arange(1024)
+    with kvikio.CuFile(filename, "w") as f:
+        with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
+            f.raw_write(a)
+    with kvikio.CuFile(filename, "r") as f:
+        with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
+            assert f.raw_read(a) == a.nbytes
+
+
+@contextmanager
+def with_no_cuda_context():
+    """Context that pops all CUDA contexts before the test and pushes them back on after"""
+    cuda = pytest.importorskip("cuda.cuda")
+    assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS
+
+    ctx_stack = []
+    while True:
+        err, ctx = cuda.cuCtxPopCurrent()
+        if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT:
+            break
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        ctx_stack.append(ctx)
+    yield
+    for ctx in reversed(ctx_stack):
+        (err,) = cuda.cuCtxPushCurrent(ctx)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+
+def test_no_current_cuda_context(tmp_path):
+    filename = tmp_path / "test-file"
+    ary = cupy.arange(100)
+
+    with with_no_cuda_context():
+        with kvikio.CuFile(filename, "w") as f:
+            f.write(ary)
+
+    with cupy.cuda.using_allocator(cupy.cuda.malloc_async):
+        ary = cupy.arange(100)
+
+    with with_no_cuda_context():
+        with kvikio.CuFile(filename, "w") as f:
+            f.write(ary)
+
+    with kvikio.CuFile(filename, "w") as f:
+        f.write(ary)
+
+
 @pytest.mark.skipif(
     cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
 )
@@ -159,28 +223,3 @@ def test_multiple_gpus(tmp_path):
                 with cupy.cuda.Device(1):
                     assert f.read(a1) == a1.nbytes
             assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))
-
-
-@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
-def test_raw_read_write(tmp_path, size):
-    """Test raw read/write"""
-    filename = tmp_path / "test-file"
-
-    a = cupy.arange(size)
-    with kvikio.CuFile(filename, "w") as f:
-        assert f.raw_write(a) == a.nbytes
-    with kvikio.CuFile(filename, "r") as f:
-        assert f.raw_read(a) == a.nbytes
-
-
-def test_raw_read_write_of_host_memory(tmp_path):
-    """Test raw read/write of host memory, which isn't supported"""
-    filename = tmp_path / "test-file"
-
-    a = numpy.arange(1024)
-    with kvikio.CuFile(filename, "w") as f:
-        with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
-            f.raw_write(a)
-    with kvikio.CuFile(filename, "r") as f:
-        with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
-            assert f.raw_read(a) == a.nbytes

From fd4847a8f66cb05100b342991025fb02e7dec809 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 Mar 2023 13:43:23 +0200
Subject: [PATCH 13/15] Apply suggestions from code review

Co-authored-by: Lawrence Mitchell <wence@gmx.li>
---
 cpp/include/kvikio/utils.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index a271e3555a..77461a888c 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -125,7 +125,7 @@ class CudaPrimaryContext {
 /**
  * @brief Given a device ordinal, return the primary context of the device.
  *
- * This function cache the primary contexts retrieved until program exit
+ * This function caches the primary contexts retrieved until program exit
  *
  * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
  * @return Primary CUDA context
@@ -159,7 +159,7 @@ class CudaPrimaryContext {
  * @param dev_ptr Device pointer to query
  * @return The boolean answer
  */
-[[nodiscard]] inline bool can_current_context_access_pointer(CUdeviceptr dev_ptr)
+[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
 {
   CUdeviceptr current_ctx_dev_ptr{};
   const CUresult err = cudaAPI::instance().PointerGetAttribute(
@@ -174,7 +174,7 @@ class CudaPrimaryContext {
  *
  * For robustness, we look for an usabale context in the following order:
  *   1) If a context has been associated with `devPtr`, it is returned
- *   2) If the current context exist and can access `devPtr`, it is returned.
+ *   2) If the current context exists and can access `devPtr`, it is returned.
  *   3) Return the primary context of the device that owns `devPtr`. We assume the
  *      primary context can access `devPtr`.
  * @param devPtr Device pointer to query

From 39d50f2386fb1709231c2b4435ff9f97e0249487 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 Mar 2023 14:01:44 +0200
Subject: [PATCH 14/15] clean up

---
 cpp/include/kvikio/utils.hpp  | 11 ++++++--
 python/tests/test_basic_io.py | 53 ++++++++++++++++++-----------------
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 77461a888c..8e3a8e5e7b 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -173,10 +173,15 @@ class CudaPrimaryContext {
  * @brief Return a CUDA context that can be used with the given device pointer
  *
  * For robustness, we look for an usabale context in the following order:
- *   1) If a context has been associated with `devPtr`, it is returned
+ *   1) If a context has been associated with `devPtr`, it is returned.
  *   2) If the current context exists and can access `devPtr`, it is returned.
  *   3) Return the primary context of the device that owns `devPtr`. We assume the
- *      primary context can access `devPtr`.
+ *      primary context can access `devPtr`, which might not be true in the exceptional
+ *      disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user
+ *      has to set a usable current context before reading/writing using KvikIO.
+ *
+ * [1] <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html>
+ *
  * @param devPtr Device pointer to query
  * @return Usable CUDA context
  */
@@ -198,7 +203,7 @@ class CudaPrimaryContext {
   {
     CUcontext ctx = nullptr;
     CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    if (ctx != nullptr && can_current_context_access_pointer(dev_ptr)) {
+    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) {
       std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl;
       return ctx;
     }
diff --git a/python/tests/test_basic_io.py b/python/tests/test_basic_io.py
index 208226d992..c12bf395e4 100644
--- a/python/tests/test_basic_io.py
+++ b/python/tests/test_basic_io.py
@@ -178,48 +178,49 @@ def with_no_cuda_context():
         assert err == cuda.CUresult.CUDA_SUCCESS
 
 
-def test_no_current_cuda_context(tmp_path):
+def test_no_current_cuda_context(tmp_path, xp):
+    """Test IO when no CUDA context is current"""
     filename = tmp_path / "test-file"
-    ary = cupy.arange(100)
-
-    with with_no_cuda_context():
-        with kvikio.CuFile(filename, "w") as f:
-            f.write(ary)
-
-    with cupy.cuda.using_allocator(cupy.cuda.malloc_async):
-        ary = cupy.arange(100)
-
-    with with_no_cuda_context():
-        with kvikio.CuFile(filename, "w") as f:
-            f.write(ary)
+    a = xp.arange(100)
+    b = xp.empty_like(a)
 
-    with kvikio.CuFile(filename, "w") as f:
-        f.write(ary)
+    with kvikio.CuFile(filename, "w+") as f:
+        with with_no_cuda_context():
+            f.write(a)
+        f.read(b)
+    assert all(a == b)
 
 
 @pytest.mark.skipif(
     cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
 )
-def test_multiple_gpus(tmp_path):
+def test_multiple_gpus(tmp_path, xp):
     """Test IO from two different GPUs"""
+    filename = tmp_path / "test-file"
+
     with kvikio.defaults.set_num_threads(10):
         with kvikio.defaults.set_task_size(10):
+
+            # Allocate an array on each device
             with cupy.cuda.Device(0):
-                a0 = cupy.arange(200)
+                a0 = xp.arange(200)
             with cupy.cuda.Device(1):
-                a1 = cupy.zeros(200, dtype=a0.dtype)
+                a1 = xp.zeros(200, dtype=a0.dtype)
 
-            filename = tmp_path / "test-file"
+            # Test when the device matches the allocation
             with kvikio.CuFile(filename, "w") as f:
                 with cupy.cuda.Device(0):
                     assert f.write(a0) == a0.nbytes
-
             with kvikio.CuFile(filename, "r") as f:
-                with pytest.raises(
-                    RuntimeError,
-                    match="The current CUDA context must own the given device memory",
-                ):
-                    f.read(a1)
                 with cupy.cuda.Device(1):
                     assert f.read(a1) == a1.nbytes
-            assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))
+            assert bytes(a0) == bytes(a1)
+
+            # Test when the device doesn't match the allocation
+            with kvikio.CuFile(filename, "w") as f:
+                with cupy.cuda.Device(1):
+                    assert f.write(a0) == a0.nbytes
+            with kvikio.CuFile(filename, "r") as f:
+                with cupy.cuda.Device(0):
+                    assert f.read(a1) == a1.nbytes
+            assert bytes(a0) == bytes(a1)

From 0aff7f5d632ba46d3096e2d69d650aacfb54a007 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Thu, 30 Mar 2023 14:20:51 +0200
Subject: [PATCH 15/15] removed debug info

---
 cpp/include/kvikio/utils.hpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 8e3a8e5e7b..9362678820 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -192,10 +192,7 @@ class CudaPrimaryContext {
   // First we check if a context has been associated with `devPtr`.
   {
     auto ctx = get_context_associated_pointer(dev_ptr);
-    if (ctx.has_value()) {
-      std::cout << "get_context_from_pointer() - context_associated" << std::endl;
-      return ctx.value();
-    }
+    if (ctx.has_value()) { return ctx.value(); }
   }
 
   // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we
@@ -203,14 +200,9 @@ class CudaPrimaryContext {
   {
     CUcontext ctx = nullptr;
     CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
-    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) {
-      std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl;
-      return ctx;
-    }
+    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
   }
 
-  std::cout << "get_context_from_pointer() - get_primary_cuda_context" << std::endl;
-
   // Finally, if we didn't find any usable context, we return the primary context of the
   // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
   return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));