Skip to content

Commit

Permalink
Fallback to use the CUDA primary context (#189)
Browse files Browse the repository at this point in the history
Fixes rapidsai/cudf#13019

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #189
  • Loading branch information
madsbk authored Apr 3, 2023
1 parent 7320b52 commit 5978055
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 67 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ project(
LANGUAGES CXX
)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")

# Write the version header
Expand Down
7 changes: 4 additions & 3 deletions cpp/include/kvikio/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error {
KVIKIO_STRINGIFY(__LINE__) + \
": CUDA_ERROR_STUB_LIBRARY(" \
"The CUDA driver loaded is a stub library)"}; \
} else if (error != CUDA_SUCCESS) { \
const char* err_name; \
const char* err_str; \
} \
if (error != CUDA_SUCCESS) { \
const char* err_name = nullptr; \
const char* err_str = nullptr; \
CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); \
CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); \
if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/kvikio/file_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto task = [this, ctx](void* devPtr_base,
std::size_t size,
std::size_t file_offset,
Expand Down Expand Up @@ -437,7 +437,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto op = [this, ctx](const void* devPtr_base,
std::size_t size,
std::size_t file_offset,
Expand Down
6 changes: 6 additions & 0 deletions cpp/include/kvikio/shim/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class cudaAPI {
decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
decltype(cuGetErrorName)* GetErrorName{nullptr};
decltype(cuGetErrorString)* GetErrorString{nullptr};
decltype(cuDeviceGet)* DeviceGet{nullptr};
decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};

private:
cudaAPI()
Expand All @@ -64,6 +67,9 @@ class cudaAPI {
get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
}

public:
Expand Down
164 changes: 130 additions & 34 deletions cpp/include/kvikio/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <cstring>
#include <future>
#include <iostream>
#include <map>
#include <optional>
#include <tuple>

#include <kvikio/error.hpp>
Expand Down Expand Up @@ -51,39 +53,6 @@ inline constexpr std::size_t page_size = 4096;
return reinterpret_cast<CUdeviceptr>(devPtr);
}

/**
* @brief Get the current cuda context
*
* Previously, we got the cuda context from the provided device pointer by calling
* `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't
* work for stream ordered device memory allocations[1] so we now get the current
* cuda context instead.
* [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
*
* @param check_owning_devPtr If not NULL, a device memory pointer that must have
* been allocated by, mapped by, or registered with the current context. If this
* isn't the case, a CUfileException is thrown.
*
* @return The current cuda context
*/
[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr)
{
if (check_owning_devPtr != nullptr) {
CUdeviceptr current_ctx_devPtr{};
CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr);

CUresult const err = cudaAPI::instance().PointerGetAttribute(
&current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) {
throw CUfileException("The current CUDA context must own the given device memory");
}
}

CUcontext ret{};
CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret));
return ret;
}

/**
* @brief Check if `ptr` points to host memory (as opposed to device memory)
*
Expand Down Expand Up @@ -112,6 +81,133 @@ inline bool is_host_memory(const void* ptr)
return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}

/**
 * @brief Return the device owning the pointer
 *
 * @param dev_ptr Device pointer to query (must point to device memory)
 * @return The device ordinal
 */
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
  // CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL yields the ordinal of the device on
  // which the memory was allocated; CUDA_DRIVER_TRY throws CUfileException on
  // any driver error (e.g. if `dev_ptr` isn't a device allocation).
  int ret = 0;
  CUDA_DRIVER_TRY(
    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
  return ret;
}

/**
 * @brief RAII wrapper for a CUDA primary context
 *
 * Retains the primary context of the given device on construction and releases
 * it on destruction. Copying and moving are disabled so the retain/release
 * driver calls are paired exactly once per instance.
 */
class CudaPrimaryContext {
 public:
  CUdevice dev{};
  CUcontext ctx{};

  /**
   * @brief Retain the primary context of the device
   *
   * @param device_ordinal Device ordinal - an integer between 0 and the number of CUDA devices
   */
  CudaPrimaryContext(int device_ordinal)
  {
    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
  }
  CudaPrimaryContext(const CudaPrimaryContext&) = delete;
  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
  CudaPrimaryContext(CudaPrimaryContext&&) = delete;
  // Fixed: move-assignment must return `CudaPrimaryContext&`, not an rvalue
  // reference (`CudaPrimaryContext&&`), even when it is deleted.
  CudaPrimaryContext& operator=(CudaPrimaryContext&&) = delete;
  ~CudaPrimaryContext()
  {
    // Destructors must never throw: log a failed release and swallow it.
    try {
      CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
    } catch (const CUfileException& e) {
      std::cerr << e.what() << std::endl;
    }
  }
};

/**
 * @brief Given a device ordinal, return the primary context of the device.
 *
 * This function caches the primary contexts retrieved until program exit.
 *
 * NOTE(review): the function-local static cache is not guarded by a mutex —
 * confirm callers never race on the first retrieval for a device, or add
 * locking if concurrent calls are possible.
 *
 * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
 * @return Primary CUDA context
 */
[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
{
  static std::map<int, CudaPrimaryContext> _primary_contexts;
  // `try_emplace` constructs the CudaPrimaryContext only if `ordinal` is absent
  // and returns an iterator to the entry either way, so reuse that iterator
  // instead of performing a second lookup through `at()`.
  return _primary_contexts.try_emplace(ordinal, ordinal).first->second.ctx;
}

/**
 * @brief Return the CUDA context associated with the given device pointer, if any.
 *
 * @param dev_ptr Device pointer to query
 * @return Usable CUDA context, if one was found.
 */
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
  CUcontext associated_ctx{};
  const CUresult status =
    cudaAPI::instance().PointerGetAttribute(&associated_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
  switch (status) {
    case CUDA_SUCCESS:
      if (associated_ctx != nullptr) { return associated_ctx; }
      break;  // a null context counts as "no association"
    case CUDA_ERROR_INVALID_VALUE:
      break;  // the pointer has no associated context
    default:
      CUDA_DRIVER_TRY(status);  // propagate any other driver error
  }
  return std::nullopt;
}

/**
 * @brief Check if the current CUDA context can access the given device pointer
 *
 * @param dev_ptr Device pointer to query
 * @return The boolean answer
 */
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
  // Ask the driver to translate `dev_ptr` into the current context; if the
  // translation round-trips to the same address, the context can access it.
  CUdeviceptr translated_ptr{};
  const CUresult status = cudaAPI::instance().PointerGetAttribute(
    &translated_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) { return false; }
  CUDA_DRIVER_TRY(status);  // any other failure is propagated as an exception
  return translated_ptr == dev_ptr;
}

/**
 * @brief Return a CUDA context that can be used with the given device pointer
 *
 * For robustness, we look for a usable context in the following order:
 * 1) If a context has been associated with `devPtr`, it is returned.
 * 2) If the current context exists and can access `devPtr`, it is returned.
 * 3) Return the primary context of the device that owns `devPtr`. We assume the
 *    primary context can access `devPtr`, which might not be true in the exceptional
 *    disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user
 *    has to set a usable current context before reading/writing using KvikIO.
 *
 * [1] <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html>
 *
 * @param devPtr Device pointer to query
 * @return Usable CUDA context
 */
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

  // First we check if a context has been associated with `devPtr`.
  {
    auto ctx = get_context_associated_pointer(dev_ptr);
    if (ctx.has_value()) { return ctx.value(); }
  }

  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
  // we return the current context.
  {
    CUcontext ctx = nullptr;
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
  }

  // Finally, if we didn't find any usable context, we return the primary context of the
  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}

/**
* @brief Push CUDA context on creation and pop it on destruction
*/
Expand Down Expand Up @@ -149,7 +245,7 @@ inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* de
if (ctx != nullptr) {
_ctx = *ctx;
} else {
_ctx = get_current_context(devPtr);
_ctx = get_context_from_pointer(devPtr);
}
PushAndPopContext context(_ctx);
CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
Expand Down
96 changes: 68 additions & 28 deletions python/tests/test_basic_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import os
import random
from contextlib import contextmanager

import pytest

Expand Down Expand Up @@ -133,34 +134,6 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end):
assert all(a == b)


@pytest.mark.skipif(
cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path):
"""Test IO from two different GPUs"""
with kvikio.defaults.set_num_threads(10):
with kvikio.defaults.set_task_size(10):
with cupy.cuda.Device(0):
a0 = cupy.arange(200)
with cupy.cuda.Device(1):
a1 = cupy.zeros(200, dtype=a0.dtype)

filename = tmp_path / "test-file"
with kvikio.CuFile(filename, "w") as f:
with cupy.cuda.Device(0):
assert f.write(a0) == a0.nbytes

with kvikio.CuFile(filename, "r") as f:
with pytest.raises(
RuntimeError,
match="The current CUDA context must own the given device memory",
):
f.read(a1)
with cupy.cuda.Device(1):
assert f.read(a1) == a1.nbytes
assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))


@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
def test_raw_read_write(tmp_path, size):
"""Test raw read/write"""
Expand All @@ -184,3 +157,70 @@ def test_raw_read_write_of_host_memory(tmp_path):
with kvikio.CuFile(filename, "r") as f:
with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
assert f.raw_read(a) == a.nbytes


@contextmanager
def with_no_cuda_context():
    """Context that pops all CUDA contexts before the test and pushes them back on after.

    Fixed: the ``yield`` is now wrapped in ``try/finally`` so the popped
    context stack is restored even when the ``with`` body raises (e.g. a
    failing assertion), instead of leaking the altered context state into
    subsequent tests.
    """
    cuda = pytest.importorskip("cuda.cuda")
    assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS

    # Pop every context off the current thread's stack, remembering the order.
    ctx_stack = []
    while True:
        err, ctx = cuda.cuCtxPopCurrent()
        if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT:
            break
        assert err == cuda.CUresult.CUDA_SUCCESS
        ctx_stack.append(ctx)
    try:
        yield
    finally:
        # Restore the stack in reverse pop order, even if the body raised.
        for ctx in reversed(ctx_stack):
            (err,) = cuda.cuCtxPushCurrent(ctx)
            assert err == cuda.CUresult.CUDA_SUCCESS


def test_no_current_cuda_context(tmp_path, xp):
    """Test IO when no CUDA context is current"""
    filename = tmp_path / "test-file"
    a = xp.arange(100)
    b = xp.empty_like(a)

    with kvikio.CuFile(filename, "w+") as f:
        # All CUDA contexts are popped here, so the write/read must fall back
        # to finding a usable context from the buffer pointers themselves.
        with with_no_cuda_context():
            f.write(a)
        f.read(b)
    assert all(a == b)


@pytest.mark.skipif(
    cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path, xp):
    """Test IO from two different GPUs"""
    filename = tmp_path / "test-file"

    with kvikio.defaults.set_num_threads(10), kvikio.defaults.set_task_size(10):
        # Allocate one array on each device.
        with cupy.cuda.Device(0):
            arr_dev0 = xp.arange(200)
        with cupy.cuda.Device(1):
            arr_dev1 = xp.zeros(200, dtype=arr_dev0.dtype)

        # Case 1: the current device matches the allocation.
        with kvikio.CuFile(filename, "w") as f:
            with cupy.cuda.Device(0):
                assert f.write(arr_dev0) == arr_dev0.nbytes
        with kvikio.CuFile(filename, "r") as f:
            with cupy.cuda.Device(1):
                assert f.read(arr_dev1) == arr_dev1.nbytes
        assert bytes(arr_dev0) == bytes(arr_dev1)

        # Case 2: the current device does NOT match the allocation.
        with kvikio.CuFile(filename, "w") as f:
            with cupy.cuda.Device(1):
                assert f.write(arr_dev0) == arr_dev0.nbytes
        with kvikio.CuFile(filename, "r") as f:
            with cupy.cuda.Device(0):
                assert f.read(arr_dev1) == arr_dev1.nbytes
        assert bytes(arr_dev0) == bytes(arr_dev1)

0 comments on commit 5978055

Please sign in to comment.