Skip to content

Commit

Permalink
Fallback to use the CUDA primary context (#189)
Browse files Browse the repository at this point in the history
Fixes rapidsai/cudf#13019

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #189
  • Loading branch information
madsbk authored Apr 3, 2023
1 parent 7320b52 commit 5978055
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 67 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ project(
LANGUAGES CXX
)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")

# Write the version header
Expand Down
7 changes: 4 additions & 3 deletions cpp/include/kvikio/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error {
KVIKIO_STRINGIFY(__LINE__) + \
": CUDA_ERROR_STUB_LIBRARY(" \
"The CUDA driver loaded is a stub library)"}; \
} else if (error != CUDA_SUCCESS) { \
const char* err_name; \
const char* err_str; \
} \
if (error != CUDA_SUCCESS) { \
const char* err_name = nullptr; \
const char* err_str = nullptr; \
CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); \
CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); \
if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/kvikio/file_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto task = [this, ctx](void* devPtr_base,
std::size_t size,
std::size_t file_offset,
Expand Down Expand Up @@ -437,7 +437,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto op = [this, ctx](const void* devPtr_base,
std::size_t size,
std::size_t file_offset,
Expand Down
6 changes: 6 additions & 0 deletions cpp/include/kvikio/shim/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class cudaAPI {
decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
decltype(cuGetErrorName)* GetErrorName{nullptr};
decltype(cuGetErrorString)* GetErrorString{nullptr};
decltype(cuDeviceGet)* DeviceGet{nullptr};
decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};

private:
cudaAPI()
Expand All @@ -64,6 +67,9 @@ class cudaAPI {
get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
}

public:
Expand Down
164 changes: 130 additions & 34 deletions cpp/include/kvikio/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <cstring>
#include <future>
#include <iostream>
#include <map>
#include <optional>
#include <tuple>

#include <kvikio/error.hpp>
Expand Down Expand Up @@ -51,39 +53,6 @@ inline constexpr std::size_t page_size = 4096;
return reinterpret_cast<CUdeviceptr>(devPtr);
}

/**
* @brief Get the current cuda context
*
* Previously, we got the cuda context from the provided device pointer by calling
* `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't
* work for stream ordered device memory allocations[1] so we now get the current
* cuda context instead.
* [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
*
* @param check_owning_devPtr If not NULL, a device memory pointer that must have
* been allocated by, mapped by, or registered with the current context. If this
* isn't the case, a CUfileException is thrown.
*
* @return The current cuda context
*/
[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr)
{
if (check_owning_devPtr != nullptr) {
CUdeviceptr current_ctx_devPtr{};
CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr);

CUresult const err = cudaAPI::instance().PointerGetAttribute(
&current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) {
throw CUfileException("The current CUDA context must own the given device memory");
}
}

CUcontext ret{};
CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret));
return ret;
}

/**
* @brief Check if `ptr` points to host memory (as opposed to device memory)
*
Expand Down Expand Up @@ -112,6 +81,133 @@ inline bool is_host_memory(const void* ptr)
return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}

/**
 * @brief Return the device owning the pointer
 *
 * @param dev_ptr Device pointer to query (must point to device memory)
 * @return The device ordinal
 */
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
  // CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL yields the ordinal of the device on
  // which the memory was allocated; CUDA_DRIVER_TRY throws CUfileException on
  // any driver error (e.g. if `dev_ptr` isn't a device allocation).
  int ret = 0;
  CUDA_DRIVER_TRY(
    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
  return ret;
}

/**
 * @brief RAII wrapper for a CUDA primary context
 *
 * Retains the primary context of the given device on construction and releases
 * it on destruction. Copying and moving are disabled so the retain/release
 * driver calls are paired exactly once per instance.
 */
class CudaPrimaryContext {
 public:
  CUdevice dev{};
  CUcontext ctx{};

  /**
   * @brief Retain the primary context of the device
   *
   * @param device_ordinal Device ordinal - an integer between 0 and the number of CUDA devices
   */
  CudaPrimaryContext(int device_ordinal)
  {
    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
  }
  CudaPrimaryContext(const CudaPrimaryContext&) = delete;
  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
  CudaPrimaryContext(CudaPrimaryContext&&) = delete;
  // Fixed: move-assignment must return `CudaPrimaryContext&`, not an rvalue
  // reference (`CudaPrimaryContext&&`), even when it is deleted.
  CudaPrimaryContext& operator=(CudaPrimaryContext&&) = delete;
  ~CudaPrimaryContext()
  {
    // Destructors must never throw: log a failed release and swallow it.
    try {
      CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
    } catch (const CUfileException& e) {
      std::cerr << e.what() << std::endl;
    }
  }
};

/**
 * @brief Given a device ordinal, return the primary context of the device.
 *
 * This function caches the primary contexts retrieved until program exit.
 *
 * NOTE(review): the function-local static cache is not guarded by a mutex —
 * confirm callers never race on the first retrieval for a device, or add
 * locking if concurrent calls are possible.
 *
 * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
 * @return Primary CUDA context
 */
[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
{
  static std::map<int, CudaPrimaryContext> _primary_contexts;
  // `try_emplace` constructs the CudaPrimaryContext only if `ordinal` is absent
  // and returns an iterator to the entry either way, so reuse that iterator
  // instead of performing a second lookup through `at()`.
  return _primary_contexts.try_emplace(ordinal, ordinal).first->second.ctx;
}

/**
 * @brief Return the CUDA context associated with the given device pointer, if any.
 *
 * @param dev_ptr Device pointer to query
 * @return Usable CUDA context, if one was found.
 */
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
  CUcontext associated_ctx{};
  const CUresult status =
    cudaAPI::instance().PointerGetAttribute(&associated_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
  switch (status) {
    case CUDA_SUCCESS:
      if (associated_ctx != nullptr) { return associated_ctx; }
      break;  // a null context counts as "no association"
    case CUDA_ERROR_INVALID_VALUE:
      break;  // the pointer has no associated context
    default:
      CUDA_DRIVER_TRY(status);  // propagate any other driver error
  }
  return std::nullopt;
}

/**
 * @brief Check if the current CUDA context can access the given device pointer
 *
 * @param dev_ptr Device pointer to query
 * @return The boolean answer
 */
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
  // Ask the driver to translate `dev_ptr` into the current context; if the
  // translation round-trips to the same address, the context can access it.
  CUdeviceptr translated_ptr{};
  const CUresult status = cudaAPI::instance().PointerGetAttribute(
    &translated_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) { return false; }
  CUDA_DRIVER_TRY(status);  // any other failure is propagated as an exception
  return translated_ptr == dev_ptr;
}

/**
 * @brief Return a CUDA context that can be used with the given device pointer
 *
 * For robustness, we look for a usable context in the following order:
 * 1) If a context has been associated with `devPtr`, it is returned.
 * 2) If the current context exists and can access `devPtr`, it is returned.
 * 3) Return the primary context of the device that owns `devPtr`. We assume the
 *    primary context can access `devPtr`, which might not be true in the exceptional
 *    disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user
 *    has to set a usable current context before reading/writing using KvikIO.
 *
 * [1] <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html>
 *
 * @param devPtr Device pointer to query
 * @return Usable CUDA context
 */
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

  // First we check if a context has been associated with `devPtr`.
  {
    auto ctx = get_context_associated_pointer(dev_ptr);
    if (ctx.has_value()) { return ctx.value(); }
  }

  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
  // we return the current context.
  {
    CUcontext ctx = nullptr;
    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
  }

  // Finally, if we didn't find any usable context, we return the primary context of the
  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}

/**
* @brief Push CUDA context on creation and pop it on destruction
*/
Expand Down Expand Up @@ -149,7 +245,7 @@ inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* de
if (ctx != nullptr) {
_ctx = *ctx;
} else {
_ctx = get_current_context(devPtr);
_ctx = get_context_from_pointer(devPtr);
}
PushAndPopContext context(_ctx);
CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
Expand Down
96 changes: 68 additions & 28 deletions python/tests/test_basic_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import os
import random
from contextlib import contextmanager

import pytest

Expand Down Expand Up @@ -133,34 +134,6 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end):
assert all(a == b)


@pytest.mark.skipif(
cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path):
"""Test IO from two different GPUs"""
with kvikio.defaults.set_num_threads(10):
with kvikio.defaults.set_task_size(10):
with cupy.cuda.Device(0):
a0 = cupy.arange(200)
with cupy.cuda.Device(1):
a1 = cupy.zeros(200, dtype=a0.dtype)

filename = tmp_path / "test-file"
with kvikio.CuFile(filename, "w") as f:
with cupy.cuda.Device(0):
assert f.write(a0) == a0.nbytes

with kvikio.CuFile(filename, "r") as f:
with pytest.raises(
RuntimeError,
match="The current CUDA context must own the given device memory",
):
f.read(a1)
with cupy.cuda.Device(1):
assert f.read(a1) == a1.nbytes
assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))


@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
def test_raw_read_write(tmp_path, size):
"""Test raw read/write"""
Expand All @@ -184,3 +157,70 @@ def test_raw_read_write_of_host_memory(tmp_path):
with kvikio.CuFile(filename, "r") as f:
with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
assert f.raw_read(a) == a.nbytes


@contextmanager
def with_no_cuda_context():
    """Context that pops all CUDA contexts before the test and pushes them back on after.

    Fixed: the ``yield`` is now wrapped in ``try/finally`` so the popped
    context stack is restored even when the ``with`` body raises (e.g. a
    failing assertion), instead of leaking the altered context state into
    subsequent tests.
    """
    cuda = pytest.importorskip("cuda.cuda")
    assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS

    # Pop every context off the current thread's stack, remembering the order.
    ctx_stack = []
    while True:
        err, ctx = cuda.cuCtxPopCurrent()
        if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT:
            break
        assert err == cuda.CUresult.CUDA_SUCCESS
        ctx_stack.append(ctx)
    try:
        yield
    finally:
        # Restore the stack in reverse pop order, even if the body raised.
        for ctx in reversed(ctx_stack):
            (err,) = cuda.cuCtxPushCurrent(ctx)
            assert err == cuda.CUresult.CUDA_SUCCESS


def test_no_current_cuda_context(tmp_path, xp):
    """Test IO when no CUDA context is current"""
    filename = tmp_path / "test-file"
    a = xp.arange(100)
    b = xp.empty_like(a)

    with kvikio.CuFile(filename, "w+") as f:
        # All CUDA contexts are popped here, so the write/read must fall back
        # to finding a usable context from the buffer pointers themselves.
        with with_no_cuda_context():
            f.write(a)
        f.read(b)
    assert all(a == b)


@pytest.mark.skipif(
    cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path, xp):
    """Test IO from two different GPUs"""
    filename = tmp_path / "test-file"

    with kvikio.defaults.set_num_threads(10), kvikio.defaults.set_task_size(10):
        # Allocate one array on each device.
        with cupy.cuda.Device(0):
            arr_dev0 = xp.arange(200)
        with cupy.cuda.Device(1):
            arr_dev1 = xp.zeros(200, dtype=arr_dev0.dtype)

        # Case 1: the current device matches the allocation.
        with kvikio.CuFile(filename, "w") as f:
            with cupy.cuda.Device(0):
                assert f.write(arr_dev0) == arr_dev0.nbytes
        with kvikio.CuFile(filename, "r") as f:
            with cupy.cuda.Device(1):
                assert f.read(arr_dev1) == arr_dev1.nbytes
        assert bytes(arr_dev0) == bytes(arr_dev1)

        # Case 2: the current device does NOT match the allocation.
        with kvikio.CuFile(filename, "w") as f:
            with cupy.cuda.Device(1):
                assert f.write(arr_dev0) == arr_dev0.nbytes
        with kvikio.CuFile(filename, "r") as f:
            with cupy.cuda.Device(0):
                assert f.read(arr_dev1) == arr_dev1.nbytes
        assert bytes(arr_dev0) == bytes(arr_dev1)

0 comments on commit 5978055

Please sign in to comment.