Fallback to use the CUDA primary context #189

Merged 18 commits on Apr 3, 2023
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -26,6 +26,7 @@ project(
LANGUAGES CXX
)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")

# Write the version header
7 changes: 4 additions & 3 deletions cpp/include/kvikio/error.hpp
@@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error {
KVIKIO_STRINGIFY(__LINE__) + \
": CUDA_ERROR_STUB_LIBRARY(" \
"The CUDA driver loaded is a stub library)"}; \
} else if (error != CUDA_SUCCESS) { \
const char* err_name; \
const char* err_str; \
} \
if (error != CUDA_SUCCESS) { \
const char* err_name = nullptr; \
const char* err_str = nullptr; \
CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); \
CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); \
if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \
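The reshaped branch above also zero-initializes the error-string pointers before querying the driver. A distilled sketch of that defensive pattern as a plain function rather than the macro (not the macro itself; the lookup-failure check is generalized from the `CUDA_ERROR_INVALID_VALUE` case the macro handles):

#include <cuda.h>
#include <string>

// Build "NAME(DESCRIPTION)" for a driver error, tolerating lookup failure.
std::string describe(CUresult error)
{
  const char* err_name = nullptr;
  const char* err_str  = nullptr;
  if (cuGetErrorName(error, &err_name) != CUDA_SUCCESS) { err_name = "unknown"; }
  if (cuGetErrorString(error, &err_str) != CUDA_SUCCESS) { err_str = "unknown"; }
  return std::string{err_name} + "(" + err_str + ")";
}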
4 changes: 2 additions & 2 deletions cpp/include/kvikio/file_handle.hpp
@@ -391,7 +391,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto task = [this, ctx](void* devPtr_base,
std::size_t size,
std::size_t file_offset,
@@ -437,7 +437,7 @@ class FileHandle {
return parallel_io(op, buf, size, file_offset, task_size, 0);
}

CUcontext ctx = get_current_context(buf);
CUcontext ctx = get_context_from_pointer(buf);
auto op = [this, ctx](const void* devPtr_base,
std::size_t size,
std::size_t file_offset,
6 changes: 6 additions & 0 deletions cpp/include/kvikio/shim/cuda.hpp
@@ -43,6 +43,9 @@ class cudaAPI {
decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
decltype(cuGetErrorName)* GetErrorName{nullptr};
decltype(cuGetErrorString)* GetErrorString{nullptr};
decltype(cuDeviceGet)* DeviceGet{nullptr};
decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};

private:
cudaAPI()
@@ -64,6 +67,9 @@
get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
}

public:
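For context on the shim pattern above: each `decltype(...)` member is a raw function pointer into the CUDA driver library, resolved at runtime so KvikIO can run without linking against the driver. A minimal sketch of what a `get_symbol` helper like the one used here could look like, assuming a dlopen-based loader (the real helper lives in KvikIO's shim utilities and may differ):

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Resolve `name` from an already dlopen'ed library handle and store it in `fn`.
template <typename T>
void get_symbol(T& fn, void* lib, const std::string& name)
{
  ::dlerror();  // clear any stale error state
  fn = reinterpret_cast<T>(::dlsym(lib, name.c_str()));
  if (const char* err = ::dlerror()) {
    throw std::runtime_error("cannot load " + name + ": " + err);
  }
}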
164 changes: 130 additions & 34 deletions cpp/include/kvikio/utils.hpp
@@ -19,6 +19,8 @@
#include <cstring>
#include <future>
#include <iostream>
#include <map>
#include <optional>
#include <tuple>

#include <kvikio/error.hpp>
@@ -51,39 +53,6 @@ inline constexpr std::size_t page_size = 4096;
return reinterpret_cast<CUdeviceptr>(devPtr);
}

/**
* @brief Get the current cuda context
*
* Previously, we got the cuda context from the provided device pointer by calling
* `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't
* work for stream ordered device memory allocations[1] so we now get the current
* cuda context instead.
* [1] <https://docs.nvidia.com/cuda/cuda-c-programming-guide/#pointer-attributes>
*
* @param check_owning_devPtr If not NULL, a device memory pointer that must have
* been allocated by, mapped by, or registered with the current context. If this
* isn't the case, a CUfileException is thrown.
*
* @return The current cuda context
*/
[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr)
{
if (check_owning_devPtr != nullptr) {
CUdeviceptr current_ctx_devPtr{};
CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr);

CUresult const err = cudaAPI::instance().PointerGetAttribute(
&current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) {
throw CUfileException("The current CUDA context must own the given device memory");
}
}

CUcontext ret{};
CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret));
return ret;
}

/**
* @brief Check if `ptr` points to host memory (as opposed to device memory)
*
@@ -112,6 +81,133 @@ inline bool is_host_memory(const void* ptr)
return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
}

/**
* @brief Return the device owning the pointer
*
* @param dev_ptr Device pointer to query
* @return The device ordinal
*/
[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
{
int ret = 0;
CUDA_DRIVER_TRY(
cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
return ret;
}

/**
* @brief RAII wrapper for a CUDA primary context
*/
class CudaPrimaryContext {
public:
CUdevice dev{};
CUcontext ctx{};

CudaPrimaryContext(int device_ordinal)
{
CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
}
CudaPrimaryContext(const CudaPrimaryContext&) = delete;
CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
CudaPrimaryContext(CudaPrimaryContext&&) = delete;
CudaPrimaryContext& operator=(CudaPrimaryContext&&) = delete;
~CudaPrimaryContext()
{
try {
CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
} catch (const CUfileException& e) {
std::cerr << e.what() << std::endl;
}
}
};
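The wrapper ties the primary context's reference count to object lifetime. In raw driver calls, what it manages is roughly the following (a standalone sketch, not KvikIO code; error checking omitted and `cuInit` assumed to have been called):

#include <cuda.h>

void primary_context_lifetime_sketch()
{
  CUdevice dev{};
  CUcontext ctx{};
  cuDeviceGet(&dev, 0);                 // device ordinal 0
  cuDevicePrimaryCtxRetain(&ctx, dev);  // increments the primary context refcount
  // ... push `ctx` or make it current, issue driver API calls ...
  cuDevicePrimaryCtxRelease(dev);       // decrements; destroyed when it reaches zero
}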

/**
* @brief Given a device ordinal, return the primary context of the device.
*
* This function caches the retrieved primary contexts until program exit.
*
* @param ordinal Device ordinal - an integer in the range [0, number of CUDA devices)
* @return Primary CUDA context
*/
[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
{
static std::map<int, CudaPrimaryContext> _primary_contexts;
_primary_contexts.try_emplace(ordinal, ordinal);
return _primary_contexts.at(ordinal).ctx;
}
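Note why `try_emplace` suffices for the cache: it constructs the `CudaPrimaryContext` (and thus retains the primary context) only when `ordinal` is not already a key, so repeated calls never re-retain. A standalone C++17 illustration of that construct-once behavior (not KvikIO code):

#include <iostream>
#include <map>

struct Noisy {
  explicit Noisy(int v) { std::cout << "constructed " << v << '\n'; }
};

int main()
{
  std::map<int, Noisy> cache;
  cache.try_emplace(0, 0);  // key absent: constructs Noisy(0) in place
  cache.try_emplace(0, 0);  // key present: no construction at all
  return 0;
}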

/**
* @brief Return the CUDA context associated with the given device pointer, if any.
*
* @param dev_ptr Device pointer to query
* @return Usable CUDA context, if one was found.
*/
[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
{
CUcontext ctx = nullptr;
const CUresult err =
cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
return {};
}

/**
* @brief Check if the current CUDA context can access the given device pointer
*
* @param dev_ptr Device pointer to query
* @return Whether the current context can access `dev_ptr`
*/
[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
{
CUdeviceptr current_ctx_dev_ptr{};
const CUresult err = cudaAPI::instance().PointerGetAttribute(
&current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
return false;
}

/**
* @brief Return a CUDA context that can be used with the given device pointer
*
* For robustness, we look for a usable context in the following order:
* 1) If a context has been associated with `devPtr`, it is returned.
* 2) If the current context exists and can access `devPtr`, it is returned.
* 3) Return the primary context of the device that owns `devPtr`. We assume the
* primary context can access `devPtr`, which might not be true in the exceptional
* disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user
* has to set a usable current context before reading/writing using KvikIO.
*
* [1] <https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html>
*
* @param devPtr Device pointer to query
* @return Usable CUDA context
*/
[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
{
CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);

// First we check if a context has been associated with `devPtr`.
{
auto ctx = get_context_associated_pointer(dev_ptr);
if (ctx.has_value()) { return ctx.value(); }
}

// If this isn't the case, we check the current context. If it exists and can access `devPtr`, we
// return the current context.
{
CUcontext ctx = nullptr;
CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
}

// Finally, if we didn't find any usable context, we return the primary context of the
// device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
}
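A short usage sketch of the fallback above, assuming the surrounding `kvikio` namespace, the `<kvikio/utils.hpp>` include path, and the `PushAndPopContext` guard defined just below in this file:

#include <kvikio/utils.hpp>

// Make a context usable with `devPtr` current, then issue driver calls.
void with_usable_context(const void* devPtr)
{
  CUcontext ctx = kvikio::get_context_from_pointer(devPtr);
  kvikio::PushAndPopContext guard(ctx);  // current for this scope only
  // ... e.g. cuMemGetAddressRange() on `devPtr` ...
}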

/**
* @brief Push CUDA context on creation and pop it on destruction
*/
@@ -149,7 +245,7 @@ inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* devPtr
if (ctx != nullptr) {
_ctx = *ctx;
} else {
_ctx = get_current_context(devPtr);
_ctx = get_context_from_pointer(devPtr);
}
PushAndPopContext context(_ctx);
CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
96 changes: 68 additions & 28 deletions python/tests/test_basic_io.py
@@ -3,6 +3,7 @@

import os
import random
from contextlib import contextmanager

import pytest

@@ -133,34 +134,6 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end):
assert all(a == b)


@pytest.mark.skipif(
cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path):
"""Test IO from two different GPUs"""
with kvikio.defaults.set_num_threads(10):
with kvikio.defaults.set_task_size(10):
with cupy.cuda.Device(0):
a0 = cupy.arange(200)
with cupy.cuda.Device(1):
a1 = cupy.zeros(200, dtype=a0.dtype)

filename = tmp_path / "test-file"
with kvikio.CuFile(filename, "w") as f:
with cupy.cuda.Device(0):
assert f.write(a0) == a0.nbytes

with kvikio.CuFile(filename, "r") as f:
with pytest.raises(
RuntimeError,
match="The current CUDA context must own the given device memory",
):
f.read(a1)
with cupy.cuda.Device(1):
assert f.read(a1) == a1.nbytes
assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))


@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
def test_raw_read_write(tmp_path, size):
"""Test raw read/write"""
@@ -184,3 +157,70 @@ def test_raw_read_write_of_host_memory(tmp_path):
with kvikio.CuFile(filename, "r") as f:
with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
assert f.raw_read(a) == a.nbytes


@contextmanager
def with_no_cuda_context():
"""Context that pop all CUDA contexts before the test and push them back on after"""
cuda = pytest.importorskip("cuda.cuda")
assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS

ctx_stack = []
while True:
err, ctx = cuda.cuCtxPopCurrent()
if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT:
break
assert err == cuda.CUresult.CUDA_SUCCESS
ctx_stack.append(ctx)
yield
for ctx in reversed(ctx_stack):
(err,) = cuda.cuCtxPushCurrent(ctx)
assert err == cuda.CUresult.CUDA_SUCCESS


def test_no_current_cuda_context(tmp_path, xp):
"""Test IO when CUDA context is current"""
filename = tmp_path / "test-file"
a = xp.arange(100)
b = xp.empty_like(a)

with kvikio.CuFile(filename, "w+") as f:
with with_no_cuda_context():
f.write(a)
f.read(b)
assert all(a == b)


@pytest.mark.skipif(
cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
)
def test_multiple_gpus(tmp_path, xp):
"""Test IO from two different GPUs"""
filename = tmp_path / "test-file"

with kvikio.defaults.set_num_threads(10):
with kvikio.defaults.set_task_size(10):

# Allocate an array on each device
with cupy.cuda.Device(0):
a0 = xp.arange(200)
with cupy.cuda.Device(1):
a1 = xp.zeros(200, dtype=a0.dtype)

# Test when the device matches the allocation
with kvikio.CuFile(filename, "w") as f:
with cupy.cuda.Device(0):
assert f.write(a0) == a0.nbytes
with kvikio.CuFile(filename, "r") as f:
with cupy.cuda.Device(1):
assert f.read(a1) == a1.nbytes
assert bytes(a0) == bytes(a1)

# Test when the device doesn't match the allocation
with kvikio.CuFile(filename, "w") as f:
with cupy.cuda.Device(1):
assert f.write(a0) == a0.nbytes
with kvikio.CuFile(filename, "r") as f:
with cupy.cuda.Device(0):
assert f.read(a1) == a1.nbytes
assert bytes(a0) == bytes(a1)