diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cd583c8323..0141806347 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -26,6 +26,7 @@ project(
   LANGUAGES CXX
 )
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/")
 
 # Write the version header
diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp
index 07c09a73ab..e8e0e00641 100644
--- a/cpp/include/kvikio/error.hpp
+++ b/cpp/include/kvikio/error.hpp
@@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error {
                              KVIKIO_STRINGIFY(__LINE__) +                                   \
                              ": CUDA_ERROR_STUB_LIBRARY("                                   \
                              "The CUDA driver loaded is a stub library)"};                  \
-    } else if (error != CUDA_SUCCESS) {                                                     \
-      const char* err_name;                                                                 \
-      const char* err_str;                                                                  \
+    }                                                                                       \
+    if (error != CUDA_SUCCESS) {                                                            \
+      const char* err_name = nullptr;                                                       \
+      const char* err_str = nullptr;                                                        \
       CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name);        \
       CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str);        \
       if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; }            \
diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp
index c71d52e7e8..716dcfc66c 100644
--- a/cpp/include/kvikio/file_handle.hpp
+++ b/cpp/include/kvikio/file_handle.hpp
@@ -391,7 +391,7 @@ class FileHandle {
       return parallel_io(op, buf, size, file_offset, task_size, 0);
     }
 
-    CUcontext ctx = get_current_context(buf);
+    CUcontext ctx = get_context_from_pointer(buf);
     auto task = [this, ctx](void* devPtr_base,
                             std::size_t size,
                             std::size_t file_offset,
@@ -437,7 +437,7 @@ class FileHandle {
       return parallel_io(op, buf, size, file_offset, task_size, 0);
     }
 
-    CUcontext ctx = get_current_context(buf);
+    CUcontext ctx = get_context_from_pointer(buf);
     auto op = [this, ctx](const void* devPtr_base,
                           std::size_t size,
                           std::size_t file_offset,
diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp
index 3ca0d5f3e9..7ea444524d 100644
--- a/cpp/include/kvikio/shim/cuda.hpp
+++ b/cpp/include/kvikio/shim/cuda.hpp
@@ -43,6 +43,9 @@ class cudaAPI {
   decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr};
   decltype(cuGetErrorName)* GetErrorName{nullptr};
   decltype(cuGetErrorString)* GetErrorString{nullptr};
+  decltype(cuDeviceGet)* DeviceGet{nullptr};
+  decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr};
+  decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr};
 
  private:
   cudaAPI()
@@ -64,6 +67,9 @@ class cudaAPI {
     get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange));
     get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName));
     get_symbol(GetErrorString, lib, KVIKIO_STRINGIFY(cuGetErrorString));
+    get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet));
+    get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain));
+    get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease));
   }
 
  public:
diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp
index 1b9c81f190..9362678820 100644
--- a/cpp/include/kvikio/utils.hpp
+++ b/cpp/include/kvikio/utils.hpp
@@ -19,6 +19,8 @@
 #include
 #include
 #include
+#include <map>
+#include <optional>
 #include
 #include
 
@@ -51,39 +53,6 @@ inline constexpr std::size_t page_size = 4096;
   return reinterpret_cast<CUdeviceptr>(devPtr);
 }
 
-/**
- * @brief Get the current cuda context
- *
- * Previously, we got the cuda context from the provided device pointer by calling
- * `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't
- * work for stream ordered device memory allocations[1] so we now get the current
- * cuda context instead.
- * [1]
- *
- * @param check_owning_devPtr If not NULL, a device memory pointer that must have
- * been allocated by, mapped by, or registered with the current context. If this
- * isn't the case, a CUfileException is thrown.
- *
- * @return The current cuda context
- */
-[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr)
-{
-  if (check_owning_devPtr != nullptr) {
-    CUdeviceptr current_ctx_devPtr{};
-    CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr);
-
-    CUresult const err = cudaAPI::instance().PointerGetAttribute(
-      &current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
-    if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) {
-      throw CUfileException("The current CUDA context must own the given device memory");
-    }
-  }
-
-  CUcontext ret{};
-  CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret));
-  return ret;
-}
-
 /**
  * @brief Check if `ptr` points to host memory (as opposed to device memory)
  *
@@ -112,6 +81,133 @@ inline bool is_host_memory(const void* ptr)
   return memtype == 0 || memtype == CU_MEMORYTYPE_HOST;
 }
 
+/**
+ * @brief Return the device owning the pointer
+ *
+ * @param dev_ptr Device pointer to query
+ * @return The device ordinal
+ */
+[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr)
+{
+  int ret = 0;
+  CUDA_DRIVER_TRY(
+    cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr));
+  return ret;
+}
+
+/**
+ * @brief RAII wrapper for a CUDA primary context
+ */
+class CudaPrimaryContext {
+ public:
+  CUdevice dev{};
+  CUcontext ctx{};
+
+  CudaPrimaryContext(int device_ordinal)
+  {
+    CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal));
+    CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev));
+  }
+  CudaPrimaryContext(const CudaPrimaryContext&) = delete;
+  CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete;
+  CudaPrimaryContext(CudaPrimaryContext&&) = delete;
+  CudaPrimaryContext& operator=(CudaPrimaryContext&&) = delete;
+  ~CudaPrimaryContext()
+  {
+    try {
+      CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException);
+    } catch (const CUfileException& e) {
+      std::cerr << e.what() << std::endl;
+    }
+  }
+};
+
+/**
+ * @brief Given a device ordinal, return the primary context of the device.
+ *
+ * This function caches the primary contexts retrieved until program exit.
+ *
+ * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices
+ * @return Primary CUDA context
+ */
+[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal)
+{
+  static std::map<int, CudaPrimaryContext> _primary_contexts;
+  _primary_contexts.try_emplace(ordinal, ordinal);
+  return _primary_contexts.at(ordinal).ctx;
+}
+
+/**
+ * @brief Return the CUDA context associated with the given device pointer, if any.
+ *
+ * @param dev_ptr Device pointer to query
+ * @return Usable CUDA context, if one was found.
+ */
+[[nodiscard]] inline std::optional<CUcontext> get_context_associated_pointer(CUdeviceptr dev_ptr)
+{
+  CUcontext ctx = nullptr;
+  const CUresult err =
+    cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
+  if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return {};
+}
+
+/**
+ * @brief Check if the current CUDA context can access the given device pointer
+ *
+ * @param dev_ptr Device pointer to query
+ * @return The boolean answer
+ */
+[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr)
+{
+  CUdeviceptr current_ctx_dev_ptr{};
+  const CUresult err = cudaAPI::instance().PointerGetAttribute(
+    &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr);
+  if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; }
+  if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); }
+  return false;
+}
+
+/**
+ * @brief Return a CUDA context that can be used with the given device pointer
+ *
+ * For robustness, we look for a usable context in the following order:
+ *   1) If a context has been associated with `devPtr`, it is returned.
+ *   2) If the current context exists and can access `devPtr`, it is returned.
+ *   3) Return the primary context of the device that owns `devPtr`. We assume the
+ *      primary context can access `devPtr`, which might not be true in the exceptional
+ *      disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user
+ *      has to set a usable current context before reading/writing using KvikIO.
+ *
+ * [1]
+ *
+ * @param devPtr Device pointer to query
+ * @return Usable CUDA context
+ */
+[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr)
+{
+  CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr);
+
+  // First we check if a context has been associated with `devPtr`.
+  {
+    auto ctx = get_context_associated_pointer(dev_ptr);
+    if (ctx.has_value()) { return ctx.value(); }
+  }
+
+  // If this isn't the case, we check the current context. If it exists and can access `devPtr`,
+  // we return the current context.
+  {
+    CUcontext ctx = nullptr;
+    CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx));
+    if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; }
+  }
+
+  // Finally, if we didn't find any usable context, we return the primary context of the
+  // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure.
+  return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));
+}
+
 /**
  * @brief Push CUDA context on creation and pop it on destruction
  */
@@ -149,7 +245,7 @@ inline std::tuple<void*, std::size_t, std::size_t> get_alloc_info(const void* de
   if (ctx != nullptr) {
     _ctx = *ctx;
   } else {
-    _ctx = get_current_context(devPtr);
+    _ctx = get_context_from_pointer(devPtr);
   }
   PushAndPopContext context(_ctx);
   CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev));
diff --git a/python/tests/test_basic_io.py b/python/tests/test_basic_io.py
index dd5b98fa0f..c12bf395e4 100644
--- a/python/tests/test_basic_io.py
+++ b/python/tests/test_basic_io.py
@@ -3,6 +3,7 @@
 
 import os
 import random
+from contextlib import contextmanager
 
 import pytest
 
@@ -133,34 +134,6 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end):
     assert all(a == b)
 
 
-@pytest.mark.skipif(
-    cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
-)
-def test_multiple_gpus(tmp_path):
-    """Test IO from two different GPUs"""
-    with kvikio.defaults.set_num_threads(10):
-        with kvikio.defaults.set_task_size(10):
-            with cupy.cuda.Device(0):
-                a0 = cupy.arange(200)
-            with cupy.cuda.Device(1):
-                a1 = cupy.zeros(200, dtype=a0.dtype)
-
-            filename = tmp_path / "test-file"
-            with kvikio.CuFile(filename, "w") as f:
-                with cupy.cuda.Device(0):
-                    assert f.write(a0) == a0.nbytes
-
-            with kvikio.CuFile(filename, "r") as f:
-                with pytest.raises(
-                    RuntimeError,
-                    match="The current CUDA context must own the given device memory",
-                ):
-                    f.read(a1)
-                with cupy.cuda.Device(1):
-                    assert f.read(a1) == a1.nbytes
-            assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1))
-
-
 @pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10])
 def test_raw_read_write(tmp_path, size):
     """Test raw read/write"""
@@ -184,3 +157,70 @@ def test_raw_read_write_of_host_memory(tmp_path):
     with kvikio.CuFile(filename, "r") as f:
         with pytest.raises(ValueError, match="Non-CUDA buffers not supported"):
             assert f.raw_read(a) == a.nbytes
+
+
+@contextmanager
+def with_no_cuda_context():
+    """Context manager that pops all CUDA contexts and pushes them back afterwards"""
+    cuda = pytest.importorskip("cuda.cuda")
+    assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS
+
+    ctx_stack = []
+    while True:
+        err, ctx = cuda.cuCtxPopCurrent()
+        if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT:
+            break
+        assert err == cuda.CUresult.CUDA_SUCCESS
+        ctx_stack.append(ctx)
+    yield
+    for ctx in reversed(ctx_stack):
+        (err,) = cuda.cuCtxPushCurrent(ctx)
+        assert err == cuda.CUresult.CUDA_SUCCESS
+
+
+def test_no_current_cuda_context(tmp_path, xp):
+    """Test IO when no CUDA context is current"""
+    filename = tmp_path / "test-file"
+    a = xp.arange(100)
+    b = xp.empty_like(a)
+
+    with kvikio.CuFile(filename, "w+") as f:
+        with with_no_cuda_context():
+            f.write(a)
+        f.read(b)
+    assert all(a == b)
+
+
+@pytest.mark.skipif(
+    cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs"
+)
+def test_multiple_gpus(tmp_path, xp):
+    """Test IO from two different GPUs"""
+    filename = tmp_path / "test-file"
+
+    with kvikio.defaults.set_num_threads(10):
+        with kvikio.defaults.set_task_size(10):
+
+            # Allocate an array on each device
+            with cupy.cuda.Device(0):
+                a0 = xp.arange(200)
+            with cupy.cuda.Device(1):
+                a1 = xp.zeros(200, dtype=a0.dtype)
+
+            # Test when the device matches the allocation
+            with kvikio.CuFile(filename, "w") as f:
+                with cupy.cuda.Device(0):
+                    assert f.write(a0) == a0.nbytes
+            with kvikio.CuFile(filename, "r") as f:
+                with cupy.cuda.Device(1):
+                    assert f.read(a1) == a1.nbytes
+            assert bytes(a0) == bytes(a1)
+
+            # Test when the device doesn't match the allocation
+            with kvikio.CuFile(filename, "w") as f:
+                with cupy.cuda.Device(1):
+                    assert f.write(a0) == a0.nbytes
+            with kvikio.CuFile(filename, "r") as f:
+                with cupy.cuda.Device(0):
+                    assert f.read(a1) == a1.nbytes
+            assert bytes(a0) == bytes(a1)
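
For reference, the standalone sketch below (not part of the patch above) illustrates the three-step lookup order that the new get_context_from_pointer implements, written against the raw CUDA driver API instead of KvikIO's internal cudaAPI shim. The helper name resolve_context, the error-checking macro, and the use of cuMemAllocAsync in main are illustrative assumptions rather than KvikIO code; cuMemAllocAsync also requires CUDA 11.2+ and a device that supports memory pools.

// Illustrative only: mirrors the lookup order of kvikio::get_context_from_pointer
// using the raw CUDA driver API. Build (assumption): g++ demo.cpp -lcuda -o demo
#include <cuda.h>

#include <cstdio>
#include <cstdlib>

#define CHECK_CUDA(call)                                                      \
  do {                                                                        \
    CUresult _err = (call);                                                   \
    if (_err != CUDA_SUCCESS) {                                               \
      std::fprintf(stderr, "CUDA driver error %d at %s:%d\n",                 \
                   static_cast<int>(_err), __FILE__, __LINE__);               \
      std::exit(1);                                                           \
    }                                                                         \
  } while (0)

// Hypothetical helper following the same three-step fallback as the PR.
CUcontext resolve_context(CUdeviceptr ptr)
{
  // 1) Context associated with the allocation. This may fail (or yield a null
  //    context) for stream-ordered allocations, which is tolerated here.
  CUcontext ctx{};
  if (cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, ptr) == CUDA_SUCCESS &&
      ctx != nullptr) {
    return ctx;
  }
  // 2) The current context, if it exists and can access the pointer.
  CHECK_CUDA(cuCtxGetCurrent(&ctx));
  CUdeviceptr probe{};
  if (ctx != nullptr &&
      cuPointerGetAttribute(&probe, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr) == CUDA_SUCCESS &&
      probe == ptr) {
    return ctx;
  }
  // 3) The primary context of the device that owns the pointer. KvikIO caches the
  //    retained context for the lifetime of the process; here the reference is
  //    simply leaked for brevity.
  int ordinal{};
  CHECK_CUDA(cuPointerGetAttribute(&ordinal, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, ptr));
  CUdevice dev{};
  CHECK_CUDA(cuDeviceGet(&dev, ordinal));
  CHECK_CUDA(cuDevicePrimaryCtxRetain(&ctx, dev));
  return ctx;
}

int main()
{
  CHECK_CUDA(cuInit(0));
  CUdevice dev{};
  CHECK_CUDA(cuDeviceGet(&dev, 0));
  CUcontext primary{};
  CHECK_CUDA(cuDevicePrimaryCtxRetain(&primary, dev));
  CHECK_CUDA(cuCtxSetCurrent(primary));

  // Stream-ordered allocation: CU_POINTER_ATTRIBUTE_CONTEXT alone may not be
  // enough to recover a usable context for this pointer, hence the fallback chain.
  CUstream stream{};
  CHECK_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
  CUdeviceptr ptr{};
  CHECK_CUDA(cuMemAllocAsync(&ptr, 1 << 20, stream));
  CHECK_CUDA(cuStreamSynchronize(stream));

  CUcontext usable = resolve_context(ptr);
  std::printf("resolved context: %p\n", static_cast<void*>(usable));

  CHECK_CUDA(cuMemFreeAsync(ptr, stream));
  CHECK_CUDA(cuStreamSynchronize(stream));
  CHECK_CUDA(cuStreamDestroy(stream));
  CHECK_CUDA(cuDevicePrimaryCtxRelease(dev));
  return 0;
}

The real implementation routes every driver call through KvikIO's dlopen-based cudaAPI shim, which is why cuDeviceGet, cuDevicePrimaryCtxRetain, and cuDevicePrimaryCtxRelease are added to that shim in this diff.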