From 83a6b6501f6d65f5242cddd02711ef01ed4738e4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 03:31:48 -0700 Subject: [PATCH 01/15] Implement get_context_from_pointer --- cpp/include/kvikio/file_handle.hpp | 4 +- cpp/include/kvikio/shim/cuda.hpp | 6 +++ cpp/include/kvikio/utils.hpp | 79 ++++++++++++++++++++---------- 3 files changed, 60 insertions(+), 29 deletions(-) diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index c71d52e7e8..716dcfc66c 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -391,7 +391,7 @@ class FileHandle { return parallel_io(op, buf, size, file_offset, task_size, 0); } - CUcontext ctx = get_current_context(buf); + CUcontext ctx = get_context_from_pointer(buf); auto task = [this, ctx](void* devPtr_base, std::size_t size, std::size_t file_offset, @@ -437,7 +437,7 @@ class FileHandle { return parallel_io(op, buf, size, file_offset, task_size, 0); } - CUcontext ctx = get_current_context(buf); + CUcontext ctx = get_context_from_pointer(buf); auto op = [this, ctx](const void* devPtr_base, std::size_t size, std::size_t file_offset, diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index 3ca0d5f3e9..7ea444524d 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -43,6 +43,9 @@ class cudaAPI { decltype(cuMemGetAddressRange)* MemGetAddressRange{nullptr}; decltype(cuGetErrorName)* GetErrorName{nullptr}; decltype(cuGetErrorString)* GetErrorString{nullptr}; + decltype(cuDeviceGet)* DeviceGet{nullptr}; + decltype(cuDevicePrimaryCtxRetain)* DevicePrimaryCtxRetain{nullptr}; + decltype(cuDevicePrimaryCtxRelease)* DevicePrimaryCtxRelease{nullptr}; private: cudaAPI() @@ -64,6 +67,9 @@ class cudaAPI { get_symbol(MemGetAddressRange, lib, KVIKIO_STRINGIFY(cuMemGetAddressRange)); get_symbol(GetErrorName, lib, KVIKIO_STRINGIFY(cuGetErrorName)); get_symbol(GetErrorString, lib, 
KVIKIO_STRINGIFY(cuGetErrorString)); + get_symbol(DeviceGet, lib, KVIKIO_STRINGIFY(cuDeviceGet)); + get_symbol(DevicePrimaryCtxRetain, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRetain)); + get_symbol(DevicePrimaryCtxRelease, lib, KVIKIO_STRINGIFY(cuDevicePrimaryCtxRelease)); } public: diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 1b9c81f190..f4eed2f50c 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -51,37 +51,62 @@ inline constexpr std::size_t page_size = 4096; return reinterpret_cast(devPtr); } +[[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr) +{ + CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); + int ret; + CUDA_DRIVER_TRY( + cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr)); + return ret; +} + /** - * @brief Get the current cuda context - * - * Previously, we got the cuda context from the provided device pointer by calling - * `cuPointerGetAttribute(..., CU_POINTER_ATTRIBUTE_CONTEXT)`. However, this doesn't - * work for stream ordered device memory allocations[1] so we now get the current - * cuda context instead. - * [1] - * - * @param check_owning_devPtr If not NULL, a device memory pointer that must have - * been allocated by, mapped by, or registered with the current context. If this - * isn't the case, a CUfileException is thrown. 
- * - * @return The current cuda context + * @brief RAII wrapper for a CUDA primary context */ -[[nodiscard]] inline CUcontext get_current_context(const void* check_owning_devPtr = nullptr) -{ - if (check_owning_devPtr != nullptr) { - CUdeviceptr current_ctx_devPtr{}; - CUdeviceptr dev_ptr = convert_void2deviceptr(check_owning_devPtr); - - CUresult const err = cudaAPI::instance().PointerGetAttribute( - &current_ctx_devPtr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr); - if (err != CUDA_SUCCESS || current_ctx_devPtr != dev_ptr) { - throw CUfileException("The current CUDA context must own the given device memory"); +class CudaPrimaryContext { + public: + CUdevice dev; + CUcontext ctx; + + CudaPrimaryContext(int device_ordinal) + { + CUDA_DRIVER_TRY(cudaAPI::instance().DeviceGet(&dev, device_ordinal)); + CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRetain(&ctx, dev)); + } + CudaPrimaryContext(const CudaPrimaryContext&) = delete; + CudaPrimaryContext& operator=(CudaPrimaryContext const&) = delete; + CudaPrimaryContext(CudaPrimaryContext&&) = delete; + CudaPrimaryContext&& operator=(CudaPrimaryContext&&) = delete; + ~CudaPrimaryContext() + { + try { + CUDA_DRIVER_TRY(cudaAPI::instance().DevicePrimaryCtxRelease(dev), CUfileException); + } catch (const CUfileException& e) { + std::cerr << e.what() << std::endl; } } +}; - CUcontext ret{}; - CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ret)); - return ret; +[[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr) +{ + static std::map _primary_contexts; + CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); + + // First we try to get the current context and making sure it can access the device pointer.
+ { + CUcontext ctx; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); + if (ctx != nullptr) { + CUdeviceptr current_ctx_dev_ptr{}; + CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( + &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); + if (current_ctx_dev_ptr != dev_ptr) { return ctx; } + } + } + // If the current context isn't available, we return the primary context. + int ordinal = get_device_ordinal_from_pointer(devPtr); + _primary_contexts.try_emplace(ordinal, ordinal); + return _primary_contexts.at(ordinal).ctx; } /** @@ -149,7 +174,7 @@ inline std::tuple get_alloc_info(const void* de if (ctx != nullptr) { _ctx = *ctx; } else { - _ctx = get_current_context(devPtr); + _ctx = get_context_from_pointer(devPtr); } PushAndPopContext context(_ctx); CUDA_DRIVER_TRY(cudaAPI::instance().MemGetAddressRange(&base_ptr, &base_size, dev)); From d92ecc85be1110a3cf23cc6719c6729c017fcf91 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 04:15:39 -0700 Subject: [PATCH 02/15] doc --- cpp/include/kvikio/utils.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index f4eed2f50c..7ccee1825d 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,12 @@ inline constexpr std::size_t page_size = 4096; return reinterpret_cast(devPtr); } +/** + * @brief Return the device owning the pointer + * + * @param ptr Device pointer to query + * @return The device ordinal + */ [[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr) { CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); @@ -87,6 +94,12 @@ class CudaPrimaryContext { } }; +/** + * @brief Return a CUDA that can be used with the given device pointer + * + * @param devPtr Device pointer to query + * @return Usable CUDA context + */ [[nodiscard]] inline CUcontext
get_context_from_pointer(const void* devPtr) { static std::map _primary_contexts; From 5e122bd21f159ced633883a06d8b28cab50ef6b5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 04:39:34 -0700 Subject: [PATCH 03/15] get_context_from_pointer(): also checks CU_POINTER_ATTRIBUTE_CONTEXT --- cpp/include/kvikio/utils.hpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 7ccee1825d..29b8c734de 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -105,18 +105,27 @@ class CudaPrimaryContext { static std::map _primary_contexts; CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); - // First we try to get the current context and making sure it can access the device pointer. - { - CUcontext ctx; + // First we check if a context has been associated with `devPtr` + CUcontext ctx; + const CUresult err = + cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); + if (err != CUDA_ERROR_INVALID_VALUE) { + // If this isn't the case, we grab the current context CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - if (ctx != nullptr) { - CUdeviceptr current_ctx_dev_ptr{}; - CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( - ¤t_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); - if (current_ctx_dev_ptr != dev_ptr) { return ctx; } - } + } else { + CUDA_DRIVER_TRY(err); // Check for errors from previous asynchronous launches } - // If the current context isn't available, we return the primary context. + + // If we found a context, we return it if it can access the `devPtr`. 
+ if (ctx != nullptr) { + CUdeviceptr current_ctx_dev_ptr{}; + CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( + &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); + if (current_ctx_dev_ptr != dev_ptr) { return ctx; } + } + + // If we didn't find any usable context, we return the primary context of the device owning + // `devPtr`. int ordinal = get_device_ordinal_from_pointer(devPtr); _primary_contexts.try_emplace(ordinal, ordinal); return _primary_contexts.at(ordinal).ctx; From 09b67d0dd9e6816b8e09bec0e55aca1e13311656 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 04:41:19 -0700 Subject: [PATCH 04/15] doc --- cpp/include/kvikio/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 29b8c734de..91ea2e1f68 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -116,7 +116,7 @@ class CudaPrimaryContext { CUDA_DRIVER_TRY(err); // Check for errors from previous asynchronous launches } - // If we found a context, we return it if it can access the `devPtr`. + // If we found a context and it can access `devPtr`, we return it. if (ctx != nullptr) { CUdeviceptr current_ctx_dev_ptr{}; CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( From 073360f61bb74b47c353984a6f3cc09602defe01 Mon Sep 17 00:00:00 2001 From: "Mads R. B.
Kristensen" Date: Wed, 29 Mar 2023 04:43:41 -0700 Subject: [PATCH 05/15] doc --- cpp/include/kvikio/utils.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 91ea2e1f68..da9a03f6b4 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -105,7 +105,9 @@ class CudaPrimaryContext { static std::map _primary_contexts; CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); - // First we check if a context has been associated with `devPtr` + // First we check if a context has been associated with `devPtr`. + // Notice, this is not the case for stream ordered device memory allocations. + // See CUcontext ctx; const CUresult err = cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); From ed409a24e689a4741482b80260daf81ecf29f6fe Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 16:37:10 +0200 Subject: [PATCH 06/15] doc Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index da9a03f6b4..4b6a14abc0 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -95,7 +95,7 @@ class CudaPrimaryContext { }; /** - * @brief Return a CUDA that can be used with the given device pointer + * @brief Return a CUDA context that can be used with the given device pointer * * @param devPtr Device pointer to query * @return Usable CUDA context From b21371f695713c4e9a9cc8033ff9506f71f7ce7c Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 29 Mar 2023 07:38:44 -0700 Subject: [PATCH 07/15] fixing get_context_from_pointer checks --- cpp/include/kvikio/utils.hpp | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index da9a03f6b4..663790529f 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -108,26 +108,30 @@ class CudaPrimaryContext { // First we check if a context has been associated with `devPtr`. // Notice, this is not the case for stream ordered device memory allocations. // See - CUcontext ctx; - const CUresult err = - cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); - if (err != CUDA_ERROR_INVALID_VALUE) { - // If this isn't the case, we grab the current context - CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - } else { - CUDA_DRIVER_TRY(err); // Check for errors from previous asynchronous launches + { + CUcontext ctx; + const CUresult err = + cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); + if (err != CUDA_ERROR_INVALID_VALUE) { + CUDA_DRIVER_TRY(err); // Check for other errors + return ctx; + } } - - // If we found a context and it can access `devPtr`, we return it. - if (ctx != nullptr) { - CUdeviceptr current_ctx_dev_ptr{}; - CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( - &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); - if (current_ctx_dev_ptr != dev_ptr) { return ctx; } + // If this isn't the case, we check the current context. If it exist and can access `devPtr`, + // we return it.
+ { + CUcontext ctx; + CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); + if (ctx != nullptr) { + CUdeviceptr current_ctx_dev_ptr{}; + CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( + &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); + if (current_ctx_dev_ptr != dev_ptr) { return ctx; } + } } - - // If we didn't find any usable context, we return the primary context of the device owning - // `devPtr`. + // Finally, if we didn't find any usable context, we return the primary context of the + // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary + // context of each device. int ordinal = get_device_ordinal_from_pointer(devPtr); _primary_contexts.try_emplace(ordinal, ordinal); return _primary_contexts.at(ordinal).ctx; From 2e4b8ad2c8219bdec61d3f18322354d446a1aa70 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 07:42:07 -0700 Subject: [PATCH 08/15] clean up --- cpp/include/kvikio/utils.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index a8edf2ae8c..1516c3d4d8 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -58,9 +58,8 @@ inline constexpr std::size_t page_size = 4096; * @param ptr Device pointer to query * @return The device ordinal */ -[[nodiscard]] inline int get_device_ordinal_from_pointer(const void* devPtr) +[[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr) { - CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); int ret; CUDA_DRIVER_TRY( cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr)); @@ -132,7 +131,7 @@ class CudaPrimaryContext { // Finally, if we didn't find any usable context, we return the primary context of the // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary // context of each device.
- int ordinal = get_device_ordinal_from_pointer(devPtr); + int ordinal = get_device_ordinal_from_pointer(dev_ptr); _primary_contexts.try_emplace(ordinal, ordinal); return _primary_contexts.at(ordinal).ctx; } From 5a585833e6077d6f62daf052d6ac83ec18be5054 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 18:07:12 +0200 Subject: [PATCH 09/15] typo Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 1516c3d4d8..7747722ecd 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -116,7 +116,7 @@ class CudaPrimaryContext { return ctx; } } - // If this isn't the case, we check the current context. If it exist and can access `devPtr`, + // If this isn't the case, we check the current context. If it exists and can access `devPtr`, // we return it. { CUcontext ctx; From 1720ffbc6fb2d66207dee9861b2b5b1080778132 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 29 Mar 2023 09:23:47 -0700 Subject: [PATCH 10/15] CU_POINTER_ATTRIBUTE_CONTEXT returns ctx == null --- cpp/include/kvikio/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 1516c3d4d8..127e897ab9 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -111,7 +111,7 @@ class CudaPrimaryContext { CUcontext ctx; const CUresult err = cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); - if (err != CUDA_ERROR_INVALID_VALUE) { + if (err != CUDA_ERROR_INVALID_VALUE && ctx != nullptr) { CUDA_DRIVER_TRY(err); // Check for other errors return ctx; } From 07e608547e5bf1adf284231a07d7046768c91410 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 29 Mar 2023 18:32:09 +0200 Subject: [PATCH 11/15] Update cpp/include/kvikio/utils.hpp Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/utils.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index e446995014..1660733462 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -111,9 +111,12 @@ class CudaPrimaryContext { CUcontext ctx; const CUresult err = cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); - if (err != CUDA_ERROR_INVALID_VALUE && ctx != nullptr) { - CUDA_DRIVER_TRY(err); // Check for other errors - return ctx; + if (err == CUDA_SUCCESS && ctx != nullptr) { + return ctx; + } else if (err != CUDA_ERROR_INVALID) { + CUDA_DRIVER_TRY(err); + } + // either CUDA_ERROR_INVALID, or SUCCESS, but stream-ordered allocation } } // If this isn't the case, we check the current context. If it exists and can access `devPtr`, From 1600827b669ae9371c115608555bcaf65c9e19ca Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 30 Mar 2023 12:29:15 +0200 Subject: [PATCH 12/15] rework --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/error.hpp | 7 +- cpp/include/kvikio/utils.hpp | 154 ++++++++++++++++++++++------------ python/tests/test_basic_io.py | 89 ++++++++++++++------ 4 files changed, 168 insertions(+), 83 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cd583c8323..0141806347 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -26,6 +26,7 @@ project( LANGUAGES CXX ) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/") # Write the version header diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp index 07c09a73ab..e8e0e00641 100644 --- a/cpp/include/kvikio/error.hpp +++ b/cpp/include/kvikio/error.hpp @@ -40,9 +40,10 @@ struct CUfileException : public std::runtime_error { KVIKIO_STRINGIFY(__LINE__) + \ ": CUDA_ERROR_STUB_LIBRARY(" \ "The CUDA driver loaded is a stub library)"}; \ - } else if (error != CUDA_SUCCESS) { \ - const char* err_name; \ - const char* err_str; \ + } \ + if (error != CUDA_SUCCESS) { \ + const char* err_name = nullptr; \ + const char* err_str = nullptr; \ CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); \ CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); \ if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \ diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 1660733462..a271e3555a 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,34 @@ inline constexpr std::size_t page_size = 4096; return reinterpret_cast(devPtr); } +/** + * @brief Check if `ptr` points to host memory (as opposed to device memory) + * + * In this context, managed memory counts as device memory + * + * @param ptr Memory pointer to 
query + * @return The boolean answer + */ +inline bool is_host_memory(const void* ptr) +{ + CUpointer_attribute attrs[1] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + }; + CUmemorytype memtype{}; + void* data[1] = {&memtype}; + CUresult result = + cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr)); + + // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED + if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; } + CUDA_DRIVER_TRY(result); + + // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory + // is unregistered host memory. This is undocumented but how the Runtime CUDA API + // does it to support `cudaMemoryTypeUnregistered`. + return memtype == 0 || memtype == CU_MEMORYTYPE_HOST; +} + /** * @brief Return the device owning the pointer * @@ -60,7 +89,7 @@ inline constexpr std::size_t page_size = 4096; */ [[nodiscard]] inline int get_device_ordinal_from_pointer(CUdeviceptr dev_ptr) { - int ret; + int ret = 0; CUDA_DRIVER_TRY( cudaAPI::instance().PointerGetAttribute(&ret, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, dev_ptr)); return ret; @@ -71,8 +100,8 @@ inline constexpr std::size_t page_size = 4096; */ class CudaPrimaryContext { public: - CUdevice dev; - CUcontext ctx; + CUdevice dev{}; + CUcontext ctx{}; CudaPrimaryContext(int device_ordinal) { @@ -93,78 +122,93 @@ class CudaPrimaryContext { } }; +/** + * @brief Given a device ordinal, return the primary context of the device. + * + * This function cache the primary contexts retrieved until program exit + * + * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices + * @return Primary CUDA context + */ +[[nodiscard]] inline CUcontext get_primary_cuda_context(int ordinal) +{ + static std::map _primary_contexts; + _primary_contexts.try_emplace(ordinal, ordinal); + return _primary_contexts.at(ordinal).ctx; +} + +/** + * @brief Return the CUDA context associated the given device pointer, if any. 
+ * + * @param dev_ptr Device pointer to query + * @return Usable CUDA context, if one were found. + */ +[[nodiscard]] inline std::optional get_context_associated_pointer(CUdeviceptr dev_ptr) +{ + CUcontext ctx = nullptr; + const CUresult err = + cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); + if (err == CUDA_SUCCESS && ctx != nullptr) { return ctx; } + if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } + return {}; +} + +/** + * @brief Check if the current CUDA context can access the given device pointer + * + * @param dev_ptr Device pointer to query + * @return The boolean answer + */ +[[nodiscard]] inline bool can_current_context_access_pointer(CUdeviceptr dev_ptr) +{ + CUdeviceptr current_ctx_dev_ptr{}; + const CUresult err = cudaAPI::instance().PointerGetAttribute( + &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr); + if (err == CUDA_SUCCESS && current_ctx_dev_ptr == dev_ptr) { return true; } + if (err != CUDA_ERROR_INVALID_VALUE) { CUDA_DRIVER_TRY(err); } + return false; +} + /** * @brief Return a CUDA context that can be used with the given device pointer * + * For robustness, we look for an usabale context in the following order: + * 1) If a context has been associated with `devPtr`, it is returned + * 2) If the current context exist and can access `devPtr`, it is returned. + * 3) Return the primary context of the device that owns `devPtr`. We assume the + * primary context can access `devPtr`. * @param devPtr Device pointer to query * @return Usable CUDA context */ [[nodiscard]] inline CUcontext get_context_from_pointer(const void* devPtr) { - static std::map _primary_contexts; CUdeviceptr dev_ptr = convert_void2deviceptr(devPtr); // First we check if a context has been associated with `devPtr`. - // Notice, this is not the case for stream ordered device memory allocations.
- // See { - CUcontext ctx; - const CUresult err = - cudaAPI::instance().PointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr); - if (err == CUDA_SUCCESS && ctx != nullptr) { - return ctx; - } else if (err != CUDA_ERROR_INVALID) { - CUDA_DRIVER_TRY(err); - } - // either CUDA_ERROR_INVALID, or SUCCESS, but stream-ordered allocation + auto ctx = get_context_associated_pointer(dev_ptr); + if (ctx.has_value()) { + std::cout << "get_context_from_pointer() - context_associated" << std::endl; + return ctx.value(); } } - // If this isn't the case, we check the current context. If it exists and can access `devPtr`, - // we return it. + + // If this isn't the case, we check the current context. If it exist and can access `devPtr`, we + // return the current context. { - CUcontext ctx; + CUcontext ctx = nullptr; CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - if (ctx != nullptr) { - CUdeviceptr current_ctx_dev_ptr{}; - CUDA_DRIVER_TRY(cudaAPI::instance().PointerGetAttribute( - &current_ctx_dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, dev_ptr)); - if (current_ctx_dev_ptr != dev_ptr) { return ctx; } + if (ctx != nullptr && can_current_context_access_pointer(dev_ptr)) { + std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl; + return ctx; } } - // Finally, if we didn't find any usable context, we return the primary context of the - // device that owns `devPtr`. Notice, we use `_primary_contexts` to cache the primary - // context of each device.
- int ordinal = get_device_ordinal_from_pointer(dev_ptr); - _primary_contexts.try_emplace(ordinal, ordinal); - return _primary_contexts.at(ordinal).ctx; -} - -/** - * @brief Check if `ptr` points to host memory (as opposed to device memory) - * - * In this context, managed memory counts as device memory - * - * @param ptr Memory pointer to query - * @return The boolean answer - */ -inline bool is_host_memory(const void* ptr) -{ - CUpointer_attribute attrs[1] = { - CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - }; - CUmemorytype memtype{}; - void* data[1] = {&memtype}; - CUresult result = - cudaAPI::instance().PointerGetAttributes(1, attrs, data, convert_void2deviceptr(ptr)); - // We assume that `ptr` is host memory when CUDA_ERROR_NOT_INITIALIZED - if (result == CUDA_ERROR_NOT_INITIALIZED) { return true; } - CUDA_DRIVER_TRY(result); + std::cout << "get_context_from_pointer() - get_primary_cuda_context" << std::endl; - // Notice, queying `CU_POINTER_ATTRIBUTE_MEMORY_TYPE` returns zero when the memory - // is unregistered host memory. This is undocumented but how the Runtime CUDA API - // does it to support `cudaMemoryTypeUnregistered`. - return memtype == 0 || memtype == CU_MEMORYTYPE_HOST; + // Finally, if we didn't find any usable context, we return the primary context of the + // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure. 
+ return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr)); } /** diff --git a/python/tests/test_basic_io.py b/python/tests/test_basic_io.py index dd5b98fa0f..208226d992 100644 --- a/python/tests/test_basic_io.py +++ b/python/tests/test_basic_io.py @@ -3,6 +3,7 @@ import os import random +from contextlib import contextmanager import pytest @@ -133,6 +134,69 @@ def test_read_write_slices(tmp_path, xp, nthreads, tasksize, start, end): assert all(a == b) +@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10]) +def test_raw_read_write(tmp_path, size): + """Test raw read/write""" + filename = tmp_path / "test-file" + + a = cupy.arange(size) + with kvikio.CuFile(filename, "w") as f: + assert f.raw_write(a) == a.nbytes + with kvikio.CuFile(filename, "r") as f: + assert f.raw_read(a) == a.nbytes + + +def test_raw_read_write_of_host_memory(tmp_path): + """Test raw read/write of host memory, which isn't supported""" + filename = tmp_path / "test-file" + + a = numpy.arange(1024) + with kvikio.CuFile(filename, "w") as f: + with pytest.raises(ValueError, match="Non-CUDA buffers not supported"): + f.raw_write(a) + with kvikio.CuFile(filename, "r") as f: + with pytest.raises(ValueError, match="Non-CUDA buffers not supported"): + assert f.raw_read(a) == a.nbytes + + +@contextmanager +def with_no_cuda_context(): + """Context that pop all CUDA contexts before the test and push them back on after""" + cuda = pytest.importorskip("cuda.cuda") + assert cuda.cuInit(0)[0] == cuda.CUresult.CUDA_SUCCESS + + ctx_stack = [] + while True: + err, ctx = cuda.cuCtxPopCurrent() + if err == cuda.CUresult.CUDA_ERROR_INVALID_CONTEXT: + break + assert err == cuda.CUresult.CUDA_SUCCESS + ctx_stack.append(ctx) + yield + for ctx in reversed(ctx_stack): + (err,) = cuda.cuCtxPushCurrent(ctx) + assert err == cuda.CUresult.CUDA_SUCCESS + + +def test_no_current_cuda_context(tmp_path): + filename = tmp_path / "test-file" + ary = cupy.arange(100) + + with 
with_no_cuda_context(): + with kvikio.CuFile(filename, "w") as f: + f.write(ary) + + with cupy.cuda.using_allocator(cupy.cuda.malloc_async): + ary = cupy.arange(100) + + with with_no_cuda_context(): + with kvikio.CuFile(filename, "w") as f: + f.write(ary) + + with kvikio.CuFile(filename, "w") as f: + f.write(ary) + + @pytest.mark.skipif( cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs" ) @@ -159,28 +223,3 @@ def test_multiple_gpus(tmp_path): with cupy.cuda.Device(1): assert f.read(a1) == a1.nbytes assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1)) - - -@pytest.mark.parametrize("size", [1, 10, 100, 1000, 1024, 4096, 4096 * 10]) -def test_raw_read_write(tmp_path, size): - """Test raw read/write""" - filename = tmp_path / "test-file" - - a = cupy.arange(size) - with kvikio.CuFile(filename, "w") as f: - assert f.raw_write(a) == a.nbytes - with kvikio.CuFile(filename, "r") as f: - assert f.raw_read(a) == a.nbytes - - -def test_raw_read_write_of_host_memory(tmp_path): - """Test raw read/write of host memory, which isn't supported""" - filename = tmp_path / "test-file" - - a = numpy.arange(1024) - with kvikio.CuFile(filename, "w") as f: - with pytest.raises(ValueError, match="Non-CUDA buffers not supported"): - f.raw_write(a) - with kvikio.CuFile(filename, "r") as f: - with pytest.raises(ValueError, match="Non-CUDA buffers not supported"): - assert f.raw_read(a) == a.nbytes From fd4847a8f66cb05100b342991025fb02e7dec809 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 30 Mar 2023 13:43:23 +0200 Subject: [PATCH 13/15] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- cpp/include/kvikio/utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index a271e3555a..77461a888c 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -125,7 +125,7 @@ class CudaPrimaryContext { /** * @brief Given a device ordinal, return the primary context of the device. * - * This function cache the primary contexts retrieved until program exit + * This function caches the primary contexts retrieved until program exit * * @param ordinal Device ordinal - an integer between 0 and the number of CUDA devices * @return Primary CUDA context @@ -159,7 +159,7 @@ class CudaPrimaryContext { * @param dev_ptr Device pointer to query * @return The boolean answer */ -[[nodiscard]] inline bool can_current_context_access_pointer(CUdeviceptr dev_ptr) +[[nodiscard]] inline bool current_context_can_access_pointer(CUdeviceptr dev_ptr) { CUdeviceptr current_ctx_dev_ptr{}; const CUresult err = cudaAPI::instance().PointerGetAttribute( @@ -174,7 +174,7 @@ class CudaPrimaryContext { * * For robustness, we look for an usabale context in the following order: * 1) If a context has been associated with `devPtr`, it is returned - * 2) If the current context exist and can access `devPtr`, it is returned. + * 2) If the current context exists and can access `devPtr`, it is returned. * 3) Return the primary context of the device that owns `devPtr`. We assume the * primary context can access `devPtr`. * @param devPtr Device pointer to query From 39d50f2386fb1709231c2b4435ff9f97e0249487 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Thu, 30 Mar 2023 14:01:44 +0200 Subject: [PATCH 14/15] clean up --- cpp/include/kvikio/utils.hpp | 11 ++++++-- python/tests/test_basic_io.py | 53 ++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 77461a888c..8e3a8e5e7b 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -173,10 +173,15 @@ class CudaPrimaryContext { * @brief Return a CUDA context that can be used with the given device pointer * * For robustness, we look for an usabale context in the following order: - * 1) If a context has been associated with `devPtr`, it is returned + * 1) If a context has been associated with `devPtr`, it is returned. * 2) If the current context exists and can access `devPtr`, it is returned. * 3) Return the primary context of the device that owns `devPtr`. We assume the - * primary context can access `devPtr`. + * primary context can access `devPtr`, which might not be true in the exceptional + * disjoint addressing cases mentioned in the CUDA docs[1]. In these cases, the user + * has to set a usable current context before reading/writing using KvikIO. 
+ * + * [1] + * * @param devPtr Device pointer to query * @return Usable CUDA context */ @@ -198,7 +203,7 @@ class CudaPrimaryContext { { CUcontext ctx = nullptr; CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - if (ctx != nullptr && can_current_context_access_pointer(dev_ptr)) { + if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl; return ctx; } diff --git a/python/tests/test_basic_io.py b/python/tests/test_basic_io.py index 208226d992..c12bf395e4 100644 --- a/python/tests/test_basic_io.py +++ b/python/tests/test_basic_io.py @@ -178,48 +178,49 @@ def with_no_cuda_context(): assert err == cuda.CUresult.CUDA_SUCCESS -def test_no_current_cuda_context(tmp_path): +def test_no_current_cuda_context(tmp_path, xp): + """Test IO when no CUDA context is current""" filename = tmp_path / "test-file" - ary = cupy.arange(100) - - with with_no_cuda_context(): - with kvikio.CuFile(filename, "w") as f: - f.write(ary) - - with cupy.cuda.using_allocator(cupy.cuda.malloc_async): - ary = cupy.arange(100) - - with with_no_cuda_context(): - with kvikio.CuFile(filename, "w") as f: - f.write(ary) + a = xp.arange(100) + b = xp.empty_like(a) - with kvikio.CuFile(filename, "w") as f: - f.write(ary) + with kvikio.CuFile(filename, "w+") as f: + with with_no_cuda_context(): + f.write(a) + f.read(b) + assert all(a == b) @pytest.mark.skipif( cupy.cuda.runtime.getDeviceCount() < 2, reason="requires multiple GPUs" ) -def test_multiple_gpus(tmp_path): +def test_multiple_gpus(tmp_path, xp): """Test IO from two different GPUs""" + filename = tmp_path / "test-file" + with kvikio.defaults.set_num_threads(10): with kvikio.defaults.set_task_size(10): + + # Allocate an array on each device with cupy.cuda.Device(0): - a0 = cupy.arange(200) + a0 = xp.arange(200) with cupy.cuda.Device(1): - a1 = cupy.zeros(200, dtype=a0.dtype) + a1 = xp.zeros(200, dtype=a0.dtype) - filename = tmp_path / 
"test-file" + # Test when the device match the allocation with kvikio.CuFile(filename, "w") as f: with cupy.cuda.Device(0): assert f.write(a0) == a0.nbytes - with kvikio.CuFile(filename, "r") as f: - with pytest.raises( - RuntimeError, - match="The current CUDA context must own the given device memory", - ): - f.read(a1) with cupy.cuda.Device(1): assert f.read(a1) == a1.nbytes - assert all(cupy.asnumpy(a0) == cupy.asnumpy(a1)) + assert bytes(a0) == bytes(a1) + + # Test when the device doesn't match the allocation + with kvikio.CuFile(filename, "w") as f: + with cupy.cuda.Device(1): + assert f.write(a0) == a0.nbytes + with kvikio.CuFile(filename, "r") as f: + with cupy.cuda.Device(0): + assert f.read(a1) == a1.nbytes + assert bytes(a0) == bytes(a1) From 0aff7f5d632ba46d3096e2d69d650aacfb54a007 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 30 Mar 2023 14:20:51 +0200 Subject: [PATCH 15/15] removed debug info --- cpp/include/kvikio/utils.hpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 8e3a8e5e7b..9362678820 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -192,10 +192,7 @@ class CudaPrimaryContext { // First we check if a context has been associated with `devPtr`. { auto ctx = get_context_associated_pointer(dev_ptr); - if (ctx.has_value()) { - std::cout << "get_context_from_pointer() - context_associated" << std::endl; - return ctx.value(); - } + if (ctx.has_value()) { return ctx.value(); } } // If this isn't the case, we check the current context. 
If it exist and can access `devPtr`, we @@ -203,14 +200,9 @@ class CudaPrimaryContext { { CUcontext ctx = nullptr; CUDA_DRIVER_TRY(cudaAPI::instance().CtxGetCurrent(&ctx)); - if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { - std::cout << "get_context_from_pointer() - can_current_context_access_pointer" << std::endl; - return ctx; - } + if (ctx != nullptr && current_context_can_access_pointer(dev_ptr)) { return ctx; } } - std::cout << "get_context_from_pointer() - get_primary_cuda_context" << std::endl; - // Finally, if we didn't find any usable context, we return the primary context of the // device that owns `devPtr`. If the primary context cannot access `devPtr`, we accept failure. return get_primary_cuda_context(get_device_ordinal_from_pointer(dev_ptr));