From 04e9f9049ac18bcc0fd7aa9b152746ccafd037b1 Mon Sep 17 00:00:00 2001 From: Akshay Venkatesh Date: Wed, 17 Feb 2021 14:42:30 -0800 Subject: [PATCH] UCT/CUDA_IPC: fix peer-access-map init; log cuda_ipc open_memhandle failure --- src/uct/cuda/cuda_ipc/cuda_ipc_cache.c | 2 +- src/uct/cuda/cuda_ipc/cuda_ipc_md.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index ea267b56836..a2afd15eabb 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -111,7 +111,7 @@ static ucs_status_t uct_cuda_ipc_open_memhandle(const uct_cuda_ipc_key_t *key, status = UCS_OK; } else { cuGetErrorString(cuerr, &cu_err_str); - ucs_error("cuIpcOpenMemHandle() failed: %s", cu_err_str); + ucs_debug("cuIpcOpenMemHandle() failed: %s", cu_err_str); status = (cuerr == CUDA_ERROR_ALREADY_MAPPED) ? UCS_ERR_ALREADY_EXISTS : UCS_ERR_INVALID_PARAM; } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c index c5663ca0397..a44799478b0 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c @@ -78,6 +78,7 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, int i; int num_devices; int original_capacity, new_capacity; + int original_count, new_count; for (i = 0; i < md->uuid_map_size; i++) { if (uct_cuda_ipc_uuid_equals(&rkey->uuid, &md->uuid_map[i])) { @@ -92,6 +93,8 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, original_capacity = md->uuid_map_capacity; new_capacity = md->uuid_map_capacity ? (md->uuid_map_capacity * 2) : 16; + original_count = original_capacity * num_devices; + new_count = new_capacity * num_devices; md->uuid_map_capacity = new_capacity; md->uuid_map = ucs_realloc(md->uuid_map, new_capacity * sizeof(CUuuid), @@ -101,14 +104,14 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, } md->peer_accessible_cache = ucs_realloc(md->peer_accessible_cache, - new_capacity * num_devices * + new_count * sizeof(ucs_ternary_auto_value_t), "uct_cuda_ipc_peer_accessible_cache"); if (md->peer_accessible_cache == NULL) { return UCS_ERR_NO_MEMORY; } - for (i = original_capacity; i < new_capacity; i++) { + for (i = original_count; i < new_count; i++) { md->peer_accessible_cache[i] = UCS_TRY; } }