Skip to content

Commit

Permalink
Merge pull request #9561 from roiedanino/fix/perftest-write-sn-stuck
Browse files Browse the repository at this point in the history
TOOLS/PERF/LIB: ep_flush after write_sn / read_sn when using memory types
  • Loading branch information
roiedanino authored Dec 31, 2023
2 parents 487cac0 + 5047658 commit 5c9e58c
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 2 deletions.
4 changes: 4 additions & 0 deletions src/tools/perf/lib/ucp_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,8 @@ class ucp_perf_test_runner {
(uint64_t)ptr, m_perf.ucp.self_recv_rkey,
&param);
request_wait(request, mem_type, "read_sn");
request = ucp_ep_flush_nbx(m_perf.ucp.self_ep, &param);
request_wait(request, mem_type, "flush read_sn");
return sn;
}
}
Expand All @@ -420,6 +422,8 @@ class ucp_perf_test_runner {
request = ucp_put_nbx(m_perf.ucp.self_ep, &sn, sizeof(sn),
(uint64_t)ptr, rkey, &param);
request_wait(request, mem_type, "write_sn");
request = ucp_ep_flush_nbx(m_perf.ucp.self_ep, &param);
request_wait(request, mem_type, "flush write_sn");
}
}

Expand Down
3 changes: 3 additions & 0 deletions src/uct/cuda/cuda_copy/cuda_copy_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src,
ucs_queue_push(event_q, &cuda_event->queue);
cuda_event->comp = comp;

UCS_BITMAP_SET(iface->streams_to_sync,
uct_cuda_copy_flush_bitmap_idx(src_type, dst_type));

ucs_trace("cuda async issued: %p dst:%p[%s], src:%p[%s] len:%ld",
cuda_event, dst, ucs_memory_type_names[dst_type], src,
ucs_memory_type_names[src_type], length);
Expand Down
52 changes: 51 additions & 1 deletion src/uct/cuda/cuda_copy/cuda_copy_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,51 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface,
return UCS_OK;
}

/*
 * Synchronize every CUDA stream whose bit is set in iface->streams_to_sync.
 *
 * Each set bit index is decoded into a (source, destination) memory type
 * pair that selects a stream from iface->queue_desc. The bit is cleared only
 * after cuStreamSynchronize() succeeded on that stream.
 *
 * @param iface  cuda_copy interface holding the bitmap and the stream matrix.
 *
 * @return UCS_OK if all flagged streams were synchronized, otherwise the
 *         error status of the first failing synchronization (the bit of the
 *         failing stream is left set).
 */
static ucs_status_t uct_cuda_copy_sync_streams(uct_cuda_copy_iface_t *iface)
{
    ucs_memory_type_t src_type, dst_type;
    ucs_status_t status;
    uint32_t bit_idx;
    CUstream stream;

    UCS_BITMAP_FOR_EACH_BIT(iface->streams_to_sync, bit_idx) {
        /* Row of the 2D bitmap: source memory type */
        src_type = bit_idx / UCS_MEMORY_TYPE_LAST;
        if (src_type >= UCS_MEMORY_TYPE_LAST) {
            /* Index is outside the memory type matrix, stop scanning */
            break;
        }

        /* Column of the 2D bitmap: destination memory type */
        dst_type = bit_idx % UCS_MEMORY_TYPE_LAST;
        stream   = iface->queue_desc[src_type][dst_type].stream;

        status = UCT_CUDADRV_FUNC_LOG_ERR(cuStreamSynchronize(stream));
        if (status != UCS_OK) {
            return status;
        }

        /* Stream is drained, it does not need another sync */
        UCS_BITMAP_UNSET(iface->streams_to_sync,
                         uct_cuda_copy_flush_bitmap_idx(src_type, dst_type));
    }

    return UCS_OK;
}

static ucs_status_t uct_cuda_copy_iface_flush(uct_iface_h tl_iface, unsigned flags,
uct_completion_t *comp)
{
uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t);
uct_cuda_copy_queue_desc_t *q_desc;
ucs_queue_iter_t iter;
ucs_status_t status;

if (comp != NULL) {
return UCS_ERR_UNSUPPORTED;
}

status = uct_cuda_copy_sync_streams(iface);
if (status != UCS_OK) {
return status;
}

ucs_queue_for_each_safe(q_desc, iter, &iface->active_queue, queue) {
if (!ucs_queue_is_empty(&q_desc->event_queue)) {
UCT_TL_IFACE_STAT_FLUSH_WAIT(ucs_derived_of(tl_iface,
Expand Down Expand Up @@ -269,14 +303,29 @@ static ucs_status_t uct_cuda_copy_iface_event_fd_arm(uct_iface_h tl_iface,
return UCS_OK;
}

/*
 * Endpoint flush for cuda_copy: first synchronize all streams flagged in the
 * owning interface's streams_to_sync bitmap, then delegate to the base
 * endpoint flush.
 *
 * @param tl_ep  Endpoint to flush; its iface is a uct_cuda_copy_iface_t.
 * @param flags  Passed through to uct_base_ep_flush().
 * @param comp   Passed through to uct_base_ep_flush().
 *
 * @return The stream synchronization error if it failed, otherwise the
 *         result of uct_base_ep_flush().
 */
static ucs_status_t
uct_cuda_copy_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp)
{
    uct_cuda_copy_iface_t *cuda_iface = ucs_derived_of(tl_ep->iface,
                                                       uct_cuda_copy_iface_t);
    ucs_status_t result;

    result = uct_cuda_copy_sync_streams(cuda_iface);
    if (result == UCS_OK) {
        /* Streams are drained, complete the flush through the base layer */
        result = uct_base_ep_flush(tl_ep, flags, comp);
    }

    return result;
}

static uct_iface_ops_t uct_cuda_copy_iface_ops = {
.ep_get_short = uct_cuda_copy_ep_get_short,
.ep_put_short = uct_cuda_copy_ep_put_short,
.ep_get_zcopy = uct_cuda_copy_ep_get_zcopy,
.ep_put_zcopy = uct_cuda_copy_ep_put_zcopy,
.ep_pending_add = ucs_empty_function_return_busy,
.ep_pending_purge = ucs_empty_function,
.ep_flush = uct_base_ep_flush,
.ep_flush = uct_cuda_copy_ep_flush,
.ep_fence = uct_base_ep_fence,
.ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_copy_ep_t),
.ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_ep_t),
Expand Down Expand Up @@ -429,6 +478,7 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
self->config.max_poll = config->max_poll;
self->config.max_cuda_events = config->max_cuda_events;
self->config.bandwidth = config->bandwidth;
UCS_BITMAP_CLEAR(&self->streams_to_sync);

ucs_mpool_params_reset(&mp_params);
mp_params.elem_size = sizeof(uct_cuda_copy_event_desc_t);
Expand Down
39 changes: 38 additions & 1 deletion src/uct/cuda/cuda_copy/cuda_copy_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,39 @@
#ifndef UCT_CUDA_COPY_IFACE_H
#define UCT_CUDA_COPY_IFACE_H


#include <ucs/datastruct/bitmap.h>
#include <ucs/memory/memory_type.h>
#include <uct/base/uct_iface.h>
#include <uct/cuda/base/cuda_iface.h>
#include <ucs/memory/memory_type.h>

#include <pthread.h>


/* Size argument given to ucs_bitmap_t() for uct_cu_stream_bitmap_t.
 * NOTE(review): presumably the number of bits; it then has to be at least
 * UCS_MEMORY_TYPE_LAST * UCS_MEMORY_TYPE_LAST so that every index produced by
 * uct_cuda_copy_flush_bitmap_idx() fits - confirm against ucs bitmap API. */
#define UCT_CUDA_MEMORY_TYPES_MAP 64

typedef uint64_t uct_cuda_copy_iface_addr_t;


/*
 * uct_cu_stream_bitmap_t is treated as a 2D bitmap in which each bit
 * represents one CUstream of the queue_desc matrix: the row index is the
 * source memory type and the column index is the destination memory type.
 *
 * Illustration with four memory types
 * (H - Host, C - Cuda, R - ROCm, I - InfiniBand (RDMA)):
 *
 *       H C R I
 *     H 0 0 0 0
 *     C 0 0 0 0
 *     R 0 0 0 0
 *     I 0 0 0 0
 *
 * The flat bit index of a (source, destination) pair is computed by
 * uct_cuda_copy_flush_bitmap_idx(), and bits are set using:
 *   UCS_BITMAP_SET(bitmap,
 *                  uct_cuda_copy_flush_bitmap_idx(src_mem_type, dst_mem_type))
 */
typedef ucs_bitmap_t(UCT_CUDA_MEMORY_TYPES_MAP) uct_cu_stream_bitmap_t;

typedef struct uct_cuda_copy_queue_desc {
/* stream on which asynchronous memcpy operations are enqueued */
CUstream stream;
Expand Down Expand Up @@ -52,6 +76,10 @@ typedef struct uct_cuda_copy_iface {
void *event_arg;
uct_async_event_cb_t event_cb;
} async;

/* 2D bitmap marking which streams in the queue_desc matrix
must be synchronized during flush */
uct_cu_stream_bitmap_t streams_to_sync;
} uct_cuda_copy_iface_t;


Expand All @@ -68,4 +96,13 @@ typedef struct uct_cuda_copy_event_desc {
uct_completion_t *comp;
ucs_queue_elem_t queue;
} uct_cuda_copy_event_desc_t;


/*
 * Map a (source, destination) memory type pair to its flat bit index in the
 * streams_to_sync bitmap. Rows are source memory types and columns are
 * destination memory types, with UCS_MEMORY_TYPE_LAST entries per row.
 *
 * @param src_mem_type  Source memory type (row).
 * @param dst_mem_type  Destination memory type (column).
 *
 * @return src_mem_type * UCS_MEMORY_TYPE_LAST + dst_mem_type.
 */
static UCS_F_ALWAYS_INLINE unsigned
uct_cuda_copy_flush_bitmap_idx(ucs_memory_type_t src_mem_type,
                               ucs_memory_type_t dst_mem_type)
{
    const unsigned row_start = src_mem_type * UCS_MEMORY_TYPE_LAST;

    return row_start + dst_mem_type;
}

#endif

0 comments on commit 5c9e58c

Please sign in to comment.