From b1c1d7999897f011e29a3bd521f5b1501b651653 Mon Sep 17 00:00:00 2001
From: Alex Margolin
Date: Thu, 6 Jul 2017 17:54:31 +0300
Subject: [PATCH] UCP: Non-contiguous registration send/recv support for RC
 and DC (#5/6)

---
 src/ucp/api/ucp.h                    |   6 +-
 src/ucp/core/ucp_request.c           |   8 +
 src/uct/ib/base/ib_md.c              |   7 +
 src/uct/ib/base/ib_umr.c             | 285 ++++++++++++++++++++++++++++++---
 src/uct/ib/base/ib_umr.h             |  17 +-
 src/uct/ib/base/ib_verbs.h           |  11 +
 src/uct/ib/dc/base/dc_iface.c        |   7 +-
 src/uct/ib/dc/verbs/dc_verbs.c       |  46 ++++++
 src/uct/ib/rc/base/rc_ep.h           |   4 +
 src/uct/ib/rc/base/rc_iface.c        |  18 +-
 src/uct/ib/rc/verbs/rc_verbs_ep.c    |  44 +++++
 src/uct/ib/rc/verbs/rc_verbs_iface.c |   1 +
 test/gtest/ucs/test_rcache.cc        |   6 +-
 13 files changed, 435 insertions(+), 25 deletions(-)

diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h
index f278c9a5063..f6c63f54c6f 100644
--- a/src/ucp/api/ucp.h
+++ b/src/ucp/api/ucp.h
@@ -240,9 +240,9 @@ enum ucp_worker_attr_field {
 enum ucp_dt_type {
     UCP_DATATYPE_CONTIG   = 0,      /**< Contiguous datatype */
     UCP_DATATYPE_IOV      = 2,      /**< Scatter-gather list with multiple pointers */
-    UCP_DATATYPE_IOV_R    = 3,      /**< Same as IOV, but reusable */
-    UCP_DATATYPE_STRIDE   = 4,      /**< Interleaving a pointers to strided data */
-    UCP_DATATYPE_STRIDE_R = 5,      /**< Strided datatype */
+    //UCP_DATATYPE_IOV_R    = 3,    /**< Same as IOV, but reusable */
+    //UCP_DATATYPE_STRIDE   = 4,    /**< Interleaved pointers to strided data */
+    //UCP_DATATYPE_STRIDE_R = 5,    /**< Same as STRIDE, but reusable */
     UCP_DATATYPE_GENERIC  = 7,      /**< Generic datatype with
                                          user-defined pack/unpack routines */
     UCP_DATATYPE_SHIFT    = 3,      /**< Number of bits defining
diff --git a/src/ucp/core/ucp_request.c b/src/ucp/core/ucp_request.c
index a70699f5072..e4c11c195c7 100644
--- a/src/ucp/core/ucp_request.c
+++ b/src/ucp/core/ucp_request.c
@@ -214,6 +214,14 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_request_memory_reg,
             }
         }
         state->dt.iov.memh = memh;
+
+        /* If non-contiguous bind is not supported - use the existing mapping */
+        uct_md_attr = &context->tl_mds[mdi].attr;
+        if (!(uct_md_attr->cap.flags & UCT_MD_FLAG_REG_NC)) {
+            break;
+        }
+
+        status = ucp_dt_reusable_create(ep, buffer, length, datatype, state);
         break;
 
     default:
diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c
index 8f13ab552c3..b4265618626 100644
--- a/src/uct/ib/base/ib_md.c
+++ b/src/uct/ib/base/ib_md.c
@@ -170,6 +170,10 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr)
         md_attr->cap.flags |= UCT_MD_FLAG_ALLOC;
     }
 
+    if (IBV_EXP_HAVE_UMR(&md->dev.dev_attr)) {
+        md_attr->cap.flags |= UCT_MD_FLAG_REG_NC;
+    }
+
     md_attr->reg_cost   = md->reg_cost;
     md_attr->local_cpus = md->dev.local_cpus;
     return UCS_OK;
@@ -207,6 +211,9 @@ static ucs_status_t uct_ib_md_umr_qp_create(uct_ib_md_t *md)
     ucs_status_t status = UCS_ERR_IO_ERROR;
 
     ibdev = &md->dev;
+    if (!IBV_EXP_HAVE_UMR(&ibdev->dev_attr)) {
+        return UCS_ERR_UNSUPPORTED;
+    }
 
    /* TODO: fix port selection. It looks like active port should be used */
    port_num = ibdev->first_port;
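/*
 * Reviewer note (illustrative, not part of the patch): UCT_MD_FLAG_REG_NC is
 * only advertised when the device reports UMR support, so consumers should
 * probe the memory domain before attempting a non-contiguous registration.
 * A minimal sketch, assuming only the public uct_md_query() API plus the
 * flag added above:
 *
 *     static int md_supports_reg_nc(uct_md_h md)
 *     {
 *         uct_md_attr_t md_attr;
 *
 *         if (uct_md_query(md, &md_attr) != UCS_OK) {
 *             return 0;
 *         }
 *         return !!(md_attr.cap.flags & UCT_MD_FLAG_REG_NC);
 *     }
 */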
diff --git a/src/uct/ib/base/ib_umr.c b/src/uct/ib/base/ib_umr.c
index 34b0b0a5dd6..297d5fc5774 100644
--- a/src/uct/ib/base/ib_umr.c
+++ b/src/uct/ib/base/ib_umr.c
@@ -6,6 +6,11 @@
 
 #include "ib_umr.h"
 
+#define MAX_UMR_REPEAT_COUNT   ((uint32_t)-1)
+#define MAX_UMR_REPEAT_CYCLE   ((uint32_t)-1)
+#define MAX_UMR_REPEAT_STRIDE  ((uint16_t)-1)
+#define MAX_UMR_REPEAT_LENGTH  ((uint16_t)-1)
+
 #define UCT_IB_UMR_ACCESS_FLAGS  (IBV_ACCESS_LOCAL_WRITE  | \
                                   IBV_ACCESS_REMOTE_WRITE | \
                                   IBV_ACCESS_REMOTE_READ  | \
@@ -16,6 +21,12 @@
 static void uct_ib_umr_destroy(uct_ib_umr_t *umr)
 {
     switch (umr->wr.ext_op.umr.umr_type) {
+    case IBV_EXP_UMR_REPEAT:
+        ucs_free(umr->mem_strided);
+        ucs_free(umr->repeat_count);
+        ucs_free(umr->repeat_length);
+        ucs_free(umr->repeat_stride);
+        break;
     case IBV_EXP_UMR_MR_LIST:
         ucs_free(umr->mem_iov);
         break;
@@ -34,7 +45,12 @@ ucs_status_t uct_ib_umr_init(uct_ib_md_t *md, unsigned klm_cnt, uct_ib_umr_t *umr)
 {
     struct ibv_exp_create_mr_in mrin = {0};
 
-    umr->is_inline = 0; /* Temporary */
+    if (!klm_cnt) {
+        klm_cnt = IBV_DEVICE_UMR_CAPS(&md->dev.dev_attr, max_send_wqe_inline_klms);
+        umr->is_inline = 1;
+    } else {
+        umr->is_inline = 0;
+    }
 
     /* Create memory key */
     mrin.pd = md->pd;
@@ -70,10 +86,13 @@ static inline
 ucs_status_t uct_ib_umr_fill_wr(uct_ib_md_t *md, const uct_iov_t *iov,
                                 size_t iovcnt, uct_ib_umr_t *umr)
 {
+    unsigned dim_idx;
     unsigned mem_idx;
-    const uct_iov_t *entry = iov;
+    unsigned ilv_idx;
+    const uct_iov_t *tmp, *entry = iov;
     unsigned entry_idx = 0;
     struct ibv_mr *ib_mr;
+    size_t cycle_length;
 
     if (!umr->is_inline) {
         return UCS_ERR_UNSUPPORTED; // TODO: support...
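/*
 * Reviewer note (illustrative, not part of the patch): the MAX_UMR_REPEAT_*
 * limits above rely on unsigned wrap-around - casting -1 to an unsigned
 * type yields that type's maximum value:
 *
 *     ((uint32_t)-1) == UINT32_MAX == 4294967295
 *     ((uint16_t)-1) == UINT16_MAX == 65535
 *
 * so a repeated block may have up to 2^32-1 repetitions and a 2^32-1 byte
 * cycle, while each entry's stride and length must fit in 16 bits.
 */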
@@ -86,35 +105,235 @@ ucs_status_t uct_ib_umr_fill_wr(uct_ib_md_t *md, const uct_iov_t *iov,
     umr->wr.ext_op.umr.modified_mr = umr->mr;
     umr->wr.ext_op.umr.base_addr   = (uint64_t) entry->buffer;
 
+    if (entry->stride) {
+        umr->wr.ext_op.umr.umr_type = IBV_EXP_UMR_REPEAT;
+        umr->wr.ext_op.umr.mem_list.rb.mem_repeat_block_list = umr->mem_strided;
+        umr->wr.ext_op.umr.mem_list.rb.repeat_count          = umr->repeat_count;
+        umr->wr.ext_op.umr.mem_list.rb.stride_dim            = umr->stride_dim;
+        umr->repeat_count[0] = entry->count;
+        if (entry->count > MAX_UMR_REPEAT_COUNT) {
+            return UCS_ERR_UNSUPPORTED;
+        }
 
-    umr->wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST;
-    umr->wr.ext_op.umr.mem_list.mem_reg_list = umr->mem_iov;
-    while (entry_idx < iovcnt) {
-        ib_mr = md->umr.get_mr(entry->memh);
-        if (ib_mr->pd != umr->mr->pd) {
-            return UCS_ERR_INVALID_PARAM;
+        mem_idx      = 0;
+        cycle_length = 0;
+        while (entry_idx < iovcnt) {
+            if (umr->repeat_count[0] != entry->count) {
+                return UCS_ERR_UNSUPPORTED;
+            }
+
+            ib_mr = md->umr.get_mr(entry->memh);
+            if (ib_mr->pd != umr->mr->pd) {
+                return UCS_ERR_INVALID_PARAM;
+            }
+
+            for (tmp = entry, ilv_idx = entry->ilv_ratio; ilv_idx > 0; ilv_idx--) {
+                entry = tmp; /* repeat the same group of entries */
+                dim_idx = umr->stride_dim * mem_idx;
+                umr->mem_strided[mem_idx].base_addr  = (uint64_t) entry->buffer;
+                umr->mem_strided[mem_idx].mr         = ib_mr;
+                umr->mem_strided[mem_idx].stride     = &umr->repeat_stride[dim_idx];
+                umr->mem_strided[mem_idx].byte_count = &umr->repeat_length[dim_idx];
+
+                do {
+                    if ((entry->length > MAX_UMR_REPEAT_LENGTH) ||
+                        (entry->stride > MAX_UMR_REPEAT_STRIDE)) {
+                        return UCS_ERR_UNSUPPORTED;
+                    }
+
+                    umr->repeat_length[dim_idx] = entry->length;
+                    umr->repeat_stride[dim_idx] = entry->stride;
+                    cycle_length               += entry->length;
+                    dim_idx++;
+
+                    entry = &iov[++entry_idx];
+                } while (entry->buffer == NULL);
+                mem_idx++;
+            }
+        }
+        if (cycle_length > MAX_UMR_REPEAT_CYCLE) {
+            return UCS_ERR_UNSUPPORTED;
         }
+    } else {
+        umr->wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST;
+        umr->wr.ext_op.umr.mem_list.mem_reg_list = umr->mem_iov;
+        while (entry_idx < iovcnt) {
+            ib_mr = md->umr.get_mr(entry->memh);
+            if (ib_mr->pd != umr->mr->pd) {
+                return UCS_ERR_INVALID_PARAM;
+            }
 
-        umr->mem_iov[entry_idx].base_addr = (uint64_t) entry->buffer;
-        umr->mem_iov[entry_idx].mr        = ib_mr;
-        umr->mem_iov[entry_idx].length    = entry->length;
+            umr->mem_iov[entry_idx].base_addr = (uint64_t) entry->buffer;
+            umr->mem_iov[entry_idx].mr        = ib_mr;
+            umr->mem_iov[entry_idx].length    = entry->length;
 
-        entry = &iov[++entry_idx];
+            entry = &iov[++entry_idx];
+        }
+        mem_idx = iovcnt;
     }
-    mem_idx = iovcnt;
 
     umr->wr.ext_op.umr.num_mrs = mem_idx;
     return UCS_OK;
 }
 
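/*
 * Reviewer note (illustrative, not part of the patch): a sketch of how a
 * caller could describe a strided region for the IBV_EXP_UMR_REPEAT path
 * above - e.g. the first row_bytes of every row of a pitched matrix. It
 * assumes the extended uct_iov_t fields (stride/count/ilv_ratio) introduced
 * earlier in this patch series:
 *
 *     static void fill_strided_iov(uct_iov_t *iov, void *matrix,
 *                                  size_t row_bytes, size_t pitch,
 *                                  size_t nrows, uct_mem_h memh)
 *     {
 *         iov->buffer    = matrix;    // base_addr of the repeat block
 *         iov->length    = row_bytes; // becomes repeat_length[0] (byte_count)
 *         iov->stride    = pitch;     // becomes repeat_stride[0]
 *         iov->count     = nrows;     // becomes repeat_count[0]
 *         iov->ilv_ratio = 1;         // single buffer, no interleaving
 *         iov->memh      = memh;      // memh of the underlying registration
 *     }
 */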
umr_depth = iov_depth; + } + + /* Count stride dimension and KLMs required */ + if (iov[iov_idx].buffer == NULL) { + if (++dim_cnt >= max_dim) { + return UCS_ERR_UNSUPPORTED; + } + } else { + if (dim_cnt != dim_check) { + if (dim_check == (unsigned)-1) { + dim_check = dim_cnt; + } else { + return UCS_ERR_INVALID_PARAM; + } + } + if (iov[iov_idx].stride) { + dim_cnt = 1; + klm_cnt += iov[iov_idx].ilv_ratio; + } else { + dim_cnt = 0; + klm_cnt++; + } + } + } + + /* Check last iov array element */ + if (dim_cnt != dim_check) { + if (dim_check == (unsigned)-1) { + dim_check = dim_cnt; + } else { + return UCS_ERR_INVALID_PARAM; + } + } + + if (!max_dim && dim_check) { + return UCS_ERR_UNSUPPORTED; + } + + if (klm_cnt > IBV_DEVICE_UMR_CAPS(dev_attr, max_klm_list_size)) { + return UCS_ERR_UNSUPPORTED; + } + + *klms_needed = klm_cnt; + *stride_dim = dim_check; + *depth = umr_depth + 1; + return UCS_OK; +} + +static inline +ucs_status_t uct_ib_umr_alloc(uct_ib_md_t *md, unsigned klms, + unsigned stride_dim, uct_ib_umr_t **umr_p) +{ + uct_ib_umr_t *umr = ucs_mpool_get_inline(&md->umr.mp); + + if (klms > umr->klms) { + ucs_status_t status; + uct_ib_umr_finalize(umr); + status = uct_ib_umr_init(md, klms, umr); + if (status != UCS_OK) { + ucs_mpool_put_inline(umr); + return status; + } + } + + if (stride_dim) { + umr->stride_dim = stride_dim; + umr->mem_strided = ucs_malloc(klms * sizeof(*umr->mem_strided), "umr_repeat"); + if (umr->mem_strided == NULL) { + goto alloc_none; + } + + umr->repeat_count = ucs_malloc(stride_dim * sizeof(size_t), "umr_count"); + if (umr->repeat_count == NULL) { + goto alloc_strided; + } + + umr->repeat_length = ucs_malloc(stride_dim * klms * sizeof(size_t), "umr_length"); + if (umr->repeat_length == NULL) { + goto alloc_count; + } + + umr->repeat_stride = ucs_malloc(stride_dim * klms * sizeof(size_t), "umr_stride"); + if (umr->repeat_stride == NULL) { + goto alloc_length; + } + } else { + umr->mem_iov = ucs_malloc(klms * sizeof(struct ibv_exp_mem_region), "umr_iov"); + if (umr->mem_iov == NULL) { + goto alloc_none; + } + } + + umr->comp.count = 1; + *umr_p = umr; + return UCS_OK; + +alloc_length: + ucs_free(umr->repeat_length); +alloc_count: + ucs_free(umr->repeat_count); +alloc_strided: + ucs_free(umr->mem_strided); +alloc_none: + ucs_mpool_put_inline(umr); + return UCS_ERR_NO_MEMORY; +} + ucs_status_t uct_ib_umr_create(uct_ib_md_t *md, const uct_iov_t *iov, size_t iovcnt, uct_ep_t *tl_ep, ep_post_dereg_f dereg_f, uct_ib_umr_t **umr_p) { #if (HAVE_EXP_UMR && HAVE_EXP_UMR_NEW_API) ucs_status_t status; + unsigned klms_needed; + unsigned stride_dim; + unsigned umr_depth; + uct_ib_umr_t *umr; - uct_ib_umr_t *umr = ucs_mpool_get_inline(&md->umr.mp); + if (!IBV_EXP_HAVE_UMR(&md->dev.dev_attr)) { + return UCS_ERR_UNSUPPORTED; + } + + status = uct_ib_md_calc_required_klms(&md->dev.dev_attr, iov, iovcnt, + &klms_needed, &stride_dim, &umr_depth); + if (status != UCS_OK) { + return status; + } + + status = uct_ib_umr_alloc(md, klms_needed, stride_dim, &umr); + if (status != UCS_OK) { + return status; + } status = uct_ib_umr_fill_wr(md, iov, iovcnt, umr); if (status != UCS_OK) { @@ -122,6 +346,7 @@ ucs_status_t uct_ib_umr_create(uct_ib_md_t *md, const uct_iov_t *iov, return status; } + umr->depth = umr_depth; umr->dereg_f = dereg_f; umr->tl_ep = tl_ep; *umr_p = umr; @@ -214,6 +439,43 @@ ucs_status_t uct_ib_umr_reg_offset(uct_ib_md_t *md, struct ibv_mr *mr, #endif } +ucs_status_t uct_ib_umr_reg_nc(uct_md_h uct_md, const uct_iov_t *iov, + size_t iovcnt, uct_ep_h tl_ep, + ep_post_dereg_f 
+ucs_status_t uct_ib_umr_reg_nc(uct_md_h uct_md, const uct_iov_t *iov,
+                               size_t iovcnt, uct_ep_h tl_ep,
+                               ep_post_dereg_f dereg_f, uct_ib_mem_t *memh,
+                               struct ibv_exp_send_wr **wr_p)
+{
+#if (HAVE_EXP_UMR || HAVE_EXP_UMR_NEW_API)
+    uct_ib_umr_t *umr;
+    ucs_status_t status;
+
+    uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
+    if (ucs_unlikely(md->umr.qp == NULL)) {
+        return UCS_ERR_UNSUPPORTED;
+    }
+
+    UCS_STATS_UPDATE_COUNTER(md->stats, UCT_IB_MD_STAT_MEM_REG_NC, +1);
+    if (ucs_unlikely(memh->umr == NULL)) {
+        status = uct_ib_umr_create(md, iov, iovcnt, tl_ep, dereg_f, &umr);
+        if (status != UCS_OK) {
+            return status;
+        }
+
+        memh->mr        = umr->mr;
+        memh->umr       = umr;
+        memh->lkey      = umr->mr->lkey;
+        memh->flags     = UCT_IB_MEM_FLAG_NC_MR;
+        memh->umr_depth = umr->depth;
+        *wr_p = &umr->wr;
+        return UCS_OK;
+    }
+
+    *wr_p = &memh->umr->wr;
+    return uct_ib_umr_update_wr(iov, iovcnt, memh->umr);
+#else
+    return UCS_ERR_UNSUPPORTED;
+#endif
+}
+
 ucs_status_t uct_ib_umr_dereg_nc(uct_ib_umr_t *umr)
 {
 #if (HAVE_EXP_UMR || HAVE_EXP_UMR_NEW_API)
diff --git a/src/uct/ib/base/ib_umr.h b/src/uct/ib/base/ib_umr.h
index 6e804b6021c..b4cfcede852 100644
--- a/src/uct/ib/base/ib_umr.h
+++ b/src/uct/ib/base/ib_umr.h
@@ -24,7 +24,17 @@ typedef struct uct_ib_umr {
     uct_completion_t  comp;    /* completion routine */
     ep_post_dereg_f   dereg_f; /* endpoint WR posting function pointer */
     uct_ep_t         *tl_ep;   /* registering endpoint - for cleanup */
-    struct ibv_exp_mem_region *mem_iov;
+
+    union {
+        struct ibv_exp_mem_region *mem_iov;
+        struct {
+            struct ibv_exp_mem_repeat_block *mem_strided; //[UCT_IB_UMR_MAX_KLMS]
+            size_t   *repeat_length; //[UCT_IB_UMR_MAX_KLMS][stride_dim]
+            size_t   *repeat_stride; //[UCT_IB_UMR_MAX_KLMS][stride_dim]
+            size_t   *repeat_count;  //[stride_dim]
+            unsigned  stride_dim;
+        };
+    };
 } uct_ib_umr_t;
 
 ucs_status_t uct_ib_umr_init(uct_ib_md_t *md, unsigned klm_cnt, uct_ib_umr_t *umr);
@@ -36,6 +46,11 @@ ucs_status_t uct_ib_umr_reg_offset(uct_ib_md_t *md, struct ibv_mr *mr,
                                    off_t offset, struct ibv_mr **offset_mr,
                                    uct_ib_umr_t **umr_p);
 
+ucs_status_t uct_ib_umr_reg_nc(uct_md_h uct_md, const uct_iov_t *iov,
+                               size_t iovcnt, uct_ep_h tl_ep,
+                               ep_post_dereg_f dereg_f, uct_ib_mem_t *memh,
+                               struct ibv_exp_send_wr **wr_p);
+
 ucs_status_t uct_ib_umr_dereg_nc(uct_ib_umr_t *umr);
 
 #endif
diff --git a/src/uct/ib/base/ib_verbs.h b/src/uct/ib/base/ib_verbs.h
index 966a4b9548b..b4da824e581 100644
--- a/src/uct/ib/base/ib_verbs.h
+++ b/src/uct/ib/base/ib_verbs.h
@@ -193,6 +193,17 @@ static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq)
 #  define IBV_DEVICE_TM_CAPS(_dev, _field)   0
 #endif
 
+/*
+ * Fast memory registration (UMR) support
+ */
+#if HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_UMR_CAPS
+#  define IBV_EXP_HAVE_UMR(_attr)            ((_attr)->exp_device_cap_flags & IBV_EXP_DEVICE_UMR)
+#  define IBV_DEVICE_UMR_CAPS(_attr, _field) ((_attr)->umr_caps._field)
+#else
+#  define IBV_EXP_HAVE_UMR(_attr)            0
+#  define IBV_DEVICE_UMR_CAPS(_attr, _field) 0
+#endif
+
 typedef uint8_t uct_ib_uint24_t[3];
 
 
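/*
 * Reviewer note (illustrative, not part of the patch): the ib_verbs.h
 * helpers above compile away to 0 when the experimental UMR definitions are
 * missing, so callers may use them unconditionally. A sketch, assuming
 * dev_attr was filled in by ibv_exp_query_device():
 *
 *     static unsigned dev_umr_max_klms(const struct ibv_exp_device_attr *dev_attr)
 *     {
 *         if (!IBV_EXP_HAVE_UMR(dev_attr)) {
 *             return 0;  // no UMR support - the REG_NC path stays disabled
 *         }
 *         return IBV_DEVICE_UMR_CAPS(dev_attr, max_klm_list_size);
 *     }
 */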
diff --git a/src/uct/ib/dc/base/dc_iface.c b/src/uct/ib/dc/base/dc_iface.c
index ec023cdcd65..bc96e508d80 100644
--- a/src/uct/ib/dc/base/dc_iface.c
+++ b/src/uct/ib/dc/base/dc_iface.c
@@ -59,9 +59,10 @@ static ucs_status_t uct_dc_iface_create_dct(uct_dc_iface_t *iface)
     init_attr.dc_key           = UCT_IB_KEY;
     init_attr.port             = iface->super.super.config.port_num;
     init_attr.mtu              = iface->super.config.path_mtu;
-    init_attr.access_flags     = IBV_EXP_ACCESS_REMOTE_WRITE |
-                                 IBV_EXP_ACCESS_REMOTE_READ |
-                                 IBV_EXP_ACCESS_REMOTE_ATOMIC;
+    init_attr.access_flags     = IBV_EXP_ACCESS_REMOTE_WRITE |
+                                 IBV_EXP_ACCESS_REMOTE_READ |
+                                 IBV_EXP_ACCESS_REMOTE_ATOMIC |
+                                 IBV_EXP_ACCESS_MW_BIND;
     init_attr.min_rnr_timer    = iface->super.config.min_rnr_timer;
     init_attr.hop_limit        = 1;
     init_attr.inline_size      = iface->super.config.rx_inline;
diff --git a/src/uct/ib/dc/verbs/dc_verbs.c b/src/uct/ib/dc/verbs/dc_verbs.c
index 30f79009aea..a11f11cd755 100644
--- a/src/uct/ib/dc/verbs/dc_verbs.c
+++ b/src/uct/ib/dc/verbs/dc_verbs.c
@@ -724,6 +724,51 @@ static void uct_dc_verbs_iface_progress(void *arg)
     }
 }
 
+static void uct_dc_ep_dereg_nc(uct_ep_h tl_ep, struct ibv_exp_send_wr *wr,
+                               uct_completion_t *comp)
+{
+    uct_dc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_verbs_ep_t);
+    uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
+                                                 uct_dc_verbs_iface_t);
+
+    uct_dc_verbs_iface_post_send(iface, ep, wr, 0);
+    uct_dc_verbs_iface_add_send_comp(iface, ep, comp);
+}
+
+static ucs_status_t uct_dc_ep_reg_nc(uct_ep_h tl_ep, const uct_iov_t *iov,
+                                     size_t iovcnt, uct_md_h *md_p,
+                                     uct_mem_h *memh_p, uct_completion_t *comp)
+{
+    uct_ib_mem_t *memh = *memh_p;
+    ucs_status_t status;
+    struct ibv_exp_send_wr *wr;
+    uct_dc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_verbs_ep_t);
+    uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
+                                                 uct_dc_verbs_iface_t);
+
+    uct_md_h md = iface->super.super.super.super.md;
+    if (memh == NULL) {
+        status = md->ops->mem_reg(md, NULL, 0, UCT_MD_MEM_FLAG_EMPTY, (void**)&memh);
+        if (ucs_unlikely(status != UCS_OK)) {
+            return status;
+        }
+    }
+
+    status = uct_ib_umr_reg_nc(md, iov, iovcnt, tl_ep,
+                               uct_dc_ep_dereg_nc, memh, &wr);
+    if (ucs_unlikely(status != UCS_OK)) {
+        return status;
+    }
+
+    /* TODO: prevent DCI switch between UMR and its data send */
+    uct_dc_verbs_iface_post_send(iface, ep, wr, wr->exp_send_flags);
+    uct_dc_verbs_iface_add_send_comp(iface, ep, comp);
+
+    *md_p   = md;
+    *memh_p = memh;
+    return UCS_INPROGRESS;
+}
+
 static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_verbs_iface_t)(uct_iface_t*);
 
 static uct_dc_iface_ops_t uct_dc_verbs_iface_ops = {
@@ -746,6 +791,7 @@ static uct_dc_iface_ops_t uct_dc_verbs_iface_ops = {
     .ep_atomic_fadd32         = uct_dc_verbs_ep_atomic_fadd32,
     .ep_atomic_swap32         = uct_dc_verbs_ep_atomic_swap32,
     .ep_atomic_cswap32        = uct_dc_verbs_ep_atomic_cswap32,
+    .ep_mem_reg_nc            = uct_dc_ep_reg_nc,
    .ep_pending_add           = uct_dc_ep_pending_add,
    .ep_pending_purge         = uct_dc_ep_pending_purge,
    .ep_flush                 = uct_dc_verbs_ep_flush,
diff --git a/src/uct/ib/rc/base/rc_ep.h b/src/uct/ib/rc/base/rc_ep.h
index aa5af862e6c..2f503900b83 100644
--- a/src/uct/ib/rc/base/rc_ep.h
+++ b/src/uct/ib/rc/base/rc_ep.h
@@ -215,6 +215,10 @@ ucs_status_t uct_rc_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *n);
 
 void uct_rc_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb, void*arg);
 
+ucs_status_t uct_rc_ep_reg_nc(uct_ep_h tl_ep, const uct_iov_t *iov,
+                              size_t iovcnt, uct_md_h *md_p,
+                              uct_mem_h *memh_p, uct_completion_t *comp);
+
 ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter,
                                                   ucs_arbiter_elem_t *elem,
                                                   void *arg);
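/*
 * Reviewer note (illustrative, not part of the patch): both transports
 * implement the same asynchronous contract - the UMR work request is posted
 * on the endpoint's QP and UCS_INPROGRESS is returned, so the resulting
 * memh is only safe to use after 'comp' completes. A hypothetical caller
 * (names invented for illustration):
 *
 *     static ucs_status_t register_noncontig(uct_ep_h ep, const uct_iov_t *iov,
 *                                            size_t iovcnt, uct_completion_t *comp,
 *                                            uct_md_h *md_p, uct_mem_h *memh_p)
 *     {
 *         ucs_status_t status;
 *
 *         *memh_p = NULL; // NULL requests a fresh UMR key
 *         status  = uct_dc_ep_reg_nc(ep, iov, iovcnt, md_p, memh_p, comp);
 *         if (status != UCS_INPROGRESS) {
 *             return status; // synchronous failure
 *         }
 *         // progress the worker until 'comp' fires, then post sends
 *         return UCS_OK;
 *     }
 */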
diff --git a/src/uct/ib/rc/base/rc_iface.c b/src/uct/ib/rc/base/rc_iface.c
index c8adda704b9..df8180d248a 100644
--- a/src/uct/ib/rc/base/rc_iface.c
+++ b/src/uct/ib/rc/base/rc_iface.c
@@ -620,6 +620,21 @@ ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, int qp_type,
     qp_init_attr.max_inl_recv    = iface->config.rx_inline;
 #  endif
 
+#  if HAVE_EXP_UMR
+#    if (HAVE_IBV_EXP_QP_CREATE_UMR_CAPS || HAVE_EXP_UMR_NEW_API)
+    qp_init_attr.comp_mask        |= IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS;
+    qp_init_attr.max_inl_send_klms = dev->dev_attr.umr_caps.max_send_wqe_inline_klms;
+#    else
+    qp_init_attr.comp_mask        |= IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS;
+    qp_init_attr.max_inl_send_klms = dev->dev_attr.max_send_wqe_inline_klms;
+#    endif
+
+#    if HAVE_IBV_EXP_QP_CREATE_UMR
+    qp_init_attr.comp_mask        |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
+    qp_init_attr.exp_create_flags |= IBV_EXP_QP_CREATE_UMR;
+#    endif
+#  endif
+
     qp = ibv_exp_create_qp(dev->ibv_context, &qp_init_attr);
 #else
     qp = ibv_create_qp(uct_ib_iface_md(&iface->super)->pd, &qp_init_attr);
@@ -657,7 +672,8 @@ ucs_status_t uct_rc_iface_qp_init(uct_rc_iface_t *iface, struct ibv_qp *qp)
     qp_attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE  |
                               IBV_ACCESS_REMOTE_WRITE |
                               IBV_ACCESS_REMOTE_READ  |
-                              IBV_ACCESS_REMOTE_ATOMIC;
+                              IBV_ACCESS_REMOTE_ATOMIC |
+                              IBV_ACCESS_MW_BIND;
     ret = ibv_modify_qp(qp, &qp_attr,
                         IBV_QP_STATE      |
                         IBV_QP_PKEY_INDEX |
diff --git a/src/uct/ib/rc/verbs/rc_verbs_ep.c b/src/uct/ib/rc/verbs/rc_verbs_ep.c
index 49956c137b8..94833145fb2 100644
--- a/src/uct/ib/rc/verbs/rc_verbs_ep.c
+++ b/src/uct/ib/rc/verbs/rc_verbs_ep.c
@@ -826,6 +826,50 @@ ucs_status_t uct_rc_verbs_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag,
 
 #endif /* IBV_EXP_HW_TM */
 
+static void uct_rc_ep_dereg_nc(uct_ep_h tl_ep, struct ibv_exp_send_wr *wr,
+                               uct_completion_t *comp)
+{
+    uct_rc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
+    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
+                                                 uct_rc_verbs_iface_t);
+
+    uct_rc_verbs_exp_post_send(ep, wr, wr->exp_send_flags);
+    uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp, ep->txcnt.pi);
+}
+
+ucs_status_t uct_rc_ep_reg_nc(uct_ep_h tl_ep, const uct_iov_t *iov,
+                              size_t iovcnt, uct_md_h *md_p,
+                              uct_mem_h *memh_p, uct_completion_t *comp)
+{
+    uct_ib_mem_t *memh = *memh_p;
+    ucs_status_t status;
+    struct ibv_exp_send_wr *wr;
+    uct_rc_verbs_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
+    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
+                                                 uct_rc_verbs_iface_t);
+
+    uct_md_h md = iface->super.super.super.md;
+    if (memh == NULL) {
+        status = md->ops->mem_reg(md, NULL, 0, UCT_MD_MEM_FLAG_EMPTY, (void**)&memh);
+        if (ucs_unlikely(status != UCS_OK)) {
+            return status;
+        }
+    }
+
+    status = uct_ib_umr_reg_nc(md, iov, iovcnt, tl_ep,
+                               uct_rc_ep_dereg_nc, memh, &wr);
+    if (ucs_unlikely(status != UCS_OK)) {
+        return status;
+    }
+
+    uct_rc_verbs_exp_post_send(ep, wr, wr->exp_send_flags);
+    uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp, ep->txcnt.pi);
+
+    *md_p   = md;
+    *memh_p = memh;
+    return UCS_INPROGRESS;
+}
+
 UCS_CLASS_INIT_FUNC(uct_rc_verbs_ep_t, uct_iface_h tl_iface)
 {
     uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t);
diff --git a/src/uct/ib/rc/verbs/rc_verbs_iface.c b/src/uct/ib/rc/verbs/rc_verbs_iface.c
index fa5419b60c2..5d9e34efbaa 100644
--- a/src/uct/ib/rc/verbs/rc_verbs_iface.c
+++ b/src/uct/ib/rc/verbs/rc_verbs_iface.c
@@ -772,6 +772,7 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = {
     .ep_destroy               = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_verbs_ep_t),
     .ep_get_address           = uct_rc_verbs_ep_get_address,
     .ep_connect_to_ep         = uct_rc_verbs_ep_connect_to_ep,
+    .ep_mem_reg_nc            = uct_rc_ep_reg_nc,
     .iface_flush              = uct_rc_iface_flush,
     .iface_fence              = uct_base_iface_fence,
     .iface_progress_enable    = ucs_empty_function,
diff --git a/test/gtest/ucs/test_rcache.cc b/test/gtest/ucs/test_rcache.cc
index 7fe8be0d73a..74ada72eed5 100644
--- a/test/gtest/ucs/test_rcache.cc
+++ b/test/gtest/ucs/test_rcache.cc
@@ -51,7 +51,7 @@ class test_rcache : public ucs::test {
     region *get(void *address, size_t length, int prot = PROT_READ|PROT_WRITE) {
         ucs_status_t status;
         ucs_rcache_region_t *r;
-        status = ucs_rcache_get(m_rcache, address, length, prot, 0, &r);
+        status = ucs_rcache_get(m_rcache, address, length, prot, NULL, &r);
         ASSERT_UCS_OK(status);
         EXPECT_TRUE(r != NULL);
         struct region *region = ucs_derived_of(r, struct region);
@@ -123,7 +123,7 @@ class test_rcache : public ucs::test {
 
 private:
     static ucs_status_t mem_reg_cb(void *context, ucs_rcache_t *rcache,
-                                   unsigned flags, ucs_rcache_region_t *r)
+                                   void *arg, ucs_rcache_region_t *r)
     {
         return reinterpret_cast<test_rcache*>(context)->mem_reg(
                         ucs_derived_of(r, struct region));
@@ -494,7 +494,7 @@ UCS_MT_TEST_F(test_rcache_no_register, register_failure, 10) {
         ucs_status_t status;
         ucs_rcache_region_t *r;
-        status = ucs_rcache_get(m_rcache, ptr, size, PROT_READ|PROT_WRITE, 0, &r);
+        status = ucs_rcache_get(m_rcache, ptr, size, PROT_READ|PROT_WRITE, NULL, &r);
         EXPECT_EQ(UCS_ERR_IO_ERROR, status);
         EXPECT_EQ(0u, m_reg_count);
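/*
 * Reviewer note (illustrative, not part of the patch): these test updates
 * track the rcache hook signature change from an 'unsigned flags' word to
 * an opaque 'void *arg' (made earlier in this series); passing NULL
 * preserves the previous behavior. Minimal usage under that assumption:
 *
 *     ucs_rcache_region_t *r;
 *     ucs_status_t status = ucs_rcache_get(rcache, ptr, len,
 *                                          PROT_READ|PROT_WRITE, NULL, &r);
 *     if (status == UCS_OK) {
 *         // ... use the region ...
 *         ucs_rcache_region_put(rcache, r);
 *     }
 */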