From 10bf84ffa7709ed28ec2f4d52805034ea1b7e610 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 24 Oct 2024 11:19:43 -0700 Subject: [PATCH 1/7] prov/rxm: fix definition of the rxm SAR segment enum The rxm SAR segment type enum was defined inside another struct. While technically OK, this made it difficult for editors to find the type and caused editors to report compiler errors. This cleans it up to make the code more readable and the type easier for editors to find. Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 16074babeac..e2759d6d077 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -417,13 +417,15 @@ struct rxm_pkt { char data[]; }; +enum rxm_sar_seg_type { + RXM_SAR_SEG_FIRST = 1, + RXM_SAR_SEG_MIDDLE = 2, + RXM_SAR_SEG_LAST = 3, +}; + union rxm_sar_ctrl_data { struct { - enum rxm_sar_seg_type { - RXM_SAR_SEG_FIRST = 1, - RXM_SAR_SEG_MIDDLE = 2, - RXM_SAR_SEG_LAST = 3, - } seg_type : 2; + enum rxm_sar_seg_type seg_type : 2; uint32_t offset; }; uint64_t align; From 7e9369ca7deaa7c3a9f74bc7ef392edeba986711 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Wed, 23 Oct 2024 15:40:06 -0700 Subject: [PATCH 2/7] prov/rxm: add FI_AV_USER_ID support Add application-side support for FI_AV_USER_ID, which requires saving the fi_addr input as the internal fi_addr (for both the peer API srx use case and for reporting unique source address information). When supporting the capability for the application, remove it from the core provider information, as it is only required at the top layer.
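For background, FI_AV_USER_ID lets the application choose the fi_addr values that completions report for remote peers. A minimal usage sketch, illustrative only and not part of this patch (the av/addrs setup and the ID values are assumptions):

	/* Illustrative only: insert two peers with application-chosen
	 * IDs. Assumes 'av' was opened on a domain advertising
	 * FI_AV_USER_ID and 'addrs' holds two addresses in the AV's
	 * native format. */
	static int example_insert(struct fid_av *av, const void *addrs)
	{
		fi_addr_t ids[2] = { 100, 200 };	/* user-defined IDs */
		int ret;

		/* With FI_AV_USER_ID, ids[] is input: rxm saves each
		 * entry as the peer's internal fi_addr. */
		ret = fi_av_insert(av, addrs, 2, ids, FI_AV_USER_ID, NULL);
		if (ret < 0)
			return ret;
		/* Completions (e.g. fi_cq_readfrom) now report 100/200
		 * as the source fi_addr for these peers. */
		return 0;
	}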
Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm_attr.c | 2 +- prov/rxm/src/rxm_init.c | 3 +++ prov/util/src/rxm_av.c | 28 ++++++++++++++++++++++------ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index defa7771188..632543585e4 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -40,7 +40,7 @@ OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ FI_MULTI_RECV) -#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) +#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) /* Since we are a layering provider, the attributes for which we rely on the diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 78610bc5f04..1a76796d4e0 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -262,6 +262,9 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->op_flags &= ~FI_MULTI_RECV; + core_info->domain_attr->caps &= ~(FI_AV_USER_ID); + core_info->caps &= ~(FI_AV_USER_ID); + return 0; } diff --git a/prov/util/src/rxm_av.c b/prov/util/src/rxm_av.c index 69a68a884db..a5e30c95026 100644 --- a/prov/util/src/rxm_av.c +++ b/prov/util/src/rxm_av.c @@ -165,7 +165,7 @@ rxm_put_peer_addr(struct rxm_av *av, fi_addr_t fi_addr) static int rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, - fi_addr_t *fi_addr) + fi_addr_t *fi_addr, fi_addr_t *user_ids) { struct util_peer_addr *peer; const void *cur_addr; @@ -178,8 +178,12 @@ rxm_av_add_peers(struct rxm_av *av, const void *addr, size_t count, if (!peer) goto err; - peer->fi_addr = fi_addr ? + if (user_ids) { + peer->fi_addr = user_ids[i]; + } else { + peer->fi_addr = fi_addr ? fi_addr[i] : ofi_av_lookup_fi_addr(&av->util_av, cur_addr); + } /* lookup can fail if prior AV insertion failed */ if (peer->fi_addr != FI_ADDR_NOTAVAIL) @@ -276,21 +280,33 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { struct rxm_av *av; + fi_addr_t *user_ids = NULL; int ret; + if (flags & FI_AV_USER_ID) { + assert(fi_addr); + user_ids = calloc(count, sizeof(*user_ids)); + assert(user_ids); + memcpy(user_ids, fi_addr, sizeof(*fi_addr) * count); + } + av = container_of(av_fid, struct rxm_av, util_av.av_fid.fid); ret = ofi_ip_av_insert(av_fid, addr, count, fi_addr, flags, context); if (ret < 0) - return ret; + goto out; count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, user_ids); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); - return ret; + goto out; } +out: + free(user_ids); + if (ret) + return ret; return (int) count; } @@ -319,7 +335,7 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, if (ret > 0 && ret < count) count = ret; - ret = rxm_av_add_peers(av, addr, count, fi_addr); + ret = rxm_av_add_peers(av, addr, count, fi_addr, NULL); if (ret) { rxm_av_remove(av_fid, fi_addr, count, flags); return ret; From f4e3722fec5fca84fa0dafe6f2c74584fffa497d Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Mon, 21 Oct 2024 15:40:44 -0700 Subject: [PATCH 3/7] prov/rxm: add rxm support for using peer CQs and counters Support using the peer APIs by default through the util peer helper functions. Instead of going through the rxm-specific functions to write to CQs and counters, use the ofi_peer_cq/cntr APIs, which use the owner ops. In the default case, where rxm is not being used as a peer, these will go to the regular ofi_cq_write functions.
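To illustrate the pattern this commit adopts, a schematic of an rx completion write, condensed from the changes below ('ep', 'context', and 'len' are placeholders): the same call resolves to the normal util CQ write when rxm owns the CQ, and to the owner's CQ ops when rxm is imported as a peer.

	/* Schematic only: report an rx completion through the peer
	 * CQ API instead of calling ofi_cq_write() directly. */
	static void example_write_rx_comp(struct rxm_ep *ep, void *context,
					  size_t len)
	{
		int ret;

		ret = ofi_peer_cq_write(ep->util_ep.rx_cq, context,
					FI_MSG | FI_RECV, len, NULL, 0, 0,
					FI_ADDR_NOTAVAIL);
		if (ret)
			FI_WARN(&rxm_prov, FI_LOG_CQ,
				"Unable to report completion\n");
	}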
Signed-off-by: Alexia Ingerson --- prov/rxm/src/rxm.h | 77 +-------------- prov/rxm/src/rxm_cq.c | 200 +++++++++++++++++++++----------------- prov/rxm/src/rxm_ep.c | 44 +++++---- prov/rxm/src/rxm_msg.c | 4 +- prov/rxm/src/rxm_tagged.c | 15 +-- 5 files changed, 146 insertions(+), 194 deletions(-) diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index e2759d6d077..5d18f16e157 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -759,9 +759,10 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); - -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err); +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err); void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err); void rxm_handle_comp_error(struct rxm_ep *rxm_ep); ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp); @@ -880,50 +881,6 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr); -static inline void rxm_cntr_incerr(struct util_cntr *cntr) -{ - if (cntr) - cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1); -} - -static inline void -rxm_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write(cq, context, flags, len, buf, data, tag); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - -static inline void -rxm_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag, fi_addr_t addr) -{ - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); - - ret = ofi_cq_write_src(cq, context, flags, len, buf, data, tag, addr); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - assert(0); - } - if (cq->wait) - cq->wait->signal(cq->wait); -} - ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr, struct rxm_conn **rxm_conn); @@ -998,32 +955,6 @@ rxm_recv_entry_release(struct rxm_recv_entry *entry) ofi_buf_free(entry); } -static inline void -rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, uint64_t flags, - size_t len, char *buf) -{ - if (rx_buf->ep->util_coll_peer_xfer_ops && - rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { - struct fi_cq_tagged_entry cqe = { - .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, - }; - rx_buf->ep->util_coll_peer_xfer_ops-> - complete(rx_buf->ep->util_coll_ep, &cqe, 0); - return; - } - - if (rx_buf->ep->rxm_info->caps & FI_SOURCE) - rxm_cq_write_src(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, - rx_buf->conn->peer->fi_addr); - else - rxm_cq_write(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag); -} - struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); struct rxm_recv_entry * diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index 27c8cc6f1c0..b04b36444d3 100644 --- 
a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -101,6 +101,35 @@ static void rxm_replace_rx_buf(struct rxm_rx_buf *rx_buf) ofi_buf_free(new_rx_buf); } +static void rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, + uint64_t flags, size_t len, char *buf) +{ + int ret; + + if (rx_buf->ep->util_coll_peer_xfer_ops && + rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { + struct fi_cq_tagged_entry cqe = { + .tag = rx_buf->pkt.hdr.tag, + .op_context = rx_buf->recv_entry->context, + }; + rx_buf->ep->util_coll_peer_xfer_ops-> + complete(rx_buf->ep->util_coll_ep, &cqe, 0); + return; + } + if (rx_buf->ep->rxm_info->caps & FI_SOURCE) + ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, + rx_buf->conn->peer->fi_addr); + else + ret = ofi_peer_cq_write(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); + if (ret) + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to write rx completion\n"); +} + static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) { uint64_t flags; @@ -136,19 +165,19 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) int ret; if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED) - rxm_cntr_incerr(rx_buf->ep->util_ep.cntrs[CNTR_RX]); + ofi_ep_peer_rx_cntr_incerr(&rx_buf->ep->util_ep, ofi_op_msg); FI_WARN(&rxm_prov, FI_LOG_CQ, "Message truncated: " "recv buf length: %zu message length: %" PRIu64 "\n", done_len, rx_buf->pkt.hdr.size); - ret = ofi_cq_write_error_trunc(rx_buf->ep->util_ep.rx_cq, - rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags, - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov.iov[0].iov_base, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, - rx_buf->pkt.hdr.size - done_len); + ret = ofi_peer_cq_write_error_trunc( + rx_buf->ep->util_ep.rx_cq, + rx_buf->recv_entry->context, + rx_buf->recv_entry->comp_flags | + rx_buf->pkt.hdr.flags, rx_buf->pkt.hdr.size, + rx_buf->recv_entry->rxm_iov.iov[0].iov_base, + rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, + rx_buf->pkt.hdr.size - done_len); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n"); assert(0); @@ -166,16 +195,16 @@ static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) if (rx_buf->recv_entry->flags & FI_COMPLETION || rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { - rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags | - (rx_buf->recv_entry->flags & FI_MULTI_RECV), - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov. - iov[0].iov_base); - } - ofi_ep_cntr_inc(&rx_buf->ep->util_ep, CNTR_RX); - + rxm_cq_write_recv_comp( + rx_buf, rx_buf->recv_entry->context, + rx_buf->recv_entry->comp_flags | + rx_buf->pkt.hdr.flags | + (rx_buf->recv_entry->flags & FI_MULTI_RECV), + rx_buf->pkt.hdr.size, + rx_buf->recv_entry->rxm_iov. 
+ iov[0].iov_base); + } + ofi_ep_peer_rx_cntr_inc(&rx_buf->ep->util_ep, ofi_op_msg); release: rxm_recv_entry_release(recv_entry); rxm_free_rx_buf(rx_buf); @@ -186,8 +215,9 @@ rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags, void *app_context, uint64_t flags) { if (flags & FI_COMPLETION) { - rxm_cq_write(rxm_ep->util_ep.tx_cq, app_context, - comp_flags, 0, NULL, 0, 0); + (void) ofi_peer_cq_write(rxm_ep->util_ep.tx_cq, app_context, + comp_flags, 0, NULL, 0, 0, + FI_ADDR_NOTAVAIL); } } @@ -201,9 +231,9 @@ static void rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_tx_buf *rma_buf, rma_buf->flags); if (comp_flags & FI_WRITE) - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_write); else - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_read_req); if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local && rxm_ep->msg_mr_local) { @@ -219,7 +249,7 @@ void rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_buf) rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), tx_buf->app_context, tx_buf->flags); - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); } static bool rxm_complete_sar(struct rxm_ep *rxm_ep, @@ -259,7 +289,7 @@ static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep, return; rxm_cq_write_tx_comp(rxm_ep, comp_flags, app_context, tx_flags); - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); } static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf) @@ -295,7 +325,7 @@ static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, ofi_buf_free(tx_buf->write_rndv.done_buf); tx_buf->write_rndv.done_buf = NULL; } - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_TX); + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, ofi_op_msg); rxm_free_tx_buf(rxm_ep, tx_buf); } @@ -518,8 +548,8 @@ ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) rx_buf->recv_entry->rxm_iov.count, total_len, rx_buf); if (ret) { - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], rx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, rx_buf, + (int) ret); } return ret; } @@ -561,9 +591,8 @@ static ssize_t rxm_rndv_handle_wr_data(struct rxm_rx_buf *rx_buf) tx_buf->rma.count, total_len, tx_buf); if (ret) - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.cntrs[CNTR_RX], - tx_buf, (int) ret); + rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, tx_buf, (int) ret); + rxm_free_rx_buf(rx_buf); return ret; } @@ -986,9 +1015,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) static void rxm_handle_remote_write(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) { - rxm_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len, NULL, - comp->data, 0); - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, comp->len, + NULL, comp->data, 0, FI_ADDR_NOTAVAIL); + ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, ofi_op_write); if (comp->op_context) rxm_free_rx_buf(comp->op_context); } @@ -1222,10 +1251,7 @@ static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, } result_len = op == ofi_op_atomic ? 
0 : offset; - if (op == ofi_op_atomic) - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_WR); - else - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_REM_RD); + ofi_ep_peer_rx_cntr_inc(&rxm_ep->util_ep, op); return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, result_len, FI_SUCCESS); @@ -1236,7 +1262,6 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, { struct rxm_tx_buf *tx_buf; struct rxm_atomic_resp_hdr *resp_hdr; - struct util_cntr *cntr = NULL; uint64_t len; ssize_t copy_len; ssize_t ret = 0; @@ -1286,33 +1311,15 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), tx_buf->app_context, tx_buf->flags); - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_WR); - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - ofi_ep_cntr_inc(&rxm_ep->util_ep, CNTR_RD); - } else { - ret = -FI_EOPNOTSUPP; - goto write_err; - } + ofi_ep_peer_tx_cntr_inc(&rxm_ep->util_ep, tx_buf->pkt.hdr.op); free: rxm_free_rx_buf(rx_buf); rxm_free_tx_buf(rxm_ep, tx_buf); return ret; write_err: - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - cntr = rxm_ep->util_ep.cntrs[CNTR_WR]; - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - cntr = rxm_ep->util_ep.cntrs[CNTR_RD]; - } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "unknown atomic request op!\n"); - assert(0); - } - rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr, - tx_buf->app_context, (int) ret); + rxm_cq_write_tx_error(rxm_ep, tx_buf->pkt.hdr.op, tx_buf->app_context, + (int) ret); goto free; } @@ -1480,23 +1487,38 @@ ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) } } -void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, - void *op_context, int err) +void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) { struct fi_cq_err_entry err_entry = {0}; err_entry.op_context = op_context; err_entry.prov_errno = err; err_entry.err = -err; - if (cntr) - rxm_cntr_incerr(cntr); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, op); - if (ofi_cq_write_error(cq, &err_entry)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + if (ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry)) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } +void rxm_cq_write_rx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, + int err) +{ + struct fi_cq_err_entry err_entry = {0}; + err_entry.op_context = op_context; + err_entry.prov_errno = err; + err_entry.err = -err; + + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, op); + + if (ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry)) + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); +} + void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) { struct fi_cq_err_entry err_entry = {0}; @@ -1505,32 +1527,26 @@ void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) err_entry.prov_errno = err; err_entry.err = -err; if (rxm_ep->util_ep.tx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.tx_cq, &err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } if (rxm_ep->util_ep.rx_cq) { - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); + ret = ofi_peer_cq_write_error(rxm_ep->util_ep.rx_cq, 
&err_entry); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to ofi_cq_write_error\n"); + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } - if (rxm_ep->util_ep.cntrs[CNTR_TX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_TX]); - - if (rxm_ep->util_ep.cntrs[CNTR_RX]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RX]); - - if (rxm_ep->util_ep.cntrs[CNTR_WR]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_WR]); - if (rxm_ep->util_ep.cntrs[CNTR_RD]) - rxm_cntr_incerr(rxm_ep->util_ep.cntrs[CNTR_RD]); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_rx_cntr_incerr(&rxm_ep->util_ep, ofi_op_msg); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_write); + ofi_ep_peer_tx_cntr_incerr(&rxm_ep->util_ep, ofi_op_read_req); } void rxm_handle_comp_error(struct rxm_ep *rxm_ep) @@ -1583,7 +1599,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_INJECT_TX: rxm_free_tx_buf(rxm_ep, err_entry.op_context); if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); return; case RXM_CREDIT_TX: case RXM_ATOMIC_RESP_SENT: /* BUG: should have consumed tx credit */ @@ -1647,12 +1663,13 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) } if (cntr) - rxm_cntr_incerr(cntr); + cntr->peer_cntr->owner_ops->incerr(cntr->peer_cntr); assert(cq); - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1665,8 +1682,8 @@ ssize_t rxm_thru_comp(struct rxm_ep *ep, struct fi_cq_data_entry *comp) cq = (comp->flags & (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ)) ? ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write(cq, comp->op_context, comp->flags, comp->len, - comp->buf, comp->data, 0); + ret = ofi_peer_cq_write(cq, comp->op_context, comp->flags, comp->len, + comp->buf, comp->data, 0, FI_ADDR_NOTAVAIL); if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to report completion\n"); assert(0); @@ -1692,9 +1709,10 @@ void rxm_thru_comp_error(struct rxm_ep *ep) } cq = (err_entry.flags & FI_RECV) ? 
ep->util_ep.rx_cq : ep->util_ep.tx_cq; - ret = ofi_cq_write_error(cq, &err_entry); + ret = ofi_peer_cq_write_error(cq, &err_entry); if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to ofi_peer_cq_write_error\n"); assert(0); } } @@ -1730,8 +1748,8 @@ ssize_t rxm_cq_owner_write(struct fid_peer_cq *peer_cq, void *context, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, len, - buf, data, tag); + return ofi_peer_cq_write(&rxm_cq->util_cq, req->app_context, req->flags, + len, buf, data, tag, FI_ADDR_NOTAVAIL); } ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, @@ -1751,7 +1769,7 @@ ssize_t rxm_cq_owner_writeerr(struct fid_peer_cq *peer_cq, } rxm_cq = container_of(peer_cq, struct rxm_cq, peer_cq); - return ofi_cq_write_error(&rxm_cq->util_cq, &cqe_err); + return ofi_peer_cq_write_error(&rxm_cq->util_cq, &cqe_err); } int rxm_post_recv(struct rxm_rx_buf *rx_buf) diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index ba6a949122e..69a88e2caaf 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -746,9 +746,8 @@ rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, { rxm_ep_sar_tx_cleanup(def_tx_entry->rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->sar_seg.cur_seg_tx_buf); - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->sar_seg.app_context, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->sar_seg.app_context, (int) ret); } /* Returns FI_SUCCESS if the SAR deferred TX queue is empty, @@ -843,10 +842,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_ack.rx_buf-> - recv_entry->context, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_ack.rx_buf-> + recv_entry->context, (int) ret); } if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv .tx_buf->pkt.ctrl_hdr @@ -868,9 +867,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_TX], - def_tx_entry->rndv_done.tx_buf, (int) ret); + rxm_cq_write_tx_error(def_tx_entry->rxm_ep, + ofi_op_msg, + def_tx_entry->rndv_done.tx_buf, + (int) ret); } RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_done.tx_buf, @@ -888,10 +888,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_read.rx_buf-> + recv_entry->context, (int) ret); } break; case RXM_DEFERRED_TX_RNDV_WRITE: @@ -906,9 +906,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, if (ret) { if (ret == -FI_EAGAIN) return; - rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], - def_tx_entry->rndv_write.tx_buf, (int) ret); + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, ofi_op_msg, + def_tx_entry->rndv_write.tx_buf, + (int) ret); } break; case 
RXM_DEFERRED_TX_SAR_SEG: @@ -939,11 +940,12 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, OFI_PRIORITY); if (ret) { if (ret != -FI_EAGAIN) { - rxm_cq_write_error( - def_tx_entry->rxm_ep->util_ep.rx_cq, - def_tx_entry->rxm_ep->util_ep.cntrs[CNTR_RX], + rxm_cq_write_rx_error( + def_tx_entry->rxm_ep, + ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + recv_entry->context, + (int) ret); } return; } diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 46cd1cfe285..3b9088a2858 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ -140,8 +140,8 @@ rxm_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, if ((cur_iov.iov_len < ep->min_multi_recv_size) || (ret && cur_iov.iov_len != iov->iov_len)) { - rxm_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, - 0, NULL, 0, 0); + ofi_peer_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, + 0, NULL, 0, 0, FI_ADDR_NOTAVAIL); } return ret; diff --git a/prov/rxm/src/rxm_tagged.c b/prov/rxm/src/rxm_tagged.c index 78e3d3ff0e9..8f18f34b3eb 100644 --- a/prov/rxm/src/rxm_tagged.c +++ b/prov/rxm/src/rxm_tagged.c @@ -50,8 +50,9 @@ rxm_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + 0, NULL, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); rxm_free_rx_buf(rx_buf); } @@ -73,8 +74,8 @@ rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); if (!rx_buf) { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - ret = ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag, - context); + ret = ofi_peer_cq_write_error_peek( + rxm_ep->util_ep.rx_cq, tag, context); if (ret) FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); return; @@ -94,9 +95,9 @@ rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, dlist_remove(&rx_buf->unexp_msg.entry); } - rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + rx_buf->pkt.hdr.size, NULL, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); } static ssize_t From ee2111aa9886934921a7e0732ec4935f5aef2bd2 Mon Sep 17 00:00:00 2001 From: Alexia Ingerson Date: Thu, 31 Oct 2024 08:48:51 -0700 Subject: [PATCH 4/7] prov/rxm: replace rxm managed srx with util srx, support FI_PEER Remove rxm implementation of receive queues and leverage the util srx implementation which supports the peer srx API. This allows rxm to use the peer API calls to match receives. To do this, move the rxm protocol information from the receive entry into the rx_buf and allocate it dynamically as needed to track protocol information. This allows rxm to use the default peer_rx_entry instead of its own custom receive entry. With this last piece of the peer API implemented, rxm can also now advertise full support of the FI_PEER capability. 
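To make the new flow concrete, a schematic of the matching path, condensed from rxm_handle_recv_comp in this patch ('srx', 'match', and 'rx_buf' are set up by the surrounding code):

	/* Ask the owner srx for a matching posted receive. On
	 * -FI_ENOENT the owner returns an unexpected entry: stash the
	 * protocol state and queue it until a receive is posted. */
	ret = srx->owner_ops->get_msg(srx, &match, &rx_entry);
	if (ret == -FI_ENOENT) {
		rx_entry->peer_context = rx_buf;
		return srx->owner_ops->queue_msg(rx_entry);
	}
	rx_buf->peer_entry = rx_entry;
	return rxm_handle_rx_buf(rx_buf);	/* matched: deliver data */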
Just like the FI_AV_USER_ID capability, rxm removes the bit from the core provider info as it is only a requirement from the application side and not from the message provider Signed-off-by: Alexia Ingerson --- include/ofi_util.h | 5 +- prov/rxm/src/rxm.h | 119 +++----- prov/rxm/src/rxm_attr.c | 3 +- prov/rxm/src/rxm_conn.c | 13 +- prov/rxm/src/rxm_cq.c | 390 ++++++++++++++------------ prov/rxm/src/rxm_domain.c | 24 +- prov/rxm/src/rxm_ep.c | 566 +++++++++++++------------------------- prov/rxm/src/rxm_init.c | 4 +- prov/rxm/src/rxm_msg.c | 235 ++-------------- prov/rxm/src/rxm_tagged.c | 210 ++------------ prov/tcp/src/xnet_av.c | 2 +- prov/util/src/rxm_av.c | 15 +- 12 files changed, 548 insertions(+), 1038 deletions(-) diff --git a/include/ofi_util.h b/include/ofi_util.h index dda5c903e6e..bc590bb4d1a 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -955,12 +955,15 @@ struct rxm_av { struct fid_peer_av peer_av; struct fid_av *util_coll_av; struct fid_av *offload_coll_av; + void (*foreach_ep)(struct util_av *av, struct util_ep *util_ep); }; int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)); + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, + struct util_ep *ep)); size_t rxm_av_max_peers(struct rxm_av *av); void rxm_ref_peer(struct util_peer_addr *peer); void *rxm_av_alloc_conn(struct rxm_av *av); diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 5d18f16e157..93e08624fc1 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -183,9 +183,9 @@ do { \ extern struct fi_provider rxm_prov; extern struct util_prov rxm_util_prov; -extern struct fi_ops_msg rxm_msg_ops; +extern struct fi_ops_msg rxm_msg_ops, rxm_no_recv_msg_ops; extern struct fi_ops_msg rxm_msg_thru_ops; -extern struct fi_ops_tagged rxm_tagged_ops; +extern struct fi_ops_tagged rxm_tagged_ops, rxm_no_recv_tagged_ops; extern struct fi_ops_tagged rxm_tagged_thru_ops; extern struct fi_ops_rma rxm_rma_ops; extern struct fi_ops_rma rxm_rma_thru_ops; @@ -265,6 +265,8 @@ struct rxm_fabric { struct rxm_domain { struct util_domain util_domain; struct fid_domain *msg_domain; + struct fid_ep rx_ep; + struct fid_peer_srx *srx; size_t max_atomic_size; size_t rx_post_size; uint64_t mr_key; @@ -443,24 +445,29 @@ rxm_sar_set_seg_type(struct ofi_ctrl_hdr *ctrl_hdr, enum rxm_sar_seg_type seg_ty ((union rxm_sar_ctrl_data *)&(ctrl_hdr->ctrl_data))->seg_type = seg_type; } -struct rxm_recv_match_attr { - fi_addr_t addr; - uint64_t tag; - uint64_t ignore; -}; - -struct rxm_unexp_msg { - struct dlist_entry entry; - fi_addr_t addr; - uint64_t tag; -}; - struct rxm_iov { struct iovec iov[RXM_IOV_LIMIT]; void *desc[RXM_IOV_LIMIT]; uint8_t count; }; +struct rxm_proto_info { + /* Used for SAR protocol */ + struct { + struct dlist_entry entry; + struct dlist_entry pkt_list; + struct fi_peer_rx_entry *rx_entry; + size_t total_recv_len; + struct rxm_conn *conn; + uint64_t msg_id; + } sar; + /* Used for Rendezvous protocol */ + struct { + /* This is used to send RNDV ACK */ + struct rxm_tx_buf *tx_buf; + } rndv; +}; + struct rxm_buf { /* Must stay at top */ struct fi_context fi_context; @@ -478,9 +485,10 @@ struct rxm_rx_buf { /* MSG EP / shared context to which bufs would be posted to */ struct fid_ep *rx_ep; struct dlist_entry repost_entry; + struct dlist_entry unexp_entry; struct rxm_conn *conn; /* msg ep data was received on */ - struct rxm_recv_entry 
*recv_entry; - struct rxm_unexp_msg unexp_msg; + struct fi_peer_rx_entry *peer_entry; + struct rxm_proto_info *proto_info; uint64_t comp_flags; struct fi_recv_context recv_context; bool repost; @@ -608,49 +616,6 @@ struct rxm_deferred_tx_entry { }; }; -struct rxm_recv_entry { - struct dlist_entry entry; - struct rxm_iov rxm_iov; - fi_addr_t addr; - void *context; - uint64_t flags; - uint64_t tag; - uint64_t ignore; - uint64_t comp_flags; - size_t total_len; - struct rxm_recv_queue *recv_queue; - - /* Used for SAR protocol */ - struct { - struct dlist_entry entry; - size_t total_recv_len; - struct rxm_conn *conn; - uint64_t msg_id; - } sar; - /* Used for Rendezvous protocol */ - struct { - /* This is used to send RNDV ACK */ - struct rxm_tx_buf *tx_buf; - } rndv; -}; -OFI_DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs); - -enum rxm_recv_queue_type { - RXM_RECV_QUEUE_UNSPEC, - RXM_RECV_QUEUE_MSG, - RXM_RECV_QUEUE_TAGGED, -}; - -struct rxm_recv_queue { - struct rxm_ep *rxm_ep; - enum rxm_recv_queue_type type; - struct rxm_recv_fs *fs; - struct dlist_entry recv_list; - struct dlist_entry unexp_msg_list; - dlist_func_t *match_recv; - dlist_func_t *match_unexp; -}; - struct rxm_eager_ops { void (*comp_tx)(struct rxm_ep *rxm_ep, struct rxm_tx_buf *tx_eager_buf); @@ -690,6 +655,8 @@ struct rxm_ep { struct fi_ops_transfer_peer *offload_coll_peer_xfer_ops; uint64_t offload_coll_mask; + struct fid_peer_srx *srx; + struct fid_cq *msg_cq; uint64_t msg_cq_last_poll; size_t comp_per_progress; @@ -703,7 +670,6 @@ struct rxm_ep { bool do_progress; bool enable_direct_send; - size_t min_multi_recv_size; size_t buffered_min; size_t buffered_limit; size_t inject_limit; @@ -715,15 +681,13 @@ struct rxm_ep { struct ofi_bufpool *rx_pool; struct ofi_bufpool *tx_pool; struct ofi_bufpool *coll_pool; + struct ofi_bufpool *proto_info_pool; + struct rxm_pkt *inject_pkt; struct dlist_entry deferred_queue; struct dlist_entry rndv_wait_list; - struct rxm_recv_queue recv_queue; - struct rxm_recv_queue trecv_queue; - struct ofi_bufpool *multi_recv_pool; - struct rxm_eager_ops *eager_ops; struct rxm_rndv_ops *rndv_ops; }; @@ -757,6 +721,9 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); void rxm_cq_write_tx_error(struct rxm_ep *rxm_ep, uint8_t op, void *op_context, @@ -915,17 +882,10 @@ ssize_t rxm_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len); -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue); -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore); -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf); +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry); +int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context); + int rxm_post_recv(struct rxm_rx_buf *rx_buf); void rxm_av_remove_handler(struct util_ep *util_ep, struct util_peer_addr *peer); @@ 
-946,15 +906,6 @@ rxm_free_rx_buf(struct rxm_rx_buf *rx_buf) } } -static inline void -rxm_recv_entry_release(struct rxm_recv_entry *entry) -{ - if (entry->recv_queue) - ofi_freestack_push(entry->recv_queue->fs, entry); - else - ofi_buf_free(entry); -} - struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); struct rxm_recv_entry * diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index 632543585e4..6dc1241329e 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -40,7 +40,8 @@ OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ FI_MULTI_RECV) -#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID) +#define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_AV_USER_ID | \ + FI_PEER) /* Since we are a layering provider, the attributes for which we rely on the diff --git a/prov/rxm/src/rxm_conn.c b/prov/rxm/src/rxm_conn.c index afe603234ec..73b26f2a9f3 100644 --- a/prov/rxm/src/rxm_conn.c +++ b/prov/rxm/src/rxm_conn.c @@ -58,7 +58,7 @@ struct rxm_eq_cm_entry { static void rxm_close_conn(struct rxm_conn *conn) { struct rxm_deferred_tx_entry *tx_entry; - struct rxm_recv_entry *rx_entry; + struct fi_peer_rx_entry *rx_entry; struct rxm_rx_buf *buf; FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing conn %p\n", conn); @@ -74,16 +74,13 @@ static void rxm_close_conn(struct rxm_conn *conn) while (!dlist_empty(&conn->deferred_sar_segments)) { buf = container_of(conn->deferred_sar_segments.next, - struct rxm_rx_buf, unexp_msg.entry); - dlist_remove(&buf->unexp_msg.entry); - rxm_free_rx_buf(buf); + struct rxm_rx_buf, unexp_entry); + dlist_remove(&buf->unexp_entry); } while (!dlist_empty(&conn->deferred_sar_msgs)) { - rx_entry = container_of(conn->deferred_sar_msgs.next, - struct rxm_recv_entry, sar.entry); - dlist_remove(&rx_entry->entry); - rxm_recv_entry_release(rx_entry); + rx_entry = (struct fi_peer_rx_entry*)conn->deferred_sar_msgs.next; + rx_entry->srx->owner_ops->free_entry(rx_entry); } fi_close(&conn->msg_ep->fid); rxm_flush_msg_cq(conn->ep); diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index b04b36444d3..51206ddde04 100644 --- a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -106,11 +106,12 @@ static void rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, { int ret; + flags &= ~FI_COMPLETION; if (rx_buf->ep->util_coll_peer_xfer_ops && rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { struct fi_cq_tagged_entry cqe = { .tag = rx_buf->pkt.hdr.tag, - .op_context = rx_buf->recv_entry->context, + .op_context = rx_buf->peer_entry->context, }; rx_buf->ep->util_coll_peer_xfer_ops-> complete(rx_buf->ep->util_coll_ep, &cqe, 0); @@ -137,7 +138,7 @@ static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) && rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) { - dlist_insert_tail(&rx_buf->unexp_msg.entry, + dlist_insert_tail(&rx_buf->unexp_entry, &rx_buf->conn->deferred_sar_segments); rxm_replace_rx_buf(rx_buf); } @@ -172,10 +173,11 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) done_len, rx_buf->pkt.hdr.size); ret = ofi_peer_cq_write_error_trunc( rx_buf->ep->util_ep.rx_cq, - rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags, rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov.iov[0].iov_base, + rx_buf->peer_entry->context, + rx_buf->peer_entry->flags | + rx_buf->pkt.hdr.flags, + rx_buf->pkt.hdr.size, + rx_buf->peer_entry->iov[0].iov_base, rx_buf->pkt.hdr.data, 
rx_buf->pkt.hdr.tag, rx_buf->pkt.hdr.size - done_len); if (ret) { @@ -186,27 +188,22 @@ static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) { - struct rxm_recv_entry *recv_entry = rx_buf->recv_entry; - if (done_len < rx_buf->pkt.hdr.size) { rxm_cq_write_error_trunc(rx_buf, done_len); goto release; } - if (rx_buf->recv_entry->flags & FI_COMPLETION || + if (rx_buf->peer_entry->flags & FI_COMPLETION || rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { - rxm_cq_write_recv_comp( - rx_buf, rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rx_buf->pkt.hdr.flags | - (rx_buf->recv_entry->flags & FI_MULTI_RECV), - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov. - iov[0].iov_base); + rxm_cq_write_recv_comp(rx_buf, rx_buf->peer_entry->context, + rx_buf->peer_entry->flags | + rx_buf->pkt.hdr.flags, + rx_buf->pkt.hdr.size, + rx_buf->peer_entry->iov[0].iov_base); } ofi_ep_peer_rx_cntr_inc(&rx_buf->ep->util_ep, ofi_op_msg); release: - rxm_recv_entry_release(recv_entry); + rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry); rxm_free_rx_buf(rx_buf); } @@ -294,18 +291,20 @@ static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep, static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf) { + struct rxm_proto_info *proto_info; + RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH); - if (rx_buf->recv_entry->rndv.tx_buf) { - ofi_buf_free(rx_buf->recv_entry->rndv.tx_buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; + proto_info = rx_buf->proto_info; + if (proto_info->rndv.tx_buf) { + ofi_buf_free(proto_info->rndv.tx_buf); + ofi_buf_free(proto_info); } if (!rx_buf->ep->rdm_mr_local) - rxm_msg_mr_closev(rx_buf->mr, - rx_buf->recv_entry->rxm_iov.count); + rxm_msg_mr_closev(rx_buf->mr, rx_buf->peer_entry->count); - rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len); + rxm_finish_recv(rx_buf, rx_buf->peer_entry->msg_size); } static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, @@ -398,96 +397,135 @@ static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg) uint64_t msg_id = *((uint64_t *) arg); struct rxm_rx_buf *rx_buf; - rx_buf = container_of(item, struct rxm_rx_buf, unexp_msg.entry); + rx_buf = container_of(item, struct rxm_rx_buf, unexp_entry); return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id); } -static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done) +static void rxm_init_sar_proto(struct rxm_rx_buf *rx_buf) +{ + struct rxm_proto_info *proto_info; + + proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocate proto info buffer\n"); + return; + } + if (!rx_buf->conn) { + rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, + (int) rx_buf->pkt.ctrl_hdr.conn_id); + } + + proto_info->sar.conn = rx_buf->conn; + proto_info->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; + proto_info->sar.total_recv_len = 0; + proto_info->sar.rx_entry = rx_buf->peer_entry; + + dlist_insert_tail(&proto_info->sar.entry, + &rx_buf->conn->deferred_sar_msgs); + + dlist_init(&proto_info->sar.pkt_list); + if (rx_buf->peer_entry->peer_context) + dlist_insert_tail(&rx_buf->unexp_entry, + &proto_info->sar.pkt_list); + + + rx_buf->proto_info = proto_info; +} + +int rxm_process_seg_data(struct rxm_rx_buf *rx_buf) { enum fi_hmem_iface iface; + struct rxm_proto_info *proto_info; uint64_t device; ssize_t done_len; + int done = 0; - iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, -
rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, + proto_info = rx_buf->proto_info; + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, &device); done_len = ofi_copy_to_hmem_iov(iface, device, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, - rx_buf->recv_entry->sar.total_recv_len, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, + proto_info->sar.total_recv_len, rx_buf->pkt.data, rx_buf->pkt.ctrl_hdr.seg_size); assert(done_len == rx_buf->pkt.ctrl_hdr.seg_size); - rx_buf->recv_entry->sar.total_recv_len += done_len; + proto_info->sar.total_recv_len += done_len; if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) || (done_len != rx_buf->pkt.ctrl_hdr.seg_size)) { - - dlist_remove(&rx_buf->recv_entry->sar.entry); - - /* Mark rxm_recv_entry::msg_id as unknown for futher re-use */ - rx_buf->recv_entry->sar.msg_id = RXM_SAR_RX_INIT; - - done_len = rx_buf->recv_entry->sar.total_recv_len; - rx_buf->recv_entry->sar.total_recv_len = 0; - - *done = 1; + if (!rx_buf->peer_entry->peer_context) + dlist_remove(&proto_info->sar.entry); + done_len = proto_info->sar.total_recv_len; + done = 1; + ofi_buf_free(rx_buf->proto_info); rxm_finish_recv(rx_buf, done_len); } else { - if (rx_buf->recv_entry->sar.msg_id == RXM_SAR_RX_INIT) { - if (!rx_buf->conn) { - rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, - (int) rx_buf->pkt.ctrl_hdr.conn_id); - } - - rx_buf->recv_entry->sar.conn = rx_buf->conn; - rx_buf->recv_entry->sar.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; - - dlist_insert_tail(&rx_buf->recv_entry->sar.entry, - &rx_buf->conn->deferred_sar_msgs); - } - /* The RX buffer can be reposted for further re-use */ - rx_buf->recv_entry = NULL; + rx_buf->peer_entry = NULL; rxm_free_rx_buf(rx_buf); - - *done = 0; } + return done; } static void rxm_handle_seg_data(struct rxm_rx_buf *rx_buf) { - struct rxm_recv_entry *recv_entry; + struct rxm_proto_info *proto_info; + struct fi_peer_rx_entry *rx_entry; struct rxm_conn *conn; uint64_t msg_id; struct dlist_entry *entry; - int done; - rxm_process_seg_data(rx_buf, &done); - if (done || !(rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV)) + if (dlist_empty(&rx_buf->proto_info->sar.pkt_list)) { + rxm_process_seg_data(rx_buf); return; + } - recv_entry = rx_buf->recv_entry; + proto_info = rx_buf->proto_info; + dlist_insert_tail(&rx_buf->unexp_entry, &proto_info->sar.pkt_list); + + if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST)) + dlist_remove(&proto_info->sar.entry); + + rx_entry = rx_buf->peer_entry; conn = rx_buf->conn; msg_id = rx_buf->pkt.ctrl_hdr.msg_id; dlist_foreach_container_safe(&conn->deferred_sar_segments, struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id)) + unexp_entry, entry) { + if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_entry, &msg_id)) continue; - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - rxm_process_seg_data(rx_buf, &done); - if (done) + dlist_remove(&rx_buf->unexp_entry); + rx_buf->peer_entry = rx_entry; + if (rxm_process_seg_data(rx_buf)) break; } } +ssize_t rxm_handle_unexp_sar(struct fi_peer_rx_entry *peer_entry) +{ + struct rxm_proto_info *proto_info; + struct rxm_rx_buf *rx_buf; + + rx_buf = (struct rxm_rx_buf *) peer_entry->peer_context; + proto_info = rx_buf->proto_info; + + while (!dlist_empty(&proto_info->sar.pkt_list)) { + dlist_pop_front(&proto_info->sar.pkt_list, + struct 
rxm_rx_buf, rx_buf, unexp_entry); + rxm_process_seg_data(rx_buf); + } + peer_entry->peer_context = NULL; + return FI_SUCCESS; +} + static ssize_t rxm_rndv_xfer(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, struct rxm_rndv_hdr *remote_hdr, struct iovec *local_iov, void **local_desc, size_t local_count, size_t total_len, @@ -538,14 +576,15 @@ ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) ssize_t ret; size_t total_len; - total_len = MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size); + total_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); + rx_buf->peer_entry->msg_size = total_len; RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ); ret = rxm_rndv_xfer(rx_buf->ep, rx_buf->conn->msg_ep, rx_buf->remote_rndv_hdr, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, total_len, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, total_len, rx_buf); if (ret) { rxm_cq_write_rx_error(rx_buf->ep, ofi_op_msg, rx_buf, @@ -621,28 +660,26 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) rx_buf->rndv_rma_index = 0; if (!rx_buf->ep->rdm_mr_local) { - total_recv_len = MIN(rx_buf->recv_entry->total_len, + total_recv_len = MIN(rx_buf->peer_entry->msg_size, rx_buf->pkt.hdr.size); - ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, + ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, total_recv_len, rx_buf->ep->rndv_ops->rx_mr_access, rx_buf->mr); if (ret) return ret; - for (i = 0; (i < rx_buf->recv_entry->rxm_iov.count && + for (i = 0; (i < rx_buf->peer_entry->count && rx_buf->mr[i]); i++) { - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(rx_buf->mr[i]); + rx_buf->peer_entry->desc[i] = fi_mr_desc(rx_buf->mr[i]); } } else { struct rxm_mr *mr; - for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) { - mr = rx_buf->recv_entry->rxm_iov.desc[i]; - rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(mr->msg_mr); + for (i = 0; i < rx_buf->peer_entry->count; i++) { + mr = rx_buf->peer_entry->desc[i]; + rx_buf->peer_entry->desc[i] = fi_mr_desc(mr->msg_mr); rx_buf->mr[i] = mr->msg_mr; } } @@ -656,9 +693,9 @@ static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) void rxm_handle_eager(struct rxm_rx_buf *rx_buf) { ssize_t done_len = rxm_copy_to_hmem_iov( - rx_buf->recv_entry->rxm_iov.desc, rx_buf->data, - rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0); + rx_buf->peer_entry->desc, rx_buf->data, + rx_buf->pkt.hdr.size, rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -671,14 +708,14 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) uint64_t device; ssize_t done_len; - iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->peer_entry->iov, + rx_buf->peer_entry->desc, + rx_buf->peer_entry->count, &device); done_len = ofi_copy_to_hmem_iov(iface, device, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, 0, + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, 0, rx_buf->data, rx_buf->pkt.hdr.size); assert((size_t) done_len == rx_buf->pkt.hdr.size); @@ -686,11 +723,11 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) rx_buf->pkt.hdr.tag & RXM_PEER_XFER_TAG_FLAG) { struct fi_cq_tagged_entry cqe = { .tag = rx_buf->pkt.hdr.tag, - 
.op_context = rx_buf->recv_entry->context, + .op_context = rx_buf->peer_entry->context, }; rx_buf->ep->util_coll_peer_xfer_ops-> complete(rx_buf->ep->util_coll_ep, &cqe, 0); - rxm_recv_entry_release(rx_buf->recv_entry); + rx_buf->ep->srx->owner_ops->free_entry(rx_buf->peer_entry); rxm_free_rx_buf(rx_buf); } else { rxm_finish_recv(rx_buf, done_len); @@ -715,73 +752,26 @@ ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf) } } -static void rxm_adjust_multi_recv(struct rxm_rx_buf *rx_buf) +static inline void rxm_entry_prep_for_queue(struct fi_peer_rx_entry *rx_entry, + struct rxm_rx_buf *rx_buf) { - struct rxm_recv_entry *recv_entry; - struct iovec new_iov; - size_t recv_size; - - recv_size = rx_buf->pkt.hdr.size; - - if (rx_buf->recv_entry->rxm_iov.iov[0].iov_len < recv_size || - rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size < - rx_buf->ep->min_multi_recv_size) - return; - - new_iov.iov_base = (uint8_t *) - rx_buf->recv_entry->rxm_iov.iov[0].iov_base + recv_size; - new_iov.iov_len = rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size;; - - rx_buf->recv_entry->rxm_iov.iov[0].iov_len = recv_size; - - recv_entry = rxm_multi_recv_entry_get(rx_buf->ep, &new_iov, - rx_buf->recv_entry->rxm_iov.desc, 1, - rx_buf->recv_entry->addr, - rx_buf->recv_entry->tag, - rx_buf->recv_entry->ignore, - rx_buf->recv_entry->context, - rx_buf->recv_entry->flags); - - rx_buf->recv_entry->flags &= ~FI_MULTI_RECV; - - dlist_insert_head(&recv_entry->entry, &rx_buf->ep->recv_queue.recv_list); -} - -static ssize_t -rxm_match_rx_buf(struct rxm_rx_buf *rx_buf, - struct rxm_recv_queue *recv_queue, - struct rxm_recv_match_attr *match_attr) -{ - struct dlist_entry *entry; - - entry = dlist_remove_first_match(&recv_queue->recv_list, - recv_queue->match_recv, match_attr); - if (entry) { - rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry); - - if (rx_buf->recv_entry->flags & FI_MULTI_RECV) - rxm_adjust_multi_recv(rx_buf); - - return rxm_handle_rx_buf(rx_buf); + rx_entry->peer_context = rx_buf; + rx_buf->peer_entry = rx_entry; + if (rx_buf->pkt.hdr.flags & FI_REMOTE_CQ_DATA) { + rx_entry->flags |= FI_REMOTE_CQ_DATA; + rx_entry->cq_data = rx_buf->pkt.hdr.data; } - - RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for incoming msg", - match_attr->addr, match_attr->tag); - FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg queue\n"); - rx_buf->unexp_msg.addr = match_attr->addr; - rx_buf->unexp_msg.tag = match_attr->tag; - - dlist_insert_tail(&rx_buf->unexp_msg.entry, - &recv_queue->unexp_msg_list); + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); rxm_replace_rx_buf(rx_buf); - return 0; } static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) { - struct rxm_recv_match_attr match_attr = { - .addr = FI_ADDR_UNSPEC, - }; + struct fid_peer_srx *srx = rx_buf->ep->srx; + struct fi_peer_rx_entry *rx_entry; + struct fi_peer_match_attr match = {0}; + int ret; if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) { if (rx_buf->ep->msg_srx) @@ -789,7 +779,9 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) (int) rx_buf->pkt.ctrl_hdr.conn_id); if (!rx_buf->conn) return -FI_EOTHER; - match_attr.addr = rx_buf->conn->peer->fi_addr; + match.addr = rx_buf->conn->peer->fi_addr; + } else { + match.addr = FI_ADDR_UNSPEC; } if (rx_buf->ep->rxm_info->mode & OFI_BUFFERED_RECV) { @@ -799,33 +791,52 @@ static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) switch(rx_buf->pkt.hdr.op) { case ofi_op_msg: + match.msg_size = 
rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got MSG op\n"); - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue, - &match_attr); + ret = srx->owner_ops->get_msg(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return srx->owner_ops->queue_msg(rx_entry); + } + rx_entry->peer_context = NULL; + break; case ofi_op_tagged: + match.tag = rx_buf->pkt.hdr.tag; + match.msg_size = rx_buf->pkt.hdr.size; FI_DBG(&rxm_prov, FI_LOG_CQ, "Got TAGGED op\n"); - match_attr.tag = rx_buf->pkt.hdr.tag; - return rxm_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue, - &match_attr); + ret = srx->owner_ops->get_tag(srx, &match, &rx_entry); + if (ret == -FI_ENOENT) { + rxm_entry_prep_for_queue(rx_entry, rx_buf); + return srx->owner_ops->queue_tag(rx_entry); + } + rx_entry->peer_context = NULL; + break; default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown op!\n"); assert(0); return -FI_EINVAL; } + rx_buf->peer_entry = rx_entry; + + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) + rxm_init_sar_proto(rx_buf); + + return rxm_handle_rx_buf(rx_buf); } static int rxm_sar_match_msg_id(struct dlist_entry *item, const void *arg) { uint64_t msg_id = *((uint64_t *) arg); - struct rxm_recv_entry *recv_entry; + struct rxm_proto_info *proto_info; - recv_entry = container_of(item, struct rxm_recv_entry, sar.entry); - return (msg_id == recv_entry->sar.msg_id); + proto_info = container_of(item, struct rxm_proto_info, sar.entry); + return (msg_id == proto_info->sar.msg_id); } static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) { struct dlist_entry *sar_entry; + struct rxm_proto_info *proto_info; rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, (int) rx_buf->pkt.ctrl_hdr.conn_id); @@ -841,8 +852,9 @@ static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) if (!sar_entry) return rxm_handle_recv_comp(rx_buf); - rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry, - sar.entry); + proto_info = container_of(sar_entry, struct rxm_proto_info, sar.entry); + rx_buf->peer_entry = proto_info->sar.rx_entry; + rx_buf->proto_info = proto_info; rxm_handle_seg_data(rx_buf); return 0; } @@ -860,8 +872,15 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf) ret = -FI_ENOMEM; goto err; } + rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!rx_buf->proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocated proto info buf\n"); + assert(0); + return; + } - rx_buf->recv_entry->rndv.tx_buf = buf; + rx_buf->proto_info->rndv.tx_buf = buf; buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_rd_done; buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index; @@ -888,8 +907,9 @@ static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf) return; free: + rx_buf->proto_info->rndv.tx_buf = NULL; + ofi_buf_free(rx_buf->proto_info); ofi_buf_free(buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; err: FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to allocate/send rd rndv ack: %s\n", @@ -968,14 +988,22 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) goto err; } - rx_buf->recv_entry->rndv.tx_buf = buf; + rx_buf->proto_info = ofi_buf_alloc(rx_buf->ep->proto_info_pool); + if (!rx_buf->proto_info) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to allocated proto info buf\n"); + return -FI_ENOMEM; + } + + rx_buf->proto_info->rndv.tx_buf = buf; + buf->pkt.ctrl_hdr.type = rxm_ctrl_rndv_wr_data; buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->remote_index; buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; rxm_rndv_hdr_init(rx_buf->ep, buf->pkt.data, - 
rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, rx_buf->mr); + rx_buf->peer_entry->iov, + rx_buf->peer_entry->count, rx_buf->mr); ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt) + sizeof(struct rxm_rndv_hdr), buf->hdr.desc, 0, rx_buf); @@ -999,8 +1027,9 @@ ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) return 0; free: + rx_buf->proto_info->rndv.tx_buf = NULL; + ofi_buf_free(rx_buf->proto_info); ofi_buf_free(buf); - rx_buf->recv_entry->rndv.tx_buf = NULL; err: FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to allocate/send wr rndv ready: %s\n", @@ -1638,7 +1667,7 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) * the event yet. */ rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - if (!rx_buf->recv_entry) { + if (!rx_buf->peer_entry) { ofi_buf_free((struct rxm_rx_buf *)err_entry.op_context); return; } @@ -1647,9 +1676,9 @@ void rxm_handle_comp_error(struct rxm_ep *rxm_ep) case RXM_RNDV_WRITE_DATA_SENT: /* BUG: should fail initial send */ case RXM_RNDV_READ: rx_buf = (struct rxm_rx_buf *) err_entry.op_context; - assert(rx_buf->recv_entry); - err_entry.op_context = rx_buf->recv_entry->context; - err_entry.flags = rx_buf->recv_entry->comp_flags; + assert(rx_buf->peer_entry); + err_entry.op_context = rx_buf->peer_entry->context; + err_entry.flags = rx_buf->peer_entry->flags; cq = rx_buf->ep->util_ep.rx_cq; cntr = rx_buf->ep->util_ep.cntrs[CNTR_RX]; @@ -1780,7 +1809,8 @@ int rxm_post_recv(struct rxm_rx_buf *rx_buf) if (rx_buf->ep->msg_srx) rx_buf->conn = NULL; rx_buf->hdr.state = RXM_RX; - rx_buf->recv_entry = NULL; + rx_buf->peer_entry = NULL; + rx_buf->proto_info = NULL; domain = container_of(rx_buf->ep->util_ep.domain, struct rxm_domain, util_domain); @@ -1858,7 +1888,7 @@ void rxm_ep_do_progress(struct util_ep *util_ep) rxm_conn_progress(rxm_ep); } } else { - rxm_conn_progress(rxm_ep); + rxm_conn_progress(rxm_ep); } } } while ((ret > 0) && (comp_read < rxm_ep->comp_per_progress)); @@ -1975,6 +2005,9 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err1; + if (attr->flags & FI_PEER) + goto out; + rxm_domain = container_of(domain, struct rxm_domain, util_domain.domain_fid); @@ -1996,11 +2029,12 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err2; } + rxm_cq->util_cq.cq_fid.ops = &rxm_cq_ops; +out: *cq_fid = &rxm_cq->util_cq.cq_fid; /* Override util_cq_fi_ops */ (*cq_fid)->fid.ops = &rxm_cq_fi_ops; - (*cq_fid)->ops = &rxm_cq_ops; return 0; err2: diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c index 055fca16bea..9fcadf56763 100644 --- a/prov/rxm/src/rxm_domain.c +++ b/prov/rxm/src/rxm_domain.c @@ -221,6 +221,25 @@ static struct fi_ops_av_owner rxm_av_owner_ops = { .ep_addr = rxm_peer_av_ep_addr, }; +static fi_addr_t rxm_get_addr(struct fi_peer_rx_entry *rx_entry) +{ + struct rxm_rx_buf *rx_buf = rx_entry->peer_context; + + return rx_buf->conn->peer->fi_addr; +} + +static void rxm_foreach_ep(struct util_av *av, struct util_ep *ep) +{ + struct rxm_ep *rxm_ep; + struct fid_peer_srx *peer_srx; + + rxm_ep = container_of(ep, struct rxm_ep, util_ep); + peer_srx = container_of(rxm_ep->srx, struct fid_peer_srx, ep_fid); + if (peer_srx) + peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &rxm_get_addr); +} + + static int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) @@ -236,7 +255,8 @@ rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, ret = rxm_util_av_open(domain_fid, attr, &fid_av_new, 
context, sizeof(struct rxm_conn), - ofi_av_remove_cleanup ? rxm_av_remove_handler : NULL); + ofi_av_remove_cleanup ? rxm_av_remove_handler : NULL, + &rxm_foreach_ep); if (ret) return ret; @@ -346,7 +366,7 @@ static struct fi_ops_domain rxm_domain_ops = { .cntr_open = rxm_cntr_open, .poll_open = fi_poll_create, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_no_srx_context, + .srx_ctx = rxm_srx_context, .query_atomic = rxm_ep_query_atomic, .query_collective = rxm_query_collective, }; diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index 69a88e2caaf..b967643c0c5 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -42,79 +42,6 @@ #include "rxm.h" -static int rxm_match_noop(struct dlist_entry *item, const void *arg) -{ - OFI_UNUSED(item); - OFI_UNUSED(arg); - return 1; -} - -static int rxm_match_recv_entry(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr); -} - -static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return ofi_match_addr(recv_entry->addr, attr->addr) && - ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} - -static int rxm_match_recv_entry_context(struct dlist_entry *item, const void *context) -{ - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, entry); - return recv_entry->context == context; -} - -static fi_addr_t rxm_get_unexp_addr(struct rxm_unexp_msg *unexp_msg) -{ - struct rxm_rx_buf *rx_buf; - - rx_buf = container_of(unexp_msg, struct rxm_rx_buf, unexp_msg); - return (unexp_msg->addr != FI_ADDR_UNSPEC) ? 
- unexp_msg->addr : rx_buf->conn->peer->fi_addr; -} - -static int rxm_match_unexp_msg(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)); -} - -static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - -static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *arg) -{ - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; - struct rxm_unexp_msg *unexp_msg = - container_of(item, struct rxm_unexp_msg, entry); - return ofi_match_addr(attr->addr, rxm_get_unexp_addr(unexp_msg)) && - ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); -} - static int rxm_buf_reg(struct ofi_bufpool_region *region) { struct rxm_ep *rxm_ep = region->pool->attr.context; @@ -158,6 +85,7 @@ static void rxm_init_rx_buf(struct ofi_bufpool_region *region, void *buf) fi_mr_desc((struct fid_mr *) region->context) : NULL; rx_buf->ep = ep; rx_buf->data = &rx_buf->pkt.data; + dlist_init(&rx_buf->unexp_entry); } static void rxm_init_tx_buf(struct ofi_bufpool_region *region, void *buf) @@ -186,69 +114,6 @@ static void rxm_buf_close(struct ofi_bufpool_region *region) } } -static void rxm_recv_entry_init(struct rxm_recv_entry *entry, void *arg) -{ - struct rxm_recv_queue *recv_queue = arg; - - assert(recv_queue->type != RXM_RECV_QUEUE_UNSPEC); - - entry->recv_queue = recv_queue; - entry->sar.msg_id = RXM_SAR_RX_INIT; - entry->sar.total_recv_len = 0; - /* set it to NULL to differentiate between regular ACKs and those - * sent with FI_INJECT */ - entry->rndv.tx_buf = NULL; - entry->comp_flags = FI_RECV; - - if (recv_queue->type == RXM_RECV_QUEUE_MSG) - entry->comp_flags |= FI_MSG; - else - entry->comp_flags |= FI_TAGGED; -} - -static int rxm_recv_queue_init(struct rxm_ep *rxm_ep, struct rxm_recv_queue *recv_queue, - size_t size, enum rxm_recv_queue_type type) -{ - recv_queue->rxm_ep = rxm_ep; - recv_queue->type = type; - recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, - recv_queue); - if (!recv_queue->fs) - return -FI_ENOMEM; - - dlist_init(&recv_queue->recv_list); - dlist_init(&recv_queue->unexp_msg_list); - if (type == RXM_RECV_QUEUE_MSG) { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry; - recv_queue->match_unexp = rxm_match_unexp_msg; - } else { - recv_queue->match_recv = rxm_match_noop; - recv_queue->match_unexp = rxm_match_noop; - } - } else { - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - recv_queue->match_recv = rxm_match_recv_entry_tag_addr; - recv_queue->match_unexp = rxm_match_unexp_msg_tag_addr; - } else { - recv_queue->match_recv = rxm_match_recv_entry_tag; - recv_queue->match_unexp = rxm_match_unexp_msg_tag; - } - } - - return 0; -} - -static void rxm_recv_queue_close(struct rxm_recv_queue *recv_queue) -{ - /* It indicates that the recv_queue were allocated */ - if (recv_queue->fs) { - rxm_recv_fs_free(recv_queue->fs); - recv_queue->fs = NULL; - } - // TODO cleanup recv_list and unexp msg list -} - static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) { struct ofi_bufpool_attr attr = {0}; @@ -287,8 +152,18 @@ static int 
rxm_ep_create_pools(struct rxm_ep *rxm_ep) "Unable to create peer xfer context pool\n"); goto free_tx_pool; } - return 0; + attr.size = sizeof(struct rxm_proto_info); + attr.alloc_fn = NULL; + attr.free_fn = NULL; + attr.init_fn = NULL; + ret = ofi_bufpool_create_attr(&attr, &rxm_ep->proto_info_pool); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "Unable to create proto info pool\n"); + goto free_tx_pool; + } + return 0; free_tx_pool: ofi_bufpool_destroy(rxm_ep->tx_pool); @@ -298,62 +173,13 @@ static int rxm_ep_create_pools(struct rxm_ep *rxm_ep) return ret; } -static int rxm_multi_recv_pool_init(struct rxm_ep *rxm_ep) -{ - struct ofi_bufpool_attr attr = { - .size = sizeof(struct rxm_recv_entry), - .alignment = 16, - .max_cnt = 0, - .chunk_cnt = 16, - .alloc_fn = NULL, - .init_fn = NULL, - .context = rxm_ep, - .flags = OFI_BUFPOOL_NO_TRACK, - }; - - return ofi_bufpool_create_attr(&attr, &rxm_ep->multi_recv_pool); -} - -static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep) -{ - int ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->recv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_MSG); - if (ret) - return ret; - - ret = rxm_recv_queue_init(rxm_ep, &rxm_ep->trecv_queue, - rxm_ep->rxm_info->rx_attr->size, - RXM_RECV_QUEUE_TAGGED); - if (ret) - goto err_recv_tag; - - ret = rxm_multi_recv_pool_init(rxm_ep); - if (ret) - goto err_multi; - - return FI_SUCCESS; - -err_multi: - rxm_recv_queue_close(&rxm_ep->trecv_queue); -err_recv_tag: - rxm_recv_queue_close(&rxm_ep->recv_queue); - return ret; -} - /* It is safe to call this function, even if `rxm_ep_txrx_res_open` * has not yet been called */ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) { - rxm_recv_queue_close(&ep->trecv_queue); - rxm_recv_queue_close(&ep->recv_queue); + if (ep->srx && ep->util_ep.ep_fid.msg != &rxm_no_recv_msg_ops) + (void) util_srx_close(&ep->srx->ep_fid.fid); - if (ep->multi_recv_pool) { - ofi_bufpool_destroy(ep->multi_recv_pool); - ep->multi_recv_pool = NULL; - } if (ep->rx_pool) { ofi_bufpool_destroy(ep->rx_pool); ep->rx_pool = NULL; @@ -362,6 +188,10 @@ static void rxm_ep_txrx_res_close(struct rxm_ep *ep) ofi_bufpool_destroy(ep->tx_pool); ep->tx_pool = NULL; } + if (ep->proto_info_pool) { + ofi_bufpool_destroy(ep->proto_info_pool); + ep->proto_info_pool = NULL; + } if (ep->coll_pool) { ofi_bufpool_destroy(ep->coll_pool); ep->coll_pool = NULL; @@ -420,53 +250,13 @@ static struct rxm_eager_ops coll_eager_ops = { .handle_rx = rxm_handle_coll_eager, }; -static bool rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, - struct rxm_recv_queue *recv_queue, void *context) -{ - struct fi_cq_err_entry err_entry; - struct rxm_recv_entry *recv_entry; - struct dlist_entry *entry; - int ret; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - entry = dlist_remove_first_match(&recv_queue->recv_list, - rxm_match_recv_entry_context, - context); - if (!entry) - goto unlock; - - recv_entry = container_of(entry, struct rxm_recv_entry, entry); - memset(&err_entry, 0, sizeof(err_entry)); - err_entry.op_context = recv_entry->context; - err_entry.flags |= recv_entry->comp_flags; - err_entry.tag = recv_entry->tag; - err_entry.err = FI_ECANCELED; - err_entry.prov_errno = -FI_ECANCELED; - rxm_recv_entry_release(recv_entry); - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - assert(0); - } - -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return entry != NULL; -} - static ssize_t rxm_ep_cancel(fid_t fid_ep, void *context) { struct rxm_ep *ep; ep 
= container_of(fid_ep, struct rxm_ep, util_ep.ep_fid); - if (rxm_passthru_info(ep->rxm_info)) - return fi_cancel(&ep->msg_srx->fid, context); - - if (!rxm_ep_cancel_recv(ep, &ep->trecv_queue, context)) - rxm_ep_cancel_recv(ep, &ep->recv_queue, context); - - return 0; + return ep->srx->ep_fid.ops->cancel(&ep->srx->ep_fid.fid, context); } static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, @@ -480,10 +270,8 @@ static int rxm_ep_getopt(fid_t fid, int level, int optname, void *optval, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - assert(sizeof(rxm_ep->min_multi_recv_size) == sizeof(size_t)); - *(size_t *)optval = rxm_ep->min_multi_recv_size; - *optlen = sizeof(size_t); - break; + return rxm_ep->srx->ep_fid.ops->getopt(&rxm_ep->srx->ep_fid.fid, + level, optname, optval, optlen); case FI_OPT_BUFFERED_MIN: assert(sizeof(rxm_ep->buffered_min) == sizeof(size_t)); *(size_t *)optval = rxm_ep->buffered_min; @@ -507,11 +295,8 @@ static int rxm_ep_setopt(fid_t fid, int level, int optname, switch (optname) { case FI_OPT_MIN_MULTI_RECV: - rxm_ep->min_multi_recv_size = *(size_t *)optval; - FI_INFO(&rxm_prov, FI_LOG_CORE, - "FI_OPT_MIN_MULTI_RECV set to %zu\n", - rxm_ep->min_multi_recv_size); - break; + return rxm_ep->srx->ep_fid.ops->setopt(&rxm_ep->srx->ep_fid.fid, + level, optname, optval, optlen); case FI_OPT_BUFFERED_MIN: if (rxm_ep->rx_pool) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, @@ -564,99 +349,6 @@ static struct fi_ops_ep rxm_ops_ep = { .tx_size_left = fi_no_tx_size_left, }; - -/* Caller must hold recv_queue->lock -- TODO which lock? */ -struct rxm_rx_buf * -rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - - if (dlist_empty(&recv_queue->unexp_msg_list)) - return NULL; - - match_attr.addr = addr; - match_attr.tag = tag; - match_attr.ignore = ignore; - - entry = dlist_find_first_match(&recv_queue->unexp_msg_list, - recv_queue->match_unexp, &match_attr); - if (!entry) - return NULL; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Match for posted recv found in unexp" - " msg list\n", match_attr.addr, match_attr.tag); - - return container_of(entry, struct rxm_rx_buf, unexp_msg.entry); -} - -static void rxm_recv_entry_init_common(struct rxm_recv_entry *recv_entry, - const struct iovec *iov, void **desc, size_t count, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, - void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - size_t i; - - assert(!recv_entry->rndv.tx_buf); - recv_entry->rxm_iov.count = (uint8_t) count; - recv_entry->addr = src_addr; - recv_entry->context = context; - recv_entry->flags = flags; - recv_entry->ignore = ignore; - recv_entry->tag = tag; - - recv_entry->sar.msg_id = RXM_SAR_RX_INIT; - recv_entry->sar.total_recv_len = 0; - recv_entry->total_len = 0; - - for (i = 0; i < count; i++) { - recv_entry->rxm_iov.iov[i] = iov[i]; - recv_entry->total_len += iov[i].iov_len; - if (desc && desc[i]) - recv_entry->rxm_iov.desc[i] = desc[i]; - else - recv_entry->rxm_iov.desc[i] = NULL; - } -} - -struct rxm_recv_entry * -rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue) -{ - struct rxm_recv_entry *recv_entry; - - if (ofi_freestack_isempty(recv_queue->fs)) - return NULL; - - recv_entry = ofi_freestack_pop(recv_queue->fs); - - rxm_recv_entry_init_common(recv_entry, iov, desc, 
count, src_addr, tag, - ignore, context, flags, recv_queue); - - return recv_entry; -} - -struct rxm_recv_entry * -rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - - recv_entry = ofi_buf_alloc(rxm_ep->multi_recv_pool); - - rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, - ignore, context, flags, NULL); - - recv_entry->comp_flags = FI_MSG | FI_RECV; - return recv_entry; -} - struct rxm_tx_buf *rxm_get_tx_buf(struct rxm_ep *ep) { struct rxm_tx_buf *buf; @@ -820,6 +512,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) { struct rxm_deferred_tx_entry *def_tx_entry; + struct rxm_proto_info *proto_info; struct iovec iov; struct fi_msg msg; ssize_t ret = 0; @@ -832,12 +525,11 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_deferred_tx_entry, entry); switch (def_tx_entry->type) { case RXM_DEFERRED_TX_RNDV_ACK: + proto_info = def_tx_entry->rndv_ack.rx_buf->proto_info; ret = fi_send(def_tx_entry->rxm_conn->msg_ep, - &def_tx_entry->rndv_ack.rx_buf-> - recv_entry->rndv.tx_buf->pkt, + &proto_info->rndv.tx_buf->pkt, def_tx_entry->rndv_ack.pkt_size, - def_tx_entry->rndv_ack.rx_buf->recv_entry-> - rndv.tx_buf->hdr.desc, + proto_info->rndv.tx_buf->hdr.desc, 0, def_tx_entry->rndv_ack.rx_buf); if (ret) { if (ret == -FI_EAGAIN) @@ -845,11 +537,10 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, rxm_cq_write_rx_error( def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_ack.rx_buf-> - recv_entry->context, (int) ret); + peer_entry->context, (int) ret); } - if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv - .tx_buf->pkt.ctrl_hdr - .type == rxm_ctrl_rndv_rd_done) + if (proto_info->rndv.tx_buf->pkt.ctrl_hdr.type == + rxm_ctrl_rndv_rd_done) RXM_UPDATE_STATE(FI_LOG_EP_DATA, def_tx_entry->rndv_ack.rx_buf, RXM_RNDV_READ_DONE_SENT); @@ -891,7 +582,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, rxm_cq_write_rx_error( def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, (int) ret); + peer_entry->context, (int) ret); } break; case RXM_DEFERRED_TX_RNDV_WRITE: @@ -944,7 +635,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, def_tx_entry->rxm_ep, ofi_op_msg, def_tx_entry->rndv_read.rx_buf-> - recv_entry->context, + peer_entry->context, (int) ret); } return; @@ -1451,9 +1142,6 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) sizeof(struct rxm_rndv_hdr))), rxm_buffer_size); - assert(!rxm_ep->min_multi_recv_size); - rxm_ep->min_multi_recv_size = rxm_buffer_size; - assert(!rxm_ep->buffered_limit); rxm_ep->buffered_limit = rxm_buffer_size; @@ -1465,13 +1153,11 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) "\t\t MR local: MSG - %d, RxM - %d\n" "\t\t Completions per progress: MSG - %zu\n" "\t\t Buffered min: %zu\n" - "\t\t Min multi recv size: %zu\n" "\t\t inject size: %zu\n" "\t\t Protocol limits: Eager: %zu, SAR: %zu\n", rxm_ep->msg_mr_local, rxm_ep->rdm_mr_local, rxm_ep->comp_per_progress, rxm_ep->buffered_min, - rxm_ep->min_multi_recv_size, rxm_ep->inject_limit, - rxm_ep->eager_limit, rxm_ep->sar_limit); + rxm_ep->inject_limit, rxm_ep->eager_limit, rxm_ep->sar_limit); } static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) @@ -1484,19 +1170,7 @@ static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) dlist_init(&rxm_ep->deferred_queue); - ret = 
rxm_ep_rx_queue_init(rxm_ep);
-	if (ret)
-		goto err;
-
 	return FI_SUCCESS;
-err:
-	ofi_bufpool_destroy(rxm_ep->coll_pool);
-	ofi_bufpool_destroy(rxm_ep->rx_pool);
-	ofi_bufpool_destroy(rxm_ep->tx_pool);
-	rxm_ep->coll_pool = NULL;
-	rxm_ep->rx_pool = NULL;
-	rxm_ep->tx_pool = NULL;
-	return ret;
 }
 
 static int rxm_ep_enable_check(struct rxm_ep *rxm_ep)
@@ -1526,9 +1200,129 @@ static int rxm_ep_enable_check(struct rxm_ep *rxm_ep)
 	return 0;
 }
 
+static int rxm_unexp_start(struct fi_peer_rx_entry *rx_entry)
+{
+	struct rxm_rx_buf *rx_buf = rx_entry->peer_context;
+
+	return rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg ?
+		rxm_handle_unexp_sar(rx_entry):
+		rxm_handle_rx_buf(rx_buf);
+}
+
+static int rxm_discard(struct fi_peer_rx_entry *rx_entry)
+{
+	struct rxm_rx_buf *rx_buf, *seg_rx;
+
+	rx_buf = rx_entry->peer_context;
+
+	if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) {
+		while (!dlist_empty(&rx_buf->proto_info->sar.pkt_list)) {
+			dlist_pop_front(&rx_buf->proto_info->sar.pkt_list,
+					struct rxm_rx_buf, seg_rx, unexp_entry);
+			rxm_free_rx_buf(seg_rx);
+		}
+		ofi_buf_free(rx_buf->proto_info);
+	}
+
+	rxm_free_rx_buf(rx_buf);
+	return FI_SUCCESS;
+}
+
+struct fi_ops_srx_peer rxm_srx_peer_ops = {
+	.size = sizeof(struct fi_ops_srx_peer),
+	.start_msg = rxm_unexp_start,
+	.start_tag = rxm_unexp_start,
+	.discard_msg = rxm_discard,
+	.discard_tag = rxm_discard,
+};
+
+static int rxm_srx_close(struct fid *fid)
+{
+	struct rxm_domain *domain = container_of(fid, struct rxm_domain,
+						 rx_ep.fid);
+
+	ofi_atomic_dec32(&domain->util_domain.ref);
+
+	return FI_SUCCESS;
+}
+
+static struct fi_ops rxm_srx_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = rxm_srx_close,
+	.bind = fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_msg rxm_srx_msg_ops = {
+	.size = sizeof(struct fi_ops_msg),
+	.recv = fi_no_msg_recv,
+	.recvv = fi_no_msg_recvv,
+	.recvmsg = fi_no_msg_recvmsg,
+	.send = fi_no_msg_send,
+	.sendv = fi_no_msg_sendv,
+	.sendmsg = fi_no_msg_sendmsg,
+	.inject = fi_no_msg_inject,
+	.senddata = fi_no_msg_senddata,
+	.injectdata = fi_no_msg_injectdata,
+};
+
+static struct fi_ops_tagged rxm_srx_tagged_ops = {
+	.size = sizeof(struct fi_ops_tagged),
+	.recv = fi_no_tagged_recv,
+	.recvv = fi_no_tagged_recvv,
+	.recvmsg = fi_no_tagged_recvmsg,
+	.send = fi_no_tagged_send,
+	.sendv = fi_no_tagged_sendv,
+	.sendmsg = fi_no_tagged_sendmsg,
+	.inject = fi_no_tagged_inject,
+	.senddata = fi_no_tagged_senddata,
+	.injectdata = fi_no_tagged_injectdata,
+};
+
+int rxm_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr,
+		    struct fid_ep **rx_ep, void *context)
+{
+	struct rxm_domain *rxm_domain;
+
+	if (!(attr->op_flags & FI_PEER)) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"shared srx only supported with FI_PEER flag\n");
+		return -FI_EINVAL;
+	}
+
+	rxm_domain = container_of(domain, struct rxm_domain,
+				  util_domain.domain_fid);
+
+	if (rxm_domain->srx) {
+		FI_WARN(&rxm_prov, FI_LOG_EP_CTRL,
+			"Peer SRX context already imported\n");
+		return -FI_EINVAL;
+	}
+
+	rxm_domain->srx = ((struct fi_peer_srx_context *)
+			   (context))->srx;
+	rxm_domain->srx->peer_ops = &rxm_srx_peer_ops;
+	rxm_domain->rx_ep.msg = &rxm_srx_msg_ops;
+	rxm_domain->rx_ep.tagged = &rxm_srx_tagged_ops;
+	rxm_domain->rx_ep.fid.ops = &rxm_srx_fi_ops;
+	rxm_domain->rx_ep.fid.fclass = FI_CLASS_SRX_CTX;
+	*rx_ep = &rxm_domain->rx_ep;
+	ofi_atomic_inc32(&rxm_domain->util_domain.ref);
+
+	return FI_SUCCESS;
+}
+
+static void rxm_update(struct util_srx_ctx *srx, struct util_rx_entry *rx_entry)
+{
+	//no 
update needed +} + static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) { struct rxm_ep *ep; + struct rxm_domain *domain; + struct fid_ep *srx; int ret; ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid); @@ -1564,6 +1358,32 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) return ret; + if (!ep->srx) { + domain = container_of(ep->util_ep.domain, + struct rxm_domain, + util_domain.domain_fid); + ret = util_ep_srx_context(&domain->util_domain, + ep->rxm_info->rx_attr->size, + RXM_IOV_LIMIT, rxm_buffer_size, + &rxm_update, &ep->util_ep.lock, + &srx); + if (ret) + return ret; + + ep->srx = container_of(srx, struct fid_peer_srx, + ep_fid.fid); + ep->srx->peer_ops = &rxm_srx_peer_ops; + + ret = util_srx_bind(&ep->srx->ep_fid.fid, + &ep->util_ep.rx_cq->cq_fid.fid, + FI_RECV); + if (ret) + return ret; + } else { + ep->util_ep.ep_fid.msg = &rxm_no_recv_msg_ops; + ep->util_ep.ep_fid.tagged = &rxm_no_recv_tagged_ops; + } + if (ep->msg_srx && !rxm_passthru_info(ep->rxm_info)) { ret = rxm_prepost_recv(ep, ep->msg_srx); if (ret) @@ -1592,10 +1412,21 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct rxm_av *rxm_av; struct rxm_cq *rxm_cq; struct rxm_eq *rxm_eq; - int ret, retv = 0; + int ret; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + if (bfid->fclass == FI_CLASS_SRX_CTX) { + if (rxm_ep->srx) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "SRX context already bound to EP\n"); + return -FI_EINVAL; + } + rxm_ep->srx = + (container_of(bfid, struct rxm_domain, rx_ep.fid))->srx; + return FI_SUCCESS; + } + ret = ofi_ep_bind(&rxm_ep->util_ep, bfid, flags); if (ret) return ret; @@ -1608,14 +1439,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_av->util_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_av->offload_coll_av) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_av->offload_coll_av->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1626,14 +1457,14 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_cq->util_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_cq->offload_coll_cq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_cq->offload_coll_cq->fid, flags); if (ret) - retv = ret; + return ret; } break; @@ -1644,19 +1475,18 @@ static int rxm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) &rxm_eq->util_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } if (rxm_ep->offload_coll_ep && rxm_eq->offload_coll_eq) { ret = ofi_ep_fid_bind(&rxm_ep->offload_coll_ep->fid, &rxm_eq->offload_coll_eq->fid, flags); if (ret) - retv = ret; + return ret; } - break; } - return retv; + return FI_SUCCESS; } static struct fi_ops rxm_ep_fi_ops = { diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 1a76796d4e0..10a7ae535d7 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -262,8 +262,8 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->op_flags &= ~FI_MULTI_RECV; - core_info->domain_attr->caps &= ~(FI_AV_USER_ID); - core_info->caps &= ~(FI_AV_USER_ID); + core_info->domain_attr->caps &= ~(FI_AV_USER_ID | FI_PEER); + core_info->caps &= ~(FI_AV_USER_ID | FI_PEER); return 0; } diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 3b9088a2858..5d48e88e53a 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ 
-40,214 +40,16 @@ #include "rxm.h" - -ssize_t rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry, - struct rxm_rx_buf *rx_buf) -{ - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - bool last; - ssize_t ret; - - ret = rxm_handle_rx_buf(rx_buf); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST; - if (ret || last) - return ret; - - match_attr.addr = recv_entry->addr; - match_attr.tag = recv_entry->tag; - match_attr.ignore = recv_entry->ignore; - - dlist_foreach_container_safe(&recv_queue->unexp_msg_list, - struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!recv_queue->match_unexp(&rx_buf->unexp_msg.entry, - &match_attr)) - continue; - /* Handle unordered completions from MSG provider */ - if ((rx_buf->pkt.ctrl_hdr.msg_id != recv_entry->sar.msg_id) || - ((rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg))) - continue; - - if (!rx_buf->conn) { - rx_buf->conn = ofi_idm_at(&rx_buf->ep->conn_idx_map, - (int) rx_buf->pkt.ctrl_hdr.conn_id); - } - if (recv_entry->sar.conn != rx_buf->conn) - continue; - rx_buf->recv_entry = recv_entry; - dlist_remove(&rx_buf->unexp_msg.entry); - last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == - RXM_SAR_SEG_LAST; - ret = rxm_handle_rx_buf(rx_buf); - if (ret || last) - break; - } - return ret; -} - -/* - * We don't expect to have unexpected messages when the app is using - * multi-recv buffers. Optimize for that case. - * - * If there are unexpected messages waiting when we post a mult-recv buffer, - * we trim off the start of the buffer, treat it as a normal buffer, and pair - * it with an unexpected message. We continue doing this until either no - * unexpected messages are left or the multi-recv buffer has been consumed. - */ -static ssize_t -rxm_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, - void **desc, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - struct iovec cur_iov = *iov; - ssize_t ret; - - do { - recv_entry = rxm_recv_entry_get(ep, &cur_iov, desc, 1, - FI_ADDR_UNSPEC, 0, 0, context, - op_flags, &ep->recv_queue); - if (!recv_entry) { - ret = -FI_ENOMEM; - break; - } - - rx_buf = rxm_get_unexp_msg(&ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &ep->recv_queue.recv_list); - return 0; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - recv_entry->flags &= ~FI_MULTI_RECV; - recv_entry->total_len = MIN(cur_iov.iov_len, rx_buf->pkt.hdr.size); - recv_entry->rxm_iov.iov[0].iov_len = recv_entry->total_len; - - cur_iov.iov_base = (uint8_t *) cur_iov.iov_base + recv_entry->total_len; - cur_iov.iov_len -= recv_entry->total_len; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - ret = rxm_handle_rx_buf(rx_buf); - else - ret = rxm_handle_unexp_sar(&ep->recv_queue, recv_entry, - rx_buf); - - } while (!ret && cur_iov.iov_len >= ep->min_multi_recv_size); - - if ((cur_iov.iov_len < ep->min_multi_recv_size) || - (ret && cur_iov.iov_len != iov->iov_len)) { - ofi_peer_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, - 0, NULL, 0, 0, FI_ADDR_NOTAVAIL); - } - - return ret; -} - -static ssize_t -rxm_recv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - ssize_t ret; - - assert(rxm_ep->util_ep.rx_cq); - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - 
ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (op_flags & FI_MULTI_RECV) { - ret = rxm_post_mrecv(rxm_ep, iov, desc, context, op_flags); - goto release; - } - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, - 0, 0, context, op_flags, - &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto release; - } - - rx_buf = rxm_get_unexp_msg(&rxm_ep->recv_queue, recv_entry->addr, 0, 0); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->recv_queue.recv_list); - ret = FI_SUCCESS; - goto release; - } - - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - ret = (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) ? - rxm_handle_rx_buf(rx_buf) : - rxm_handle_unexp_sar(&rxm_ep->recv_queue, recv_entry, rx_buf); - -release: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - -static ssize_t -rxm_buf_recv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t flags) -{ - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx = context; - struct rxm_rx_buf *rx_buf; - ssize_t ret = 0; - - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, - src_addr, 0, 0, context, - flags, &rxm_ep->recv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - recv_entry->comp_flags |= FI_CLAIM; - - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); - } else { - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - } -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static ssize_t rxm_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - return rxm_buf_recv(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); - - return rxm_recv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, + flags | rxm_ep->util_ep.rx_msg_flags); } @@ -262,8 +64,9 @@ rxm_recv(struct fid_ep *ep_fid, void *buf, size_t len, .iov_len = len, }; - return rxm_recv_common(rxm_ep, &iov, &desc, 1, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -273,8 +76,9 @@ rxm_recvv(struct fid_ep *ep_fid, const struct iovec *iov, struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_recv_common(rxm_ep, iov, desc, count, src_addr, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_recv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -661,15 +465,13 @@ rxm_send_eager(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, eager_buf->app_context = context; eager_buf->flags = flags; + rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, + flags, &eager_buf->pkt); if 
(rxm_use_direct_send(rxm_ep, count, flags)) { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_direct_send(rxm_ep, rxm_conn, eager_buf, iov, desc, count); } else { - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &eager_buf->pkt); ret = rxm_copy_from_hmem_iov(desc, eager_buf->pkt.data, eager_buf->pkt.hdr.size, iov, count, 0); @@ -891,6 +693,19 @@ struct fi_ops_msg rxm_msg_ops = { .injectdata = rxm_injectdata, }; +struct fi_ops_msg rxm_no_recv_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = fi_no_msg_recv, + .recvv = fi_no_msg_recvv, + .recvmsg = fi_no_msg_recvmsg, + .send = rxm_send, + .sendv = rxm_sendv, + .sendmsg = rxm_sendmsg, + .inject = rxm_inject, + .senddata = rxm_senddata, + .injectdata = rxm_injectdata, +}; + static ssize_t rxm_recv_thru(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) diff --git a/prov/rxm/src/rxm_tagged.c b/prov/rxm/src/rxm_tagged.c index 8f18f34b3eb..1276bac0ba3 100644 --- a/prov/rxm/src/rxm_tagged.c +++ b/prov/rxm/src/rxm_tagged.c @@ -43,189 +43,21 @@ #include "rxm.h" -static void -rxm_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, - void *context) -{ - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", - rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - - ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); - rxm_free_rx_buf(rx_buf); -} - -static void -rxm_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) -{ - struct rxm_rx_buf *rx_buf; - int ret; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Peeking message", addr, tag); - - /* peek doesn't support peer transfer at this moment */ - assert(!(flags & FI_PEER_TRANSFER)); - - rxm_ep_do_progress(&rxm_ep->util_ep); - - rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); - if (!rx_buf) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - ret = ofi_peer_cq_write_error_peek( - rxm_ep->util_ep.rx_cq, tag, context); - if (ret) - FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); - return; - } - - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message found\n"); - - if (flags & FI_DISCARD) { - dlist_remove(&rx_buf->unexp_msg.entry); - rxm_discard_recv(rxm_ep, rx_buf, context); - return; - } - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Marking message for Claim\n"); - ((struct fi_context *)context)->internal[0] = rx_buf; - dlist_remove(&rx_buf->unexp_msg.entry); - } - - ofi_peer_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, FI_ADDR_NOTAVAIL); -} - -static ssize_t -rxm_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, uint64_t op_flags) -{ - struct rxm_recv_entry *recv_entry; - struct rxm_rx_buf *rx_buf; - - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - - recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags, - &rxm_ep->trecv_queue); - if (!recv_entry) - return -FI_EAGAIN; - - rx_buf = rxm_get_unexp_msg(&rxm_ep->trecv_queue, recv_entry->addr, - recv_entry->tag, recv_entry->ignore); - if (!rx_buf) { - dlist_insert_tail(&recv_entry->entry, - &rxm_ep->trecv_queue.recv_list); - return FI_SUCCESS; - } - - 
dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) - return rxm_handle_rx_buf(rx_buf); - else - return rxm_handle_unexp_sar(&rxm_ep->trecv_queue, recv_entry, - rx_buf); -} - -static ssize_t -rxm_trecv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t op_flags) -{ - ssize_t ret; - - if (op_flags & FI_PEER_TRANSFER) - tag |= RXM_PEER_XFER_TAG_FLAG; - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - ret = rxm_post_trecv(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags); - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; -} - static ssize_t rxm_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, uint64_t flags) { - struct rxm_ep *rxm_ep; - struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx; - struct rxm_rx_buf *rx_buf; - void *context = msg->context; - ssize_t ret = 0; - - rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - flags |= rxm_ep->util_ep.rx_msg_flags; - - if (!(flags & (FI_CLAIM | FI_PEEK)) && - !(rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV)) { - return rxm_trecv_common(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags); - } - - ofi_genlock_lock(&rxm_ep->util_ep.lock); - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) { - recv_ctx = msg->context; - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - goto claim; - } - - assert(flags & FI_DISCARD); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n"); - rxm_free_rx_buf(rx_buf); - goto unlock; - } - - if (flags & FI_PEEK) { - rxm_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore, - context, flags, &rxm_ep->trecv_queue); - goto unlock; - } - - rx_buf = ((struct fi_context *) context)->internal[0]; - assert(rx_buf); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n"); - - if (flags & FI_DISCARD) { - rxm_discard_recv(rxm_ep, rx_buf, context); - goto unlock; - } - -claim: - assert (flags & FI_CLAIM); - recv_entry = rxm_recv_entry_get(rxm_ep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, - msg->tag, msg->ignore, context, flags, - &rxm_ep->trecv_queue); - if (!recv_entry) { - ret = -FI_EAGAIN; - goto unlock; - } - - if (rxm_ep->rxm_info->mode & OFI_BUFFERED_RECV) - recv_entry->comp_flags |= FI_CLAIM; + uint64_t tag = msg->tag; + struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, + util_ep.ep_fid.fid); - rx_buf->recv_entry = recv_entry; - ret = rxm_handle_rx_buf(rx_buf); + if (flags & FI_PEER_TRANSFER) + tag |= RXM_PEER_XFER_TAG_FLAG; -unlock: - ofi_genlock_unlock(&rxm_ep->util_ep.lock); - return ret; + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, msg->msg_iov, + msg->desc, msg->iov_count, msg->addr, + msg->context, tag, msg->ignore, + flags | rxm_ep->util_ep.rx_msg_flags); } static ssize_t @@ -240,8 +72,9 @@ rxm_trecv(struct fid_ep *ep_fid, void *buf, size_t len, }; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore, - context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, &iov, &desc, 1, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -252,8 +85,9 @@ rxm_trecvv(struct fid_ep *ep_fid, const 
struct iovec *iov, struct rxm_ep *rxm_ep; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_trecv_common(rxm_ep, iov, desc, count, src_addr, tag, - ignore, context, rxm_ep->util_ep.rx_op_flags); + return util_srx_generic_trecv(&rxm_ep->srx->ep_fid, iov, desc, count, + src_addr, context, tag, ignore, + rxm_ep->util_ep.rx_op_flags); } static ssize_t @@ -372,7 +206,7 @@ rxm_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, ret = rxm_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data, rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA, - tag, ofi_op_tagged); + tag, ofi_op_tagged); unlock: ofi_genlock_unlock(&rxm_ep->util_ep.lock); return ret; @@ -416,6 +250,18 @@ struct fi_ops_tagged rxm_tagged_ops = { .injectdata = rxm_tinjectdata, }; +struct fi_ops_tagged rxm_no_recv_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = fi_no_tagged_recv, + .recvv = fi_no_tagged_recvv, + .recvmsg = fi_no_tagged_recvmsg, + .send = rxm_tsend, + .sendv = rxm_tsendv, + .sendmsg = rxm_tsendmsg, + .inject = rxm_tinject, + .senddata = rxm_tsenddata, + .injectdata = rxm_tinjectdata, +}; static ssize_t rxm_trecv_thru(struct fid_ep *ep_fid, void *buf, size_t len, diff --git a/prov/tcp/src/xnet_av.c b/prov/tcp/src/xnet_av.c index 14b82ccdafd..80b18f2a568 100644 --- a/prov/tcp/src/xnet_av.c +++ b/prov/tcp/src/xnet_av.c @@ -38,7 +38,7 @@ int xnet_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context) { return rxm_util_av_open(domain_fid, attr, fid_av, context, - sizeof(struct xnet_conn), NULL); + sizeof(struct xnet_conn), NULL, NULL); } static int xnet_mplex_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, diff --git a/prov/util/src/rxm_av.c b/prov/util/src/rxm_av.c index a5e30c95026..beb11d0620c 100644 --- a/prov/util/src/rxm_av.c +++ b/prov/util/src/rxm_av.c @@ -281,6 +281,8 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, { struct rxm_av *av; fi_addr_t *user_ids = NULL; + struct dlist_entry *av_entry; + struct util_ep *util_ep; int ret; if (flags & FI_AV_USER_ID) { @@ -303,6 +305,14 @@ static int rxm_av_insert(struct fid_av *av_fid, const void *addr, size_t count, goto out; } + if (!av->foreach_ep) + goto out; + + dlist_foreach(&av->util_av.ep_list, av_entry) { + util_ep = container_of(av_entry, struct util_ep, av_entry); + av->foreach_ep(&av->util_av, util_ep); + } + out: free(user_ids); if (ret) @@ -420,7 +430,9 @@ static struct fi_ops_av rxm_av_ops = { int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **fid_av, void *context, size_t conn_size, void (*remove_handler)(struct util_ep *util_ep, - struct util_peer_addr *peer)) + struct util_peer_addr *peer), + void (*foreach_ep)(struct util_av *av, struct util_ep *ep)) + { struct util_domain *domain; struct util_av_attr util_attr; @@ -457,6 +469,7 @@ int rxm_util_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, av->util_av.av_fid.fid.ops = &rxm_av_fi_ops; av->util_av.av_fid.ops = &rxm_av_ops; av->util_av.remove_handler = remove_handler; + av->foreach_ep = foreach_ep; *fid_av = &av->util_av.av_fid; return 0; From 80334f529b5f23b4d4d1f92f62c58060fc6f3133 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 3 Dec 2024 10:55:09 -0500 Subject: [PATCH 5/7] prov/lnx: Convert peer table to use buffer pools Convert peer table to use buffer pools in order to utilize the built-in capabilities of expanding the table as more peers are added dynamically. 
The peer table is protected by the domain's genlock. Signed-off-by: Amir Shehata --- prov/lnx/include/lnx.h | 19 ++---- prov/lnx/src/lnx_av.c | 142 +++++++++++++++-------------------------- prov/lnx/src/lnx_ops.c | 69 ++++++++++++-------- 3 files changed, 97 insertions(+), 133 deletions(-) diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h index 450324d5d92..e6ed95f2efa 100644 --- a/prov/lnx/include/lnx.h +++ b/prov/lnx/include/lnx.h @@ -33,7 +33,6 @@ #ifndef LNX_H #define LNX_H -#define LNX_DEF_AV_SIZE 1024 #define LNX_MAX_LOCAL_EPS 16 #define LNX_IOV_LIMIT 4 @@ -180,6 +179,7 @@ struct lnx_peer_prov { struct lnx_peer { /* true if peer can be reached over shared memory, false otherwise */ bool lp_local; + fi_addr_t lp_fi_addr; /* Each provider that we can reach the peer on will have an entry * below. Each entry will contain all the local provider endpoints we @@ -200,10 +200,9 @@ struct lnx_peer { struct lnx_peer_table { struct util_av lpt_av; int lpt_max_count; - int lpt_count; struct lnx_domain *lpt_domain; - /* an array of peer entries */ - struct lnx_peer **lpt_entries; + /* an array of peer entries of type struct lnx_peer */ + struct ofi_bufpool *lpt_entries; }; struct lnx_ctx { @@ -293,6 +292,9 @@ int lnx_domain_open(struct fid_fabric *fabric, struct fi_info *info, int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context); +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr); + int lnx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); @@ -314,15 +316,6 @@ void lnx_free_entry(struct fi_peer_rx_entry *entry); void lnx_foreach_unspec_addr(struct fid_peer_srx *srx, fi_addr_t (*get_addr)(struct fi_peer_rx_entry *)); -static inline struct lnx_peer * -lnx_get_peer(struct lnx_peer **peers, fi_addr_t addr) -{ - if (!peers || addr == FI_ADDR_UNSPEC) - return NULL; - - return peers[addr]; -} - static inline void lnx_get_core_desc(struct lnx_mem_desc *desc, void **mem_desc) { diff --git a/prov/lnx/src/lnx_av.c b/prov/lnx/src/lnx_av.c index f0b8d09fb86..60a26f1ea28 100644 --- a/prov/lnx/src/lnx_av.c +++ b/prov/lnx/src/lnx_av.c @@ -58,76 +58,25 @@ #include "rdma/fi_ext.h" #include "lnx.h" -static void lnx_free_peer(struct lnx_peer *lp) +struct lnx_peer * +lnx_av_lookup_addr(struct lnx_peer_table *peer_tbl, fi_addr_t addr) { - struct lnx_peer_prov *lpp; - struct dlist_entry *tmp, *tmp2; - struct lnx_local2peer_map *lpm; + struct lnx_peer *entry; - dlist_foreach_container_safe(&lp->lp_provs, - struct lnx_peer_prov, lpp, entry, tmp) { - dlist_foreach_container_safe(&lpp->lpp_map, - struct lnx_local2peer_map, lpm, entry, tmp2) { - dlist_remove(&lpm->entry); - free(lpm); - } - dlist_remove(&lpp->entry); - free(lpp); - } + if (addr == FI_ADDR_UNSPEC) + return NULL; - free(lp); -} - -#if ENABLE_DEBUG -static void lnx_print_peer(int idx, struct lnx_peer *lp) -{ - int k; - struct lnx_peer_prov *lpp; - struct lnx_local2peer_map *lpm; + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); - FI_DBG(&lnx_prov, FI_LOG_CORE, - "%d: lnx_peer[%d] is %s\n", getpid(), idx, - (lp->lp_local) ? 
"local" : "remote"); - dlist_foreach_container(&lp->lp_provs, - struct lnx_peer_prov, lpp, entry) { - FI_DBG(&lnx_prov, FI_LOG_CORE, - "%d: peer[%p] provider %s\n", getpid(), lpp, - lpp->lpp_prov_name); - dlist_foreach_container(&lpp->lpp_map, - struct lnx_local2peer_map, lpm, entry) { - FI_DBG(&lnx_prov, FI_LOG_CORE, - " %d: peer has %d mapped addrs\n", - getpid(), lpm->addr_count); - for (k = 0; k < lpm->addr_count; k++) - FI_DBG(&lnx_prov, FI_LOG_CORE, - " %d: addr = %lu\n", - getpid(), lpm->peer_addrs[k]); - } - } -} -#endif /* ENABLE_DEBUG */ + entry = ofi_bufpool_get_ibuf(peer_tbl->lpt_entries, addr); -static int lnx_peer_insert(struct lnx_peer_table *tbl, - struct lnx_peer *lp) -{ - int i; + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); - if (tbl->lpt_max_count == 0 || - tbl->lpt_count >= tbl->lpt_max_count) - return -FI_ENOENT; - - for (i = 0; i < tbl->lpt_max_count; i++) { - if (!tbl->lpt_entries[i]) { - tbl->lpt_entries[i] = lp; -#if ENABLE_DEBUG - lnx_print_peer(i, lp); -#endif - tbl->lpt_count++; - return i; - } - } + if (!entry) + FI_WARN(&lnx_prov, FI_LOG_CORE, + "Invalid fi_addr %#lx\n", addr); - return -FI_ENOENT; + return entry; } static int lnx_peer_av_remove(struct lnx_peer *lp) @@ -160,19 +109,22 @@ static int lnx_peer_av_remove(struct lnx_peer *lp) return frc; } -static int lnx_peer_remove(struct lnx_peer_table *tbl, int idx) +static int lnx_peer_remove(struct lnx_peer_table *tbl, fi_addr_t addr) { - struct lnx_peer *lp = tbl->lpt_entries[idx]; + struct lnx_peer *lp = NULL; int rc = 0; + ofi_genlock_lock(&tbl->lpt_domain->ld_domain.lock); + lp = ofi_bufpool_get_ibuf(tbl->lpt_entries, addr); if (!lp) - return 0; + goto out; rc = lnx_peer_av_remove(lp); - tbl->lpt_entries[idx] = NULL; - tbl->lpt_count--; + ofi_ibuf_free(lp); +out: + ofi_genlock_unlock(&tbl->lpt_domain->ld_domain.lock); return rc; } @@ -193,7 +145,7 @@ static int lnx_cleanup_avs(struct local_prov *prov) static inline void lnx_free_peer_tbl(struct lnx_peer_table *peer_tbl) { - free(peer_tbl->lpt_entries); + ofi_bufpool_destroy(peer_tbl->lpt_entries); free(peer_tbl); } @@ -501,10 +453,14 @@ int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, la->la_prov_count <= 0) return -FI_EPROTO; - /* this is a local peer */ - lp = calloc(sizeof(*lp), 1); - if (!lp) + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + lp = ofi_ibuf_alloc(peer_tbl->lpt_entries); + if (!lp) { + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); return -FI_ENOMEM; + } + idx = ofi_buf_index(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); dlist_init(&lp->lp_provs); @@ -521,20 +477,18 @@ int lnx_av_insert(struct fid_av *av, const void *addr, size_t count, rc = lnx_peer_map_addrs(prov_table, lp, la, flags, context); if (rc) { - free(lp); + ofi_genlock_lock(&peer_tbl->lpt_domain->ld_domain.lock); + ofi_ibuf_free(lp); + ofi_genlock_unlock(&peer_tbl->lpt_domain->ld_domain.lock); return rc; } - idx = lnx_peer_insert(peer_tbl, lp); - if (idx == -1) { - rc = lnx_peer_av_remove(lp); - lnx_free_peer(lp); - FI_INFO(&lnx_prov, FI_LOG_CORE, - "Peer table size exceeded. 
Removed = %d\n", rc); - return -FI_ENOENT; - } + if (flags & FI_AV_USER_ID) + lp->lp_fi_addr = fi_addr[i]; + else + lp->lp_fi_addr = idx; - fi_addr[i] = (fi_addr_t) idx; + fi_addr[i] = idx; la = next_peer(la); } @@ -622,8 +576,12 @@ int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct lnx_domain *lnx_domain; struct lnx_peer_table *peer_tbl; struct local_prov *entry; - size_t table_sz = LNX_DEF_AV_SIZE; + size_t table_sz; int rc = 0; + struct ofi_bufpool_attr pool_attr = { + .size = sizeof(struct lnx_peer), + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, + }; if (!attr) return -FI_EINVAL; @@ -634,24 +592,24 @@ int lnx_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (attr->type != FI_AV_TABLE) attr->type = FI_AV_TABLE; + lnx_domain = container_of(domain, struct lnx_domain, + ld_domain.domain_fid.fid); + fabric = lnx_domain->ld_fabric; + peer_tbl = calloc(sizeof(*peer_tbl), 1); if (!peer_tbl) return -FI_ENOMEM; - if (attr->count != 0) - table_sz = attr->count; + table_sz = attr->count ? attr->count : ofi_universe_size; + table_sz = roundup_power_of_two(table_sz); + pool_attr.chunk_cnt = table_sz; - peer_tbl->lpt_entries = - calloc(sizeof(struct lnx_peer *) * table_sz, 1); - if (!peer_tbl->lpt_entries) { + rc = ofi_bufpool_create_attr(&pool_attr, &peer_tbl->lpt_entries); + if (rc) { rc = -FI_ENOMEM; goto failed; } - lnx_domain = container_of(domain, struct lnx_domain, - ld_domain.domain_fid.fid); - fabric = lnx_domain->ld_fabric; - rc = ofi_av_init_lightweight(&lnx_domain->ld_domain, attr, &peer_tbl->lpt_av, context); if (rc) { diff --git a/prov/lnx/src/lnx_ops.c b/prov/lnx/src/lnx_ops.c index 7d94b7c9352..2c6b725c0ac 100644 --- a/prov/lnx/src/lnx_ops.c +++ b/prov/lnx/src/lnx_ops.c @@ -416,7 +416,7 @@ ssize_t lnx_trecv(struct fid_ep *ep, void *buf, size_t len, void *desc, * multiple endpoints. Each endpoint has its own fi_addr_t which is * core provider specific. 
*/ - lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + lp = lnx_av_lookup_addr(peer_tbl, src_addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc); @@ -464,7 +464,7 @@ ssize_t lnx_trecvv(struct fid_ep *ep, const struct iovec *iov, void **desc, peer_tbl = lep->le_peer_tbl; lnx_get_core_desc(*desc, &mem_desc); - lp = lnx_get_peer(peer_tbl->lpt_entries, src_addr); + lp = lnx_av_lookup_addr(peer_tbl, src_addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, *desc, &cep, &core_addr, iov, count, &mre, &mem_desc); @@ -509,7 +509,7 @@ ssize_t lnx_trecvmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, peer_tbl = lep->le_peer_tbl; - lp = lnx_get_peer(peer_tbl->lpt_entries, msg->addr); + lp = lnx_av_lookup_addr(peer_tbl, msg->addr); if (lp) { rc = lnx_select_recv_pathway(lp, lep->le_domain, *msg->desc, &cep, &core_addr, msg->msg_iov, @@ -549,6 +549,7 @@ ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -562,8 +563,8 @@ ssize_t lnx_tsend(struct fid_ep *ep, const void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, NULL); if (rc) return rc; @@ -585,6 +586,7 @@ ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -597,8 +599,8 @@ ssize_t lnx_tsendv(struct fid_ep *ep, const struct iovec *iov, void **desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, (desc) ? *desc : NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, (desc) ? *desc : NULL, &cep, &core_addr, iov, count, &mre, &mem_desc, NULL); if (rc) return rc; @@ -619,6 +621,7 @@ ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -632,8 +635,8 @@ ssize_t lnx_tsendmsg(struct fid_ep *ep, const struct fi_msg_tagged *msg, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[msg->addr], - lep->le_domain, + lp = lnx_av_lookup_addr(peer_tbl, msg->addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, (msg->desc) ? 
*msg->desc : NULL, &cep, &core_addr, msg->msg_iov, msg->iov_count, &mre, &mem_desc, NULL); @@ -661,6 +664,7 @@ ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -672,8 +676,8 @@ ssize_t lnx_tinject(struct fid_ep *ep, const void *buf, size_t len, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, &core_addr, NULL, 0, &mre, NULL, NULL); if (rc) return rc; @@ -695,6 +699,7 @@ ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -708,8 +713,8 @@ ssize_t lnx_tsenddata(struct fid_ep *ep, const void *buf, size_t len, void *desc peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, NULL); if (rc) return rc; @@ -732,6 +737,7 @@ ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct local_prov_ep *cep; fi_addr_t core_addr; struct lnx_peer_table *peer_tbl; @@ -743,8 +749,8 @@ ssize_t lnx_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, NULL, &cep, + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, NULL, &cep, &core_addr, NULL, 0, &mre, NULL, NULL); if (rc) return rc; @@ -767,6 +773,7 @@ lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -783,8 +790,8 @@ lnx_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[src_addr], - lep->le_domain, desc, &cep, + lp = lnx_av_lookup_addr(peer_tbl, src_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, &core_addr, &iov, 1, &mre, &mem_desc, &rkey); if (rc) goto out; @@ -810,6 +817,7 @@ lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -826,9 +834,9 @@ lnx_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, &cep, - &core_addr, &iov, 1, &mre, &mem_desc, &rkey); + lp = lnx_av_lookup_addr(peer_tbl, dest_addr); + rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep, + &core_addr, &iov, 1, &mre, &mem_desc, &rkey); if (rc) goto out; @@ -856,6 +864,7 @@ lnx_atomic_write(struct fid_ep *ep, { int rc; struct lnx_ep *lep; + struct lnx_peer *lp; struct fid_ep *core_ep; struct lnx_ctx *ctx; struct local_prov_ep *cep; @@ -872,8 +881,8 @@ lnx_atomic_write(struct fid_ep *ep, peer_tbl = lep->le_peer_tbl; - rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr], - lep->le_domain, desc, 
&cep,
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, desc, &cep,
 			&core_addr, &iov, 1, &mre, &mem_desc, &rkey);
 	if (rc)
 		goto out;
@@ -902,6 +911,7 @@ lnx_atomic_readwrite(struct fid_ep *ep,
 {
 	int rc;
 	struct lnx_ep *lep;
+	struct lnx_peer *lp;
 	struct fid_ep *core_ep;
 	struct lnx_ctx *ctx;
 	struct local_prov_ep *cep;
@@ -918,9 +928,10 @@ lnx_atomic_readwrite(struct fid_ep *ep,
 
 	peer_tbl = lep->le_peer_tbl;
 
-	rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr],
-			lep->le_domain, result_desc, &cep, &core_addr, &iov, 1,
-			&mre, &mem_desc, &rkey);
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc,
+				     &cep, &core_addr, &iov, 1,
+				     &mre, &mem_desc, &rkey);
 	if (rc)
 		goto out;
@@ -950,6 +961,7 @@ lnx_atomic_compwrite(struct fid_ep *ep,
 {
 	int rc;
 	struct lnx_ep *lep;
+	struct lnx_peer *lp;
 	struct fid_ep *core_ep;
 	struct lnx_ctx *ctx;
 	struct local_prov_ep *cep;
@@ -966,9 +978,10 @@ lnx_atomic_compwrite(struct fid_ep *ep,
 
 	peer_tbl = lep->le_peer_tbl;
 
-	rc = lnx_select_send_pathway(peer_tbl->lpt_entries[dest_addr],
-			lep->le_domain, result_desc, &cep, &core_addr, &iov, 1,
-			&mre, &mem_desc, &rkey);
+	lp = lnx_av_lookup_addr(peer_tbl, dest_addr);
+	rc = lnx_select_send_pathway(lp, lep->le_domain, result_desc, &cep,
+				     &core_addr, &iov, 1,
+				     &mre, &mem_desc, &rkey);
 	if (rc)
 		goto out;

From 14a3395fb986a9aca300d1b5785a133c610be26f Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Thu, 5 Dec 2024 14:42:01 -0500
Subject: [PATCH 6/7] prov/lnx: Initialize flags to 0

flags is allocated on the stack and might therefore hold random
values. Ensure it is initialized to 0: if it is passed to the SHM
provider uninitialized, the provider can misbehave, since its value
is checked.

Signed-off-by: Amir Shehata
---
 prov/lnx/include/lnx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prov/lnx/include/lnx.h b/prov/lnx/include/lnx.h
index e6ed95f2efa..3d6506891e4 100644
--- a/prov/lnx/include/lnx.h
+++ b/prov/lnx/include/lnx.h
@@ -336,7 +336,7 @@ int lnx_create_mr(const struct iovec *iov, fi_addr_t addr,
 	struct fi_mr_attr attr = {};
 	struct fi_mr_attr cur_abi_attr;
 	struct ofi_mr_info info = {};
-	uint64_t flags;
+	uint64_t flags = 0;
 	int rc;
 
 	attr.iov_count = 1;

From d2a0e3f6aff6bb348d48cb7cc24a12ae8b3c62f8 Mon Sep 17 00:00:00 2001
From: Alexia Ingerson
Date: Fri, 6 Dec 2024 15:07:53 -0800
Subject: [PATCH 7/7] NEWS.md: update news for 2.0

Signed-off-by: Alexia Ingerson
---
 NEWS.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 0d766534d3d..29ee5e2db1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -67,6 +67,14 @@ Fix the preprocessor
 - Fix av strncpy
 - Fix various issues with initial commit
+- Initialize flags to 0
+- Convert peer table to use buffer pools
 
+## RXM
+- Replace rxm managed srx with util srx, support FI_PEER
+- Add rxm support for using peer CQs and counters
+- Add FI_AV_USER_ID support
+- Fix definition of the rxm SAR segment enum
 
 ## SHM
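
---

As an aside to the series: the FI_AV_USER_ID support added above is driven
entirely through the standard fi_av_insert() call. The sketch below shows
roughly how an application would exercise it; it is illustrative only and is
not code from these patches. The `av` handle, the packed `addrs` buffer, and
the ID scheme are placeholders assumed to be supplied by the caller.

#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

/* Minimal sketch, assuming `av` was opened on a domain that requested
 * FI_AV_USER_ID in its capabilities and `addrs` holds `cnt` packed
 * addresses.  With the FI_AV_USER_ID flag, the fi_addr array is
 * input/output: on input it supplies the application-chosen IDs, on
 * output it returns the provider-assigned fi_addrs used for data
 * transfers. */
static int insert_with_user_ids(struct fid_av *av, const void *addrs,
				size_t cnt, fi_addr_t *fi_addrs)
{
	size_t i;
	int ret;

	/* Arbitrary application IDs; completions with FI_SOURCE report
	 * these instead of the raw fi_addr values. */
	for (i = 0; i < cnt; i++)
		fi_addrs[i] = 0x1000 + i;

	ret = fi_av_insert(av, addrs, cnt, fi_addrs, FI_AV_USER_ID, NULL);
	if (ret < 0)
		return ret;

	return ((size_t) ret == cnt) ? 0 : -FI_EINVAL;
}

This matches the provider-side behavior above: rxm copies the IDs out of the
fi_addr array before the core insert overwrites it with the assigned
addresses, which is why the array serves as both input and output here.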