From c8d350cb8de33c80d3473c632c6edad8f4ccced3 Mon Sep 17 00:00:00 2001
From: Nikola Dancejic
Date: Fri, 28 Feb 2020 18:27:34 +0000
Subject: [PATCH] prov/efa: fixing bug with reporting cq errors and fixing logic

This patch fixes a bug where efa_ep_progress_internal would check for
FI_EAVAIL instead of -FI_EAVAIL, and fixes some of the logic surrounding
error detection in the function. EQ entries are now written when the CQ
cannot be read or when error entries cannot be read.

Signed-off-by: Nikola Dancejic
---
 prov/efa/src/efa_ep.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/prov/efa/src/efa_ep.c b/prov/efa/src/efa_ep.c
index bd7bdcc7bfa..ea0c91efdfa 100644
--- a/prov/efa/src/efa_ep.c
+++ b/prov/efa/src/efa_ep.c
@@ -425,35 +425,43 @@ static struct fi_ops efa_ep_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
-static void efa_ep_progress_internal(struct efa_cq *efa_cq, uint64_t flags)
+static void efa_ep_progress_internal(struct efa_ep *ep, struct efa_cq *efa_cq)
 {
-	struct util_cq *cq = &efa_cq->util_cq;
-	int i;
-	ssize_t ret;
+	struct util_cq *cq;
 	struct fi_cq_tagged_entry cq_entry[EFA_CQ_PROGRESS_ENTRIES];
 	struct fi_cq_tagged_entry *temp_cq_entry;
 	struct fi_cq_err_entry cq_err_entry;
 	fi_addr_t src_addr[EFA_CQ_PROGRESS_ENTRIES];
+	uint64_t flags;
+	int i;
+	ssize_t ret, err;
+
+	cq = &efa_cq->util_cq;
+	flags = ep->util_ep.caps;
 
 	VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(cq_entry));
 
 	ret = efa_cq_readfrom(&cq->cq_fid, cq_entry, EFA_CQ_PROGRESS_ENTRIES,
 			      (flags & FI_SOURCE) ? src_addr : NULL);
 	if (ret == -FI_EAGAIN)
-		goto err_cq;
+		return;
 
 	if (OFI_UNLIKELY(ret < 0)) {
-		ret = (ret == FI_EAVAIL) ?
-		      efa_cq_readerr(&cq->cq_fid, &cq_err_entry, flags) :
-		      -FI_EAVAIL;
-		if (OFI_UNLIKELY(ret < 0)) {
-			if (OFI_UNLIKELY(ret != -FI_EAGAIN))
-				EFA_WARN(FI_LOG_CQ,
-					 "failed to read cq error: %ld\n", ret);
-			goto err_cq;
+		if (OFI_UNLIKELY(ret != -FI_EAVAIL)) {
+			EFA_WARN(FI_LOG_CQ, "no error available errno: %ld\n", ret);
+			efa_eq_write_error(&ep->util_ep, FI_EOTHER, ret);
+			return;
 		}
+
+		err = efa_cq_readerr(&cq->cq_fid, &cq_err_entry, flags);
+		if (OFI_UNLIKELY(err < 0)) {
+			EFA_WARN(FI_LOG_CQ, "unable to read error entry errno: %ld\n", err);
+			efa_eq_write_error(&ep->util_ep, FI_EOTHER, err);
+			return;
+		}
+
 		ofi_cq_write_error(cq, &cq_err_entry);
-		goto err_cq;
+		return;
 	}
 
 	temp_cq_entry = (struct fi_cq_tagged_entry *)cq_entry;
@@ -476,7 +484,6 @@ static void efa_ep_progress_internal(struct efa_cq *efa_cq, uint64_t flags)
 		temp_cq_entry = (struct fi_cq_tagged_entry *)
 				((uint8_t *)temp_cq_entry + efa_cq->entry_size);
 	}
-err_cq:
 	return;
 }
 
@@ -493,10 +500,10 @@ void efa_ep_progress(struct util_ep *ep)
 
 	fastlock_acquire(&ep->lock);
 	if (rcq)
-		efa_ep_progress_internal(rcq, ep->caps);
+		efa_ep_progress_internal(efa_ep, rcq);
 
 	if (scq && scq != rcq)
-		efa_ep_progress_internal(scq, ep->caps);
+		efa_ep_progress_internal(efa_ep, scq);
 
 	fastlock_release(&ep->lock);
 }