From 972e3ac41e18ebb45cb41988f99e3bfc0c00577a Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Wed, 13 Nov 2024 22:59:52 +0000 Subject: [PATCH] prov/efa: Fix the error path of zero copy recv Fix the resource leak of rxe when the user rx pkt post fails. Make the user rx pkt pool fixed size (rx size) and doesn't allow further grow (for better traffic control). Signed-off-by: Shi Jin --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 ++- prov/efa/src/rdm/efa_rdm_ep_utils.c | 6 ++--- prov/efa/src/rdm/efa_rdm_msg.c | 3 +++ prov/efa/test/efa_unit_test_ep.c | 40 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 6 files changed, 50 insertions(+), 5 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 56e80bc146d..1387f6f8130 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -236,7 +236,9 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) ret = ofi_bufpool_create(&ep->user_rx_pkt_pool, sizeof(struct efa_rdm_pke), EFA_RDM_BUFPOOL_ALIGNMENT, - 0, ep->base_ep.info->rx_attr->size, 0); + ep->base_ep.info->rx_attr->size, + ep->base_ep.info->rx_attr->size, /* max count==chunk_cnt means pool is not allowed to grow */ + 0); if (ret) goto err_free; diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 69812d0f90c..51e4e8c405a 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -214,10 +214,8 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe assert(rxe->iov_count > 0 && rxe->iov_count <= ep->base_ep.info->rx_attr->iov_limit); assert(rxe->iov[0].iov_len >= ep->msg_prefix_size); pkt_entry = efa_rdm_pke_alloc(ep, ep->user_rx_pkt_pool, EFA_RDM_PKE_FROM_USER_RX_POOL); - if (OFI_UNLIKELY(!pkt_entry)) { - EFA_WARN(FI_LOG_EP_DATA, "Failed to allocate pkt_entry for user rx\n"); - return -FI_ENOMEM; - } + if (OFI_UNLIKELY(!pkt_entry)) + return -FI_EAGAIN; pkt_entry->ope = rxe; rxe->state = EFA_RDM_RXE_MATCHED; diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index cdbabe128c1..dc583c0255e 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -917,6 +917,9 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg } ret = efa_rdm_ep_post_user_recv_buf(ep, rxe, flags); + if (OFI_UNLIKELY(ret)) + efa_rdm_rxe_release(rxe); + ofi_genlock_unlock(srx_ctx->lock); } else if (op == ofi_op_tagged) { ret = util_srx_generic_trecv(ep->peer_srx_ep, msg->msg_iov, msg->desc, diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index f8dd2073df4..b87382c1a0f 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1119,6 +1119,46 @@ void test_efa_rdm_ep_zcpy_recv_cancel(struct efa_resource **state) free(recv_buff.buff); } +/** + * @brief When user posts more than rx size fi_recv, we should return eagain and make sure + * there is no rx entry leaked + */ +void test_efa_rdm_ep_zcpy_recv_eagain(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff recv_buff; + int i; + struct efa_rdm_ep *efa_rdm_ep; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->caps = FI_MSG; + + /* enable zero-copy recv mode in ep */ + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Construct a recv buffer with mr */ + efa_unit_test_buff_construct(&recv_buff, resource, 16); + + for (i = 0; i < efa_rdm_ep->base_ep.info->rx_attr->size; i++) + assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), 0); + + /* we should have rx number of rx entry before and after the extra recv post */ + assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size); + assert_int_equal(fi_recv(resource->ep, recv_buff.buff, recv_buff.size, fi_mr_desc(recv_buff.mr), FI_ADDR_UNSPEC, NULL), -FI_EAGAIN); + assert_true(efa_unit_test_get_dlist_length(&efa_rdm_ep->rxe_list) == efa_rdm_ep->base_ep.info->rx_attr->size); + + /** + * the buf is still posted to rdma-core, so unregistering mr can + * return non-zero. Currently ignore this failure. + */ + (void) fi_close(&recv_buff.mr->fid); + free(recv_buff.buff); +} + /** * @brief when efa_rdm_ep_post_handshake_error failed due to pkt pool exhaustion, * make sure both txe is cleaned diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 7c485058132..2232ea36059 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -112,6 +112,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_close_discard_posted_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index c4605c8e962..d44368bc81f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -126,6 +126,7 @@ void test_efa_rdm_ep_user_p2p_not_supported_zcpy_rx_happy(); void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_mr_local(); void test_efa_rdm_ep_close_discard_posted_recv(); void test_efa_rdm_ep_zcpy_recv_cancel(); +void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq();