diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 9016ec00958..1877156779b 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -68,6 +68,12 @@ Chapter 4 "extra features/requests" describes the extra features/requests define * Section 4.6 describe the extra feature: RDMA-Write based message transfer. + * Section 4.7 describes the extra feature: Long read and runting read nack protocol. + + * Section 4.8 describes the extra feature: User receive QP. + + * Section 4.9 describes the extra feature: Unsolicited write recv. + Chapter 5 "What's not covered?" describes the contents that are intentionally left out of this document because they are considered "implementation details". @@ -323,6 +329,7 @@ Table: 2.1 a list of extra features/requests | 5 | RDMA-Write based data transfer | extra feature | libfabric 1.18.0 | Section 4.6 | | 6 | Read nack packets | extra feature | libfabric 1.20.0 | Section 4.7 | | 7 | User recv QP | extra feature & request| libfabric 1.22.0 | Section 4.8 | +| 8 | Unsolicited write recv | extra feature | libfabric 1.22.0 | Section 4.9 | How does protocol v4 maintain backward compatibility when extra features/requests are introduced? @@ -1611,6 +1618,17 @@ zero-copy receive mode. If a receiver gets RTM packets delivered to its default QP, it raises an error because it requests all RTM packets must be delivered to its user recv QP. +### 4.9 Unsolicited write recv + +The "Unsolicited write recv" is an extra feature that was +introduced with libfabric 1.22.0. When this feature is on, rdma-write +with immediate data will not consume an rx buffer on the responder side. It is +defined as an extra feature because there is a set of requirements (firmware, +EFA kernel module and rdma-core) to be met before an endpoint can use the unsolicited +write recv capability; therefore, an endpoint cannot assume the other party supports +unsolicited write recv.
The rdma-write with immediate data cannot be issued if there +is a discrepancy on this feature between local and peer. + ## 5. What's not covered? The purpose of this document is to define the communication protocol. Therefore, it is intentionally written diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 47b3f53f9bd..1ed2b4e9a5a 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1054,6 +1054,9 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; + if (efa_rdm_use_unsolicited_write_recv()) + ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + if (ep->use_zcpy_rx) { /* * When zcpy rx is enabled, an extra QP is created to diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 8c2703fc140..fe2f79ead61 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -109,6 +109,23 @@ bool efa_rdm_peer_support_rdma_write(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_RDMA_WRITE); } +/** + * @brief Check whether a peer has advertised support for unsolicited write recv + * + * @param[in] peer Peer to query; dereferenced unconditionally, so it must not be NULL + * @return bool true only when a HANDSHAKE has been received from the peer and its extra_info has the EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV bit set + */ +static inline +bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) +{ + /* Unsolicited write recv is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP assumes the peer does not support + * it until a handshake packet has been received from that peer.
+ */ + return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && + (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); +} + static inline bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 1b94b5338d1..8840ce5f401 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -40,7 +40,8 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_RDMA_WRITE BIT_ULL(5) #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) -#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 +#define EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV BIT_ULL(8) +#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 9 /* * The length of 64-bit extra_info array used in efa_rdm_ep * and efa_rdm_peer diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 36b2d5171da..4557c0a0be9 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -370,6 +370,27 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); if (efa_rdm_rma_should_write_using_rdma(ep, txe, txe->peer)) { + /** + * Unsolicited write recv is a feature that makes an rdma-write with + * imm not consume an rx buffer on the responder side. An older libfabric, + * or a new libfabric running on an old driver, expects an rx buffer to be + * consumed by the rdma-write with imm operation. + * As a result, an rdma-write with imm cannot be posted when the initiator + * and the responder disagree on this feature; the write fails with -FI_EINVAL.
+ */ + if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && + (efa_rdm_use_unsolicited_write_recv() != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + EFA_WARN(FI_LOG_EP_DATA, + "Inconsistent support status detected on unsolicited write recv.\n" + "My support status: %d, peer support status: %d. %s.\n" + "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" + "Please use consistent software versions on both hosts, or disable the unsolicited write " + "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", + efa_rdm_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + ep->err_msg); + return -FI_EINVAL; + } efa_rdm_ope_prepare_to_post_write(txe); return efa_rdm_ope_post_remote_write(txe); } diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 02880c09dfd..0175b3884a9 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -97,6 +97,53 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) } } +/** + * @brief Construct a message that contains the local and peer information, + * including the efa address and the host id. + * + * @param ep EFA RDM endpoint + * @param addr Remote peer fi_addr_t + * @param msg the ptr of the msg to be constructed (needs to be allocated already!)
+ * @param base_msg ptr to the base msg that will show at the beginning of msg + * @param msg_len the size in bytes of the buffer that msg points to + * @return int 0 on success, negative error code on failure (-FI_EINVAL when formatting fails or the message does not fit in msg_len bytes) + */ +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + size_t len = 0; + int ret; + struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); /* NOTE(review): presumably NULL when addr cannot be resolved - confirm; guarded below */ + + len = sizeof(ep_addr_str); + efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + if (!peer || !peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { /* a missing peer is reported as "N/A" instead of dereferencing NULL */ + strcpy(peer_host_id_str, "N/A"); + } + + ret = snprintf(msg, msg_len, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + + if (ret < 0 || (size_t) ret >= msg_len) { /* error or truncation; comparing as size_t avoids the msg_len - 1 wrap-around when msg_len is 0 */ + return -FI_EINVAL; + } + + if (strlen(msg) >= msg_len) { /* defensive only: cannot trigger once the check above has passed */ + return -FI_ENOBUFS; + } + + return FI_SUCCESS; +} + /** * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint @@ -108,42 +155,18 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) */ int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) { - char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; - char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - char
local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - const char *base_msg = efa_strerror(prov_errno); - size_t len = 0; - struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); - - *buf = NULL; - *buflen = 0; - - len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); - len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); - - if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { - strcpy(local_host_id_str, "N/A"); - } - - if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { - strcpy(peer_host_id_str, "N/A"); - } - - int ret = snprintf(ep->err_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", - base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + const char *base_msg = efa_strerror(prov_errno); + int ret; - if (ret < 0 || ret > EFA_RDM_ERROR_MSG_BUFFER_LENGTH - 1) { - return -FI_EINVAL; - } + *buf = NULL; + *buflen = 0; - if (strlen(ep->err_msg) >= EFA_RDM_ERROR_MSG_BUFFER_LENGTH) { - return -FI_ENOBUFS; - } + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + if (ret) + return ret; /* on failure *buf stays NULL and *buflen stays 0 */ - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buf = ep->err_msg; + *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; /* the buffer length handed to the formatter, not strlen(ep->err_msg) */ - return 0; + return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index a2ba0083295..b79bafb4e85 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -19,6 +19,8 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep,
fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); /* writes base_msg followed by the local and peer EFA addresses and host ids into msg */ + int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); #ifdef ENABLE_EFA_POISONING