diff --git a/src/transport_ofi.h b/src/transport_ofi.h index fab7cf957..93f4ac6b6 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -697,7 +697,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE); + ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE | FI_INJECT); } while (try_again(ctx, ret, &polled)); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -785,6 +785,8 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { + /* FI_FENCE holds back this signal update until the preceding put (or puts, when the + * payload was fragmented) has completed */ ret = fi_atomicmsg(ctx->ep, &msg_signal, FI_DELIVERY_COMPLETE | FI_FENCE | FI_INJECT); } while (try_again(ctx, ret, &polled)); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index 4e5e5b435..ed377d2cd 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -647,6 +647,8 @@ static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, uint64_t *sig_addr, uint64_t signal, int pe) { + /* FIXME: Optimize non-blocking put-with-signal for Portals. For now an explicit fence is + * issued between the data put and the signal put */ shmem_transport_put_nbi(ctx, target, source, len, pe); shmem_transport_fence(ctx); shmem_transport_put_scalar(ctx, sig_addr, &signal, sizeof(uint64_t), pe);