Skip to content

Commit

Permalink
ch4/ofi: use pack buffer pool for small messages
Browse files Browse the repository at this point in the history
When the message size is small enough, use the genq pack buffer pool
rather than allocating a GPU-registered buffer ad hoc. This should avoid
the overhead of buffer registration.

It also avoids calling e.g. cudaFreeHost, which would prevent its use in
a stream workq while a wait kernel is running.
  • Loading branch information
hzhou committed Jul 6, 2022
1 parent fb9d751 commit fc1cc26
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 11 deletions.
4 changes: 3 additions & 1 deletion src/mpid/ch4/netmod/ofi/ofi_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_event(int vni,
if (c == 0) {
if ((event_id == MPIDI_OFI_EVENT_SEND_PACK) &&
(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer))) {
MPIR_gpu_free_host(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
MPI_Aint data_sz = MPIDI_OFI_REQUEST(sreq, noncontig.pack.data_sz);
MPIDI_OFI_free_pack_buf(vni, data_sz,
MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
} else if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && (event_id == MPIDI_OFI_EVENT_SEND_NOPACK))
MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig.nopack));

Expand Down
18 changes: 18 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -713,4 +713,22 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_get_buffered(int vni, struct fi_cq_tagged
#undef CQ_D_HEAD
#undef CQ_D_TAIL

/* Allocate a pack buffer for a message of `size` bytes on vni `vni`.
 * Small messages (at most MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) are served from
 * the per-vni genq pack buffer pool; larger messages fall back to an ad-hoc
 * GPU-registered host allocation. Returns an MPI error code; the buffer is
 * stored in *buf. Pair with MPIDI_OFI_free_pack_buf using the same size. */
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_alloc_pack_buf(int vni, MPI_Aint size, void **buf)
{
    if (size > MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
        /* too large for a pool cell: registered host buffer, allocated ad hoc */
        return MPIR_gpu_malloc_host(buf, size);
    }
    return MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vni[vni].pack_buf_pool, buf);
}

/* Release a pack buffer previously obtained from MPIDI_OFI_alloc_pack_buf.
 * `size` must be the size passed at allocation time: it selects between
 * returning the cell to the per-vni genq pool (small) and freeing the
 * ad-hoc GPU-registered host buffer (large). Returns an MPI error code. */
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_free_pack_buf(int vni, MPI_Aint size, void *buf)
{
    if (size > MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
        /* was allocated outside the pool; free the registered host buffer */
        return MPIR_gpu_free_host(buf);
    }
    return MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vni[vni].pack_buf_pool, buf);
}

#endif /* OFI_IMPL_H_INCLUDED */
1 change: 1 addition & 0 deletions src/mpid/ch4/netmod/ofi/ofi_pre.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ typedef struct {
void *buf;
size_t count;
MPI_Datatype datatype;
MPI_Aint data_sz;
char *pack_buffer;
} pack;
struct iovec *nopack;
Expand Down
19 changes: 9 additions & 10 deletions src/mpid/ch4/netmod/ofi/ofi_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou
/* Pack */
MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_PACK;

/* FIXME: allocating a GPU registered host buffer adds some additional overhead.
* However, once the new buffer pool infrastructure is setup, we would simply be
* allocating a buffer from the pool, so whether it's a regular malloc buffer or a GPU
* registered buffer should be equivalent with respect to performance. */
MPIR_gpu_malloc_host((void **) &MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer),
data_sz);
MPIDI_OFI_alloc_pack_buf(vni_local, data_sz,
(void **) &MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer) == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send Pack buffer alloc");

Expand Down Expand Up @@ -409,7 +405,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force pack for GPU buffer. */
void *host_buf = NULL;
MPIR_gpu_malloc_host(&host_buf, data_sz);
mpi_errno = MPIDI_OFI_alloc_pack_buf(vni_src, data_sz, &host_buf);
MPIR_ERR_CHECK(mpi_errno);
MPIR_Typerep_pack(buf, count, datatype, 0, host_buf, data_sz, &actual_pack_bytes,
MPIR_TYPEREP_FLAG_NONE);
MPIR_Assert(actual_pack_bytes == data_sz);
Expand All @@ -419,9 +416,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
mpi_errno = MPIDI_OFI_send_lightweight(send_buf, data_sz, cq_data, dst_rank, tag, comm,
context_offset, addr, vni_src, vni_dst);
if (actual_pack_bytes > 0) {
/* Free stage host buf (assigned to send_buf already) after
* lightweight_send. */
MPIR_gpu_free_host(send_buf);
mpi_errno = MPIDI_OFI_free_pack_buf(vni_src, data_sz, send_buf);
MPIR_ERR_CHECK(mpi_errno);
}
if (!noreq) {
*request = MPIR_Request_create_complete(MPIR_REQUEST_KIND__SEND);
Expand All @@ -432,8 +428,11 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
dt_contig, data_sz, dt_ptr, dt_true_lb, syncflag);
}

fn_exit:
MPIR_FUNC_EXIT;
return mpi_errno;
fn_fail:
goto fn_exit;
}

/* Common macro used by all MPIDI_NM_mpi_send routines to facilitate tuning */
Expand Down

0 comments on commit fc1cc26

Please sign in to comment.