Skip to content

Commit

Permalink
ch4/ofi: use pack buffer pool for small messages
Browse files Browse the repository at this point in the history
When the message size is small enough, use the genq pack buffer pool
rather than allocating a GPU-registered buffer ad hoc. This should avoid
the overhead of buffer registration.

It also avoids calling e.g. cudaFreeHost, which would prevent its use in
a stream workq while a wait kernel is running.
  • Loading branch information
hzhou committed Jul 6, 2022
1 parent fb9d751 commit fc1cc26
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 11 deletions.
4 changes: 3 additions & 1 deletion src/mpid/ch4/netmod/ofi/ofi_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_event(int vni,
if (c == 0) {
if ((event_id == MPIDI_OFI_EVENT_SEND_PACK) &&
(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer))) {
MPIR_gpu_free_host(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
MPI_Aint data_sz = MPIDI_OFI_REQUEST(sreq, noncontig.pack.data_sz);
MPIDI_OFI_free_pack_buf(vni, data_sz,
MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
} else if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && (event_id == MPIDI_OFI_EVENT_SEND_NOPACK))
MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig.nopack));

Expand Down
18 changes: 18 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -713,4 +713,22 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_get_buffered(int vni, struct fi_cq_tagged
#undef CQ_D_HEAD
#undef CQ_D_TAIL

/* Allocate a pack buffer for a message of `size` bytes on vni `vni`.
 * Small messages (at most MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) are served from
 * the per-vni genq pack buffer pool; larger messages fall back to an ad-hoc
 * GPU-registered host allocation. Returns an MPI error code; the buffer is
 * stored in *buf. Pair with MPIDI_OFI_free_pack_buf using the same size. */
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_alloc_pack_buf(int vni, MPI_Aint size, void **buf)
{
    if (size > MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
        /* too large for a pool cell: registered host buffer, allocated ad hoc */
        return MPIR_gpu_malloc_host(buf, size);
    }
    return MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vni[vni].pack_buf_pool, buf);
}

/* Release a pack buffer previously obtained from MPIDI_OFI_alloc_pack_buf.
 * `size` must be the size passed at allocation time: it selects between
 * returning the cell to the per-vni genq pool (small) and freeing the
 * ad-hoc GPU-registered host buffer (large). Returns an MPI error code. */
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_free_pack_buf(int vni, MPI_Aint size, void *buf)
{
    if (size > MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE) {
        /* was allocated outside the pool; free the registered host buffer */
        return MPIR_gpu_free_host(buf);
    }
    return MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vni[vni].pack_buf_pool, buf);
}

#endif /* OFI_IMPL_H_INCLUDED */
1 change: 1 addition & 0 deletions src/mpid/ch4/netmod/ofi/ofi_pre.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ typedef struct {
void *buf;
size_t count;
MPI_Datatype datatype;
MPI_Aint data_sz;
char *pack_buffer;
} pack;
struct iovec *nopack;
Expand Down
19 changes: 9 additions & 10 deletions src/mpid/ch4/netmod/ofi/ofi_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou
/* Pack */
MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_PACK;

/* FIXME: allocating a GPU registered host buffer adds some additional overhead.
* However, once the new buffer pool infrastructure is setup, we would simply be
* allocating a buffer from the pool, so whether it's a regular malloc buffer or a GPU
* registered buffer should be equivalent with respect to performance. */
MPIR_gpu_malloc_host((void **) &MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer),
data_sz);
MPIDI_OFI_alloc_pack_buf(vni_local, data_sz,
(void **) &MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer));
MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer) == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send Pack buffer alloc");

Expand Down Expand Up @@ -409,7 +405,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force pack for GPU buffer. */
void *host_buf = NULL;
MPIR_gpu_malloc_host(&host_buf, data_sz);
mpi_errno = MPIDI_OFI_alloc_pack_buf(vni_src, data_sz, &host_buf);
MPIR_ERR_CHECK(mpi_errno);
MPIR_Typerep_pack(buf, count, datatype, 0, host_buf, data_sz, &actual_pack_bytes,
MPIR_TYPEREP_FLAG_NONE);
MPIR_Assert(actual_pack_bytes == data_sz);
Expand All @@ -419,9 +416,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
mpi_errno = MPIDI_OFI_send_lightweight(send_buf, data_sz, cq_data, dst_rank, tag, comm,
context_offset, addr, vni_src, vni_dst);
if (actual_pack_bytes > 0) {
/* Free stage host buf (assigned to send_buf already) after
* lightweight_send. */
MPIR_gpu_free_host(send_buf);
mpi_errno = MPIDI_OFI_free_pack_buf(vni_src, data_sz, send_buf);
MPIR_ERR_CHECK(mpi_errno);
}
if (!noreq) {
*request = MPIR_Request_create_complete(MPIR_REQUEST_KIND__SEND);
Expand All @@ -432,8 +428,11 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
dt_contig, data_sz, dt_ptr, dt_true_lb, syncflag);
}

fn_exit:
MPIR_FUNC_EXIT;
return mpi_errno;
fn_fail:
goto fn_exit;
}

/* Common macro used by all MPIDI_NM_mpi_send routines to facilitate tuning */
Expand Down

0 comments on commit fc1cc26

Please sign in to comment.