Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ch4/ofi: Lazily register FI_MULTI_RECV buffers #6422

Merged
merged 4 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions src/include/mpir_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#ifndef MPIR_GPU_H_INCLUDED
#define MPIR_GPU_H_INCLUDED

/* mpidpre.h and mpir_thread.h are needed by mpir_cvars.h */
#include "mpidpre.h"
#include "mpir_thread.h"
#include "mpir_cvars.h"
#include "mpir_err.h"

/*
Expand Down Expand Up @@ -40,6 +44,19 @@
for temporary buffers. When stream workq and GPU wait kernels are
in use, access APIs for GPU registered memory may cause deadlock.

- name : MPIR_CVAR_ENABLE_GPU_REGISTER
category : GPU
type : boolean
default : true
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
Control whether to actually register buffers with the GPU runtime in
MPIR_gpu_register_host. This could lower the latency of certain GPU
communication at the cost of some amount of GPU memory consumed by
the MPI library. By default, registration is enabled.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

Expand Down Expand Up @@ -97,15 +114,15 @@ MPL_STATIC_INLINE_PREFIX bool MPIR_GPU_query_pointer_is_dev(const void *ptr)

MPL_STATIC_INLINE_PREFIX int MPIR_gpu_register_host(const void *ptr, size_t size)
{
    /* Register a host buffer with the GPU runtime so the device driver can
     * DMA directly to/from it.  Registration is skipped (returning
     * MPI_SUCCESS) when GPU support is compiled out (ENABLE_GPU == 0) or
     * when the user disabled it via MPIR_CVAR_ENABLE_GPU_REGISTER.
     *
     * ptr  - start of the host buffer to register
     * size - length of the buffer in bytes
     * Returns MPI_SUCCESS or the error code from MPL_gpu_register_host. */
    if (ENABLE_GPU && MPIR_CVAR_ENABLE_GPU_REGISTER) {
        return MPL_gpu_register_host(ptr, size);
    }
    return MPI_SUCCESS;
}

MPL_STATIC_INLINE_PREFIX int MPIR_gpu_unregister_host(const void *ptr)
{
if (ENABLE_GPU) {
if (ENABLE_GPU && MPIR_CVAR_ENABLE_GPU_REGISTER) {
return MPL_gpu_unregister_host(ptr);
}
return MPI_SUCCESS;
Expand Down
27 changes: 18 additions & 9 deletions src/mpid/ch4/netmod/ofi/ofi_am_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,9 +490,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_am_isend_eager(int rank, MPIR_Comm * c

MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(buf, &attr);
if (attr.type == MPL_GPU_POINTER_DEV && !MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
if (attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
}
}
} else {
data_sz = MPIDI_OFI_AMREQUEST(sreq, deferred_req)->data_sz;
Expand Down Expand Up @@ -647,9 +650,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_am_isend_pipeline(int rank, MPIR_Comm

MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(buf, &attr);
if (attr.type == MPL_GPU_POINTER_DEV && !MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
if (attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
}
}
offset = 0;
} else {
Expand Down Expand Up @@ -743,9 +749,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_am_isend_rdma_read(int rank, MPIR_Comm

MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(buf, &attr);
if (attr.type == MPL_GPU_POINTER_DEV && !MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
if (attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
need_packing = true;
}
}
} else {
data_sz = MPIDI_OFI_AMREQUEST(sreq, deferred_req)->data_sz;
Expand Down
19 changes: 19 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,25 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_get_buffered(int vni, struct fi_cq_tagged
return num;
}

MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_register_am_bufs(void)
{
if (!MPIDI_OFI_global.am_bufs_registered) {
for (int i = 0; i < MPIDI_OFI_global.num_vnis; i++) {
MPIR_gpu_register_host(MPIDI_OFI_global.per_vni[i].am_bufs,
MPIDI_OFI_AM_BUFF_SZ * MPIDI_OFI_NUM_AM_BUFFERS);
}
}
}

MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_unregister_am_bufs(void)
{
if (MPIDI_OFI_global.am_bufs_registered) {
for (int i = 0; i < MPIDI_OFI_global.num_vnis; i++) {
MPIR_gpu_unregister_host(MPIDI_OFI_global.per_vni[i].am_bufs);
}
}
}

#undef CQ_S_LIST
#undef CQ_S_HEAD
#undef CQ_S_TAIL
Expand Down
12 changes: 7 additions & 5 deletions src/mpid/ch4/netmod/ofi/ofi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -897,8 +897,8 @@ int MPIDI_OFI_mpi_finalize_hook(void)
MPIDIU_map_destroy(MPIDI_OFI_global.per_vni[vni].am_send_seq_tracker);
MPIDIU_map_destroy(MPIDI_OFI_global.per_vni[vni].am_recv_seq_tracker);

for (i = 0; i < MPIDI_OFI_NUM_AM_BUFFERS; i++)
MPIR_gpu_free_host(MPIDI_OFI_global.per_vni[vni].am_bufs[i]);
MPIDI_OFI_unregister_am_bufs();
MPL_free(MPIDI_OFI_global.per_vni[vni].am_bufs);

MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.per_vni[vni].am_hdr_buf_pool);

Expand Down Expand Up @@ -1542,13 +1542,15 @@ int ofi_am_post_recv(int vni, int nic)
FI_OPT_ENDPOINT,
FI_OPT_MIN_MULTI_RECV, &optlen, sizeof(optlen)), setopt);

/* we allocate a single buffer and post recvs using an offset */
MPIDI_OFI_global.per_vni[vni].am_bufs =
MPL_malloc(MPIDI_OFI_AM_BUFF_SZ * MPIDI_OFI_NUM_AM_BUFFERS, MPL_MEM_BUFFER);
for (int i = 0; i < MPIDI_OFI_NUM_AM_BUFFERS; i++) {
MPIR_gpu_malloc_host(&(MPIDI_OFI_global.per_vni[vni].am_bufs[i]), MPIDI_OFI_AM_BUFF_SZ);
MPIDI_OFI_global.per_vni[vni].am_reqs[i].event_id = MPIDI_OFI_EVENT_AM_RECV;
MPIDI_OFI_global.per_vni[vni].am_reqs[i].index = i;
MPIR_Assert(MPIDI_OFI_global.per_vni[vni].am_bufs[i]);
MPIR_Assert(MPIDI_OFI_global.per_vni[vni].am_bufs);
MPIDI_OFI_global.per_vni[vni].am_iov[i].iov_base =
MPIDI_OFI_global.per_vni[vni].am_bufs[i];
(char *) MPIDI_OFI_global.per_vni[vni].am_bufs + (MPIDI_OFI_AM_BUFF_SZ * i);
MPIDI_OFI_global.per_vni[vni].am_iov[i].iov_len = MPIDI_OFI_AM_BUFF_SZ;
MPIDI_OFI_global.per_vni[vni].am_msg[i].msg_iov =
&MPIDI_OFI_global.per_vni[vni].am_iov[i];
Expand Down
1 change: 1 addition & 0 deletions src/mpid/ch4/netmod/ofi/ofi_recv.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,
MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(recv_buf, &attr);
if (data_sz && attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* FIXME: at this point, GPU data takes host-buffer staging
* path for the whole chunk. For large memory size, pipeline
Expand Down
10 changes: 10 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_rma.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_put(const void *origin_addr,

if (!MPIDI_OFI_ENABLE_RMA || !(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno = MPIDIG_mpi_put(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, win);
goto fn_exit;
Expand Down Expand Up @@ -496,6 +497,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_get(void *origin_addr,

if (!MPIDI_OFI_ENABLE_RMA || !(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno = MPIDIG_mpi_get(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, win);
goto fn_exit;
Expand Down Expand Up @@ -528,6 +530,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_rput(const void *origin_addr,

if (!MPIDI_OFI_ENABLE_RMA || !(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno = MPIDIG_mpi_rput(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, win, request);
goto fn_exit;
Expand Down Expand Up @@ -584,6 +587,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_compare_and_swap(const void *origin_ad
MPIR_GPU_query_pointer_is_dev(origin_addr) ||
MPIR_GPU_query_pointer_is_dev(compare_addr) ||
MPIR_GPU_query_pointer_is_dev(result_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_compare_and_swap(origin_addr, compare_addr, result_addr, datatype,
target_rank, target_disp, win);
Expand Down Expand Up @@ -971,6 +975,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_raccumulate(const void *origin_addr,
#endif
!MPIDI_OFI_ENABLE_RMA || !MPIDI_OFI_ENABLE_ATOMICS ||
!(winattr & MPIDI_WINATTR_NM_REACHABLE) || MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_raccumulate(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, op, win, request);
Expand Down Expand Up @@ -1021,6 +1026,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_rget_accumulate(const void *origin_add
!(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr) ||
MPIR_GPU_query_pointer_is_dev(result_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_rget_accumulate(origin_addr, origin_count, origin_datatype, result_addr,
result_count, result_datatype, target_rank, target_disp,
Expand Down Expand Up @@ -1075,6 +1081,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_fetch_and_op(const void *origin_addr,
!(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr) ||
MPIR_GPU_query_pointer_is_dev(result_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_fetch_and_op(origin_addr, result_addr, datatype, target_rank, target_disp,
op, win);
Expand Down Expand Up @@ -1167,6 +1174,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_rget(void *origin_addr,

if (!MPIDI_OFI_ENABLE_RMA || !(winattr & MPIDI_WINATTR_NM_REACHABLE) ||
MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno = MPIDIG_mpi_rget(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, win, request);
goto fn_exit;
Expand Down Expand Up @@ -1213,6 +1221,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_get_accumulate(const void *origin_addr
!MPIDI_OFI_ENABLE_RMA || !MPIDI_OFI_ENABLE_ATOMICS ||
!(winattr & MPIDI_WINATTR_NM_REACHABLE) || MPIR_GPU_query_pointer_is_dev(origin_addr) ||
MPIR_GPU_query_pointer_is_dev(result_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_get_accumulate(origin_addr, origin_count, origin_datatype, result_addr,
result_count, result_datatype, target_rank, target_disp,
Expand Down Expand Up @@ -1254,6 +1263,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_accumulate(const void *origin_addr,
#endif
!MPIDI_OFI_ENABLE_RMA || !MPIDI_OFI_ENABLE_ATOMICS ||
!(winattr & MPIDI_WINATTR_NM_REACHABLE) || MPIR_GPU_query_pointer_is_dev(origin_addr)) {
MPIDI_OFI_register_am_bufs();
mpi_errno =
MPIDIG_mpi_accumulate(origin_addr, origin_count, origin_datatype, target_rank,
target_disp, target_count, target_datatype, op, win);
Expand Down
2 changes: 2 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou
MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(send_buf, &attr);
if (data_sz && attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force packing of GPU buffer in host memory */
/* FIXME: at this point, GPU data takes host-buffer staging
Expand Down Expand Up @@ -414,6 +415,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
MPL_pointer_attr_t attr;
MPIR_GPU_query_pointer_attr(send_buf, &attr);
if (attr.type == MPL_GPU_POINTER_DEV) {
MPIDI_OFI_register_am_bufs();
if (!MPIDI_OFI_ENABLE_HMEM) {
/* Force pack for GPU buffer. */
void *host_buf = NULL;
Expand Down
3 changes: 2 additions & 1 deletion src/mpid/ch4/netmod/ofi/ofi_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ typedef struct MPIDI_OFI_cq_list_t {
typedef struct {
struct iovec am_iov[MPIDI_OFI_MAX_NUM_AM_BUFFERS];
struct fi_msg am_msg[MPIDI_OFI_MAX_NUM_AM_BUFFERS];
void *am_bufs[MPIDI_OFI_MAX_NUM_AM_BUFFERS];
void *am_bufs;
MPIDI_OFI_am_repost_request_t am_reqs[MPIDI_OFI_MAX_NUM_AM_BUFFERS];

MPIDU_genq_private_pool_t am_hdr_buf_pool;
Expand Down Expand Up @@ -371,6 +371,7 @@ typedef struct {
int num_close_nics;
int num_comms_enabled_striping; /* Number of active communicators with striping enabled */
int num_comms_enabled_hashing; /* Number of active communicators with hashing enabled */
bool am_bufs_registered; /* whether active message buffers are GPU registered */

/* Window/RMA Globals */
void *win_map;
Expand Down
30 changes: 7 additions & 23 deletions src/mpid/common/genq/mpidu_genq_shmem_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,6 @@

By default, the queue will continue to use sender-side queues until the performance impact
is verified.

- name : MPIR_CVAR_GENQ_SHMEM_POOL_GPU_REGISTER
category : CH4
type : boolean
default : false
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
Control whether to register the shmem pool with the GPU runtime. This could lower the
latency of small GPU-to-GPU messages at the cost of some amount of GPU memory consumed
by the MPI library. By default, we do not register with the GPU since we expect most
GPU-to-GPU messages will take the IPC path.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

Expand Down Expand Up @@ -185,16 +171,14 @@ int MPIDU_genqi_shmem_pool_register(MPIDU_genqi_shmem_pool_s * pool_obj)

MPIR_FUNC_ENTER;

if (MPIR_CVAR_GENQ_SHMEM_POOL_GPU_REGISTER) {
int total_cells_size =
pool_obj->num_proc * pool_obj->cells_per_proc * pool_obj->cell_alloc_size;
int free_queue_size = pool_obj->num_proc * sizeof(MPIDU_genq_shmem_queue_u);
uintptr_t slab_size = total_cells_size + free_queue_size;
int total_cells_size =
pool_obj->num_proc * pool_obj->cells_per_proc * pool_obj->cell_alloc_size;
int free_queue_size = pool_obj->num_proc * sizeof(MPIDU_genq_shmem_queue_u);
uintptr_t slab_size = total_cells_size + free_queue_size;

rc = MPIR_gpu_register_host(pool_obj->slab, slab_size);
MPIR_ERR_CHECK(rc);
pool_obj->gpu_registered = true;
}
rc = MPIR_gpu_register_host(pool_obj->slab, slab_size);
MPIR_ERR_CHECK(rc);
pool_obj->gpu_registered = true;

fn_fail:
MPIR_FUNC_EXIT;
Expand Down