From df052f56c76e50ac1ef8287082be1c2844ebbe93 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 21 Jul 2022 16:07:44 -0500 Subject: [PATCH] gpu: add cvar MPIR_CVAR_GPU_HAS_WAIT_KERNEL Add cvar MPIR_CVAR_GPU_HAS_WAIT_KERNEL to supply yaksa_init with info hint "yaksa_has_wait_kernel". With the hint, yaksa should avoid code that may potentially deadlock with a wait kernel. --- src/include/mpir_gpu.h | 12 ++++++++++++ src/mpi/datatype/typerep/src/typerep_yaksa_init.c | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/include/mpir_gpu.h b/src/include/mpir_gpu.h index f5e33e71f4f..e78e44fe9a7 100644 --- a/src/include/mpir_gpu.h +++ b/src/include/mpir_gpu.h @@ -28,6 +28,18 @@ and we do not query the buffer type internally because we assume no GPU buffer is use. + - name : MPIR_CVAR_GPU_HAS_WAIT_KERNEL + category : GPU + type : int + default : 0 + class : none + verbosity : MPI_T_VERBOSITY_USER_BASIC + scope : MPI_T_SCOPE_ALL_EQ + description : >- + If set to 1, avoid allocating GPU registered host buffers + for temporary buffers. When stream workq and GPU wait kernels are + in use, access APIs for GPU registered memory may cause deadlock. + === END_MPI_T_CVAR_INFO_BLOCK === */ diff --git a/src/mpi/datatype/typerep/src/typerep_yaksa_init.c b/src/mpi/datatype/typerep/src/typerep_yaksa_init.c index 2c0813fae6b..fa49791776b 100644 --- a/src/mpi/datatype/typerep/src/typerep_yaksa_init.c +++ b/src/mpi/datatype/typerep/src/typerep_yaksa_init.c @@ -404,7 +404,13 @@ void MPIR_Typerep_init(void) yaksa_info_keyval_append(MPII_yaksa_info_nogpu, "yaksa_gpu_driver", "nogpu", 6); if (MPIR_CVAR_ENABLE_GPU) { - yaksa_init(NULL); + yaksa_info_t info = NULL; + if (MPIR_CVAR_GPU_HAS_WAIT_KERNEL) { + yaksa_info_create(&info); + yaksa_info_keyval_append(info, "yaksa_has_wait_kernel", "1", 2); + } + + yaksa_init(info); } else { /* prevent yaksa to query gpu devices, which can be very expensive */ yaksa_init(MPII_yaksa_info_nogpu);