From 8fae8fbc24c7dbfd99cbfba2c98fbaebade102fb Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 22 Feb 2017 13:13:19 -0600 Subject: [PATCH 01/14] CH4: Remove redundant autoconf macros PAC_ARG_SHARED_MEMORY is already provided/checked in MPL. There is no need to add it again in CH4. Signed-off-by: Lena Oden --- src/mpid/ch4/subconfigure.m4 | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mpid/ch4/subconfigure.m4 b/src/mpid/ch4/subconfigure.m4 index 8ac981845e1..d1966cb9d6d 100644 --- a/src/mpid/ch4/subconfigure.m4 +++ b/src/mpid/ch4/subconfigure.m4 @@ -371,14 +371,11 @@ if test "$enable_ch4r_per_comm_msg_queue" = "yes" ; then [Define if CH4U will use per-communicator message queues]) fi -PAC_ARG_SHARED_MEMORY - AC_CONFIG_FILES([ src/mpid/ch4/src/mpid_ch4_net_array.c src/mpid/ch4/include/netmodpre.h src/mpid/ch4/include/shmpre.h ]) -PAC_ARG_SHARED_MEMORY ])dnl end AM_COND_IF(BUILD_CH4,...) AM_CONDITIONAL([BUILD_CH4_SHM],[test "$ch4_shm_level" = "yes" -o "$ch4_shm_level" = "exclusive"]) From c2a8387eabf4f5985900aa451c666005ae354509 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 22 Feb 2017 13:39:08 -0600 Subject: [PATCH 02/14] CH4: Move checks inside BUILD_CH4 conditional These configure checks should only be run if CH4 is selected to be built. Otherwise they will pollute the configuration. Signed-off-by: Lena Oden --- src/mpid/ch4/subconfigure.m4 | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/mpid/ch4/subconfigure.m4 b/src/mpid/ch4/subconfigure.m4 index d1966cb9d6d..c8a79a45d68 100644 --- a/src/mpid/ch4/subconfigure.m4 +++ b/src/mpid/ch4/subconfigure.m4 @@ -371,15 +371,6 @@ if test "$enable_ch4r_per_comm_msg_queue" = "yes" ; then [Define if CH4U will use per-communicator message queues]) fi -AC_CONFIG_FILES([ -src/mpid/ch4/src/mpid_ch4_net_array.c -src/mpid/ch4/include/netmodpre.h -src/mpid/ch4/include/shmpre.h -]) -])dnl end AM_COND_IF(BUILD_CH4,...) - -AM_CONDITIONAL([BUILD_CH4_SHM],[test "$ch4_shm_level" = "yes" -o "$ch4_shm_level" = "exclusive"]) - AC_CHECK_HEADERS(sys/mman.h sys/stat.h fcntl.h) gl_FUNC_RANDOM_R @@ -390,6 +381,14 @@ else AC_MSG_NOTICE([Using a non-symmetric heap]) fi +AC_CONFIG_FILES([ +src/mpid/ch4/src/mpid_ch4_net_array.c +src/mpid/ch4/include/netmodpre.h +src/mpid/ch4/include/shmpre.h +]) +])dnl end AM_COND_IF(BUILD_CH4,...) + +AM_CONDITIONAL([BUILD_CH4_SHM],[test "$ch4_shm_level" = "yes" -o "$ch4_shm_level" = "exclusive"]) ])dnl end _BODY From cda0d6e35fd5fea0d927df0f385ed85ee1773ab9 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 22 Feb 2017 13:40:40 -0600 Subject: [PATCH 03/14] CH4: Check for mmap CH4 requires mmap. We should check for it at configure time instead of blowing up at compile time if it is not available. Signed-off-by: Lena Oden --- src/mpid/ch4/subconfigure.m4 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpid/ch4/subconfigure.m4 b/src/mpid/ch4/subconfigure.m4 index c8a79a45d68..ba940f994ef 100644 --- a/src/mpid/ch4/subconfigure.m4 +++ b/src/mpid/ch4/subconfigure.m4 @@ -372,6 +372,7 @@ if test "$enable_ch4r_per_comm_msg_queue" = "yes" ; then fi AC_CHECK_HEADERS(sys/mman.h sys/stat.h fcntl.h) +AC_CHECK_FUNC(mmap, [], [AC_MSG_ERROR(mmap is required to build CH4)]) gl_FUNC_RANDOM_R if test "$HAVE_RANDOM_R" = "1" -a "$HAVE_STRUCT_RANDOM_DATA" = "1" ; then From 97d901a5bbb3adf30995f88606aa08a88ee7bd56 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 22 Feb 2017 13:14:18 -0600 Subject: [PATCH 04/14] CH4: Check for gethostname CH4 can use gethostname if it is available. Signed-off-by: Lena Oden --- src/mpid/ch4/subconfigure.m4 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mpid/ch4/subconfigure.m4 b/src/mpid/ch4/subconfigure.m4 index ba940f994ef..9a68c488c7e 100644 --- a/src/mpid/ch4/subconfigure.m4 +++ b/src/mpid/ch4/subconfigure.m4 @@ -382,6 +382,12 @@ else AC_MSG_NOTICE([Using a non-symmetric heap]) fi +AC_CHECK_FUNCS(gethostname) +if test "$ac_cv_func_gethostname" = "yes" ; then + # Do we need to declare gethostname? + PAC_FUNC_NEEDS_DECL([#include ],gethostname) +fi + AC_CONFIG_FILES([ src/mpid/ch4/src/mpid_ch4_net_array.c src/mpid/ch4/include/netmodpre.h From 683871e1eff3ea7ee4df4f3cc8c8b2e141e820f8 Mon Sep 17 00:00:00 2001 From: "Oblomov, Sergey" Date: Mon, 6 Mar 2017 12:48:28 +0300 Subject: [PATCH 05/14] CH4/OFI: Fix potential crash in AM mode - when atomics are not enabled (wrapped into active messaging) in this case fi_atomics* should not be called because it may crash (capabilities are not requested/provided and OFI atomics infrastructure may be unintialized) Change-Id: Ia6ef8914462fd8e43d10be2a67961455c589c704 Signed-off-by: Ken Raffenetti --- src/mpid/ch4/netmod/ofi/util.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index 9daa8b65ab7..3b31670800e 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -700,6 +700,10 @@ static inline void create_dt_map() dtsize[FI_LONG_DOUBLE] = sizeof(long double); dtsize[FI_LONG_DOUBLE_COMPLEX] = sizeof(long double complex); + /* when atomics are disabled and atomics capability are not + * enabled call fo fi_atomic*** may crash */ + MPIR_Assert(MPIDI_OFI_ENABLE_ATOMICS); + for (i = 0; i < MPIDI_OFI_DT_SIZES; i++) for (j = 0; j < MPIDI_OFI_OP_SIZES; j++) { enum fi_datatype fi_dt = (enum fi_datatype) -1; @@ -817,5 +821,8 @@ void MPIDI_OFI_index_datatypes() add_index(MPI_LONG_INT, &index); add_index(MPI_SHORT_INT, &index); /* 60 */ add_index(MPI_LONG_DOUBLE_INT, &index); - create_dt_map(); + + /* do not generate map when atomics are not enabled */ + if(MPIDI_OFI_ENABLE_ATOMICS) + create_dt_map(); } From a825760b9c364630af6657a759cbcd92041fd8b5 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 8 Mar 2017 14:12:18 -0600 Subject: [PATCH 06/14] mpl: Fix permissions on uthash header No reviewer. --- src/mpl/include/mpl_uthash.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/mpl/include/mpl_uthash.h diff --git a/src/mpl/include/mpl_uthash.h b/src/mpl/include/mpl_uthash.h old mode 100755 new mode 100644 From 0ca9ad085ef9d5a7decc0f67c3a727f0ca1918b2 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Thu, 19 Jan 2017 15:18:07 +0900 Subject: [PATCH 07/14] mpid/common/shm: Add counter for segment allocation Without some way to distinguish each shared memory segment allocation, we will run into key collisions in the PMI KVS space. Add a counter to uniquely identify each new segment as it is created. Signed-off-by: Pavan Balaji --- src/mpid/common/shm/mpidu_shm_alloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mpid/common/shm/mpidu_shm_alloc.c b/src/mpid/common/shm/mpidu_shm_alloc.c index 9ff2ab6ad93..f3e83118e60 100644 --- a/src/mpid/common/shm/mpidu_shm_alloc.c +++ b/src/mpid/common/shm/mpidu_shm_alloc.c @@ -50,6 +50,8 @@ static int check_alloc(MPIDU_shm_seg_t *memory, MPIDU_shm_barrier_t *barrier, static size_t segment_len = 0; +static int num_segments = 0; + typedef struct asym_check_region { void *base_ptr; @@ -337,7 +339,7 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t *memory, MPIDU_shm_barrier_t **barrier, /* post name of shared file */ MPIR_Assert(local_procs_0 == rank); - MPL_snprintf(key, key_max_sz, "sharedFilename[%i]", rank); + MPL_snprintf(key, key_max_sz, "sharedFilename[%i]-%i", rank, num_segments); mpi_errno = MPL_shm_hnd_get_serialized_by_ref(memory->hnd, &serialized_hnd); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP (mpi_errno); @@ -367,7 +369,7 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t *memory, MPIDU_shm_barrier_t **barrier, MPIR_ERR_CHKANDJUMP1 (pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**pmi_barrier", "**pmi_barrier %d", pmi_errno); /* get name of shared file */ - MPL_snprintf(key, key_max_sz, "sharedFilename[%i]", local_procs_0); + MPL_snprintf(key, key_max_sz, "sharedFilename[%i]-%i", local_procs_0, num_segments); pmi_errno = PMI_KVS_Get(kvs_name, key, val, val_max_sz); MPIR_ERR_CHKANDJUMP1(pmi_errno != PMI_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get", "**pmi_kvs_get %d", pmi_errno); @@ -401,6 +403,8 @@ int MPIDU_shm_seg_commit(MPIDU_shm_seg_t *memory, MPIDU_shm_barrier_t **barrier, memory->symmetrical = 0 ; } #endif + num_segments++; + /* assign sections of the shared memory segment to their pointers */ start_addr = current_addr; From c462e9bce634fc2a492b6c0654facf8ed95f6ce8 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Tue, 17 Jan 2017 15:13:14 +0900 Subject: [PATCH 08/14] CH4/OFI: Optimize business card retrieval Exploit shared memory to avoid having each process get and store a full copy of the business cards for MPI_COMM_WORLD. Use a shared memory segment instead, where each process on a node retrieves a smaller portion of the total number of cards. Effectively reduces memory consumption and PMI traffic per node. Signed-off-by: Pavan Balaji --- src/mpid/ch4/netmod/ofi/ofi_init.h | 37 +++++++++++++++++++++++------- src/mpid/ch4/src/ch4_init.h | 14 +++++------ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.h b/src/mpid/ch4/netmod/ofi/ofi_init.h index fda8ea94e93..fd242c4f644 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.h +++ b/src/mpid/ch4/netmod/ofi/ofi_init.h @@ -14,6 +14,7 @@ #include "ofi_impl.h" #include "mpir_cvars.h" #include "pmi.h" +#include "mpidu_shm.h" /* === BEGIN_MPI_T_CVAR_INFO_BLOCK === @@ -714,6 +715,10 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, &MPIDI_Global.ep, 0)); if (do_av_insert) { + int local_rank, num_local = 0, local_rank_0 = -1; + MPIDU_shm_seg_t memory; + MPIDU_shm_barrier_t *barrier; + int start, end; /* ---------------------------------- */ /* Get our endpoint name and publish */ @@ -742,16 +747,31 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Put(MPIDI_Global.kvsname, keyS, val), pmi); MPIDI_OFI_PMI_CALL_POP(PMI_KVS_Commit(MPIDI_Global.kvsname), pmi); #endif - MPIDI_OFI_PMI_CALL_POP(PMI_Barrier(), pmi); + + for (i = 0; i < size; i++) { + if (MPIDI_CH4_rank_is_local(i, MPIR_Process.comm_world)) { + if (i == rank) + local_rank = num_local; + + if (local_rank_0 == -1) + local_rank_0 = i; + + num_local++; + } + } + + MPIDU_shm_seg_alloc(size * MPIDI_Global.addrnamelen, (void **)&table); + MPIDU_shm_seg_commit(&memory, &barrier, num_local, local_rank, local_rank_0, rank); /* -------------------------------- */ /* Create our address table from */ /* encoded KVS values */ /* -------------------------------- */ - table = (char *) MPL_malloc(size * MPIDI_Global.addrnamelen); - maxlen = MPIDI_KVSAPPSTRLEN; - - for (i = 0; i < size; i++) { + start = local_rank * (size / num_local); + end = start + (size / num_local); + if (local_rank == num_local - 1) + end += size % num_local; + for (i = start; i < end; i++) { sprintf(keyS, "OFI-%d", i); #ifdef USE_CRAYPMI_API MPIDI_OFI_PMI_CALL_POP(PMI2_KVS_Get @@ -764,6 +784,7 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, (valS, "OFI", (char *) &table[i * MPIDI_Global.addrnamelen], MPIDI_Global.addrnamelen, &maxlen), buscard_len); } + PMI_Barrier(); /* -------------------------------- */ /* Table is constructed. Map it */ @@ -775,6 +796,9 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest = mapped_table[i]; } MPL_free(mapped_table); + + PMI_Barrier(); + MPIDU_shm_seg_destroy(&memory, num_local); } /* -------------------------------- */ @@ -870,9 +894,6 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, fi_freeinfo(hints); - if (table) - MPL_free(table); - MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NETMOD_OFI_INIT); return mpi_errno; fn_fail: diff --git a/src/mpid/ch4/src/ch4_init.h b/src/mpid/ch4/src/ch4_init.h index 4b0971c52d2..60e1c2994b0 100644 --- a/src/mpid/ch4/src/ch4_init.h +++ b/src/mpid/ch4/src/ch4_init.h @@ -213,13 +213,6 @@ MPL_STATIC_INLINE_PREFIX int MPID_Init(int *argc, MPIR_Datatype_init(); MPIR_Group_init(); - mpi_errno = MPIDI_NM_mpi_init_hook(rank, size, appnum, &MPIR_Process.attrs.tag_ub, - MPIR_Process.comm_world, - MPIR_Process.comm_self, has_parent, 1, &netmod_contexts); - if (mpi_errno != MPI_SUCCESS) { - MPIR_ERR_POPFATAL(mpi_errno); - } - #ifdef MPIDI_BUILD_CH4_LOCALITY_INFO int i; for (i = 0; i < MPIR_Process.comm_world->local_size; i++) { @@ -245,6 +238,13 @@ MPL_STATIC_INLINE_PREFIX int MPID_Init(int *argc, } #endif + mpi_errno = MPIDI_NM_mpi_init_hook(rank, size, appnum, &MPIR_Process.attrs.tag_ub, + MPIR_Process.comm_world, + MPIR_Process.comm_self, has_parent, 1, &netmod_contexts); + if (mpi_errno != MPI_SUCCESS) { + MPIR_ERR_POPFATAL(mpi_errno); + } + #ifdef MPIDI_BUILD_CH4_SHM mpi_errno = MPIDI_SHM_mpi_init_hook(rank, size); From e3f13fa248d3aba6b0cb8a3963d890b18ad9f66d Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Fri, 20 Jan 2017 11:57:41 +0900 Subject: [PATCH 09/14] pm/hydra: Remove check for duplicate PMI keys Move duplicate key checking at the PMI server level to a preprocessor block only built when error checking is enabled. Makes key insertion constant time in a default build, which will improve address exchange time with many MPI processes. Signed-off-by: Pavan Balaji --- src/pm/hydra/configure.ac | 1 + src/pm/hydra/pm/pmiserv/common.c | 11 ++++++++++- src/pm/hydra/pm/pmiserv/common.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/pm/hydra/configure.ac b/src/pm/hydra/configure.ac index 89e2812ded7..e0a45be184d 100644 --- a/src/pm/hydra/configure.ac +++ b/src/pm/hydra/configure.ac @@ -658,6 +658,7 @@ for option in $enable_g ; do all) PAC_APPEND_FLAG(-g, CFLAGS) AC_DEFINE(USE_MEMORY_TRACING,1,[Define if memory tracing is enabled]) + AC_DEFINE(PMI_KEY_CHECK,1,[Define if we should check for PMI key collisions]) ;; *) ;; diff --git a/src/pm/hydra/pm/pmiserv/common.c b/src/pm/hydra/pm/pmiserv/common.c index 7ec397a1d40..baa61f65eb5 100644 --- a/src/pm/hydra/pm/pmiserv/common.c +++ b/src/pm/hydra/pm/pmiserv/common.c @@ -128,6 +128,7 @@ HYD_status HYD_pmcd_pmi_allocate_kvs(struct HYD_pmcd_pmi_kvs ** kvs, int pgid) HYDU_MALLOC_OR_JUMP(*kvs, struct HYD_pmcd_pmi_kvs *, sizeof(struct HYD_pmcd_pmi_kvs), status); MPL_snprintf((*kvs)->kvsname, PMI_MAXKVSLEN, "kvs_%d_%d", (int) getpid(), pgid); (*kvs)->key_pair = NULL; + (*kvs)->tail = NULL; fn_exit: HYDU_FUNC_EXIT(); @@ -156,7 +157,7 @@ void HYD_pmcd_free_pmi_kvs_list(struct HYD_pmcd_pmi_kvs *kvs_list) HYD_status HYD_pmcd_pmi_add_kvs(const char *key, char *val, struct HYD_pmcd_pmi_kvs *kvs, int *ret) { - struct HYD_pmcd_pmi_kvs_pair *key_pair, *run, *last; + struct HYD_pmcd_pmi_kvs_pair *key_pair; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); @@ -171,8 +172,12 @@ HYD_status HYD_pmcd_pmi_add_kvs(const char *key, char *val, struct HYD_pmcd_pmi_ if (kvs->key_pair == NULL) { kvs->key_pair = key_pair; + kvs->tail = key_pair; } else { +#ifdef PMI_KEY_CHECK + struct HYD_pmcd_pmi_kvs_pair *run, *last; + for (run = kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key_pair->key)) { /* duplicate key found */ @@ -183,6 +188,10 @@ HYD_status HYD_pmcd_pmi_add_kvs(const char *key, char *val, struct HYD_pmcd_pmi_ } /* Add key_pair to end of list. */ last->next = key_pair; +#else + kvs->tail->next = key_pair; + kvs->tail = key_pair; +#endif } fn_exit: diff --git a/src/pm/hydra/pm/pmiserv/common.h b/src/pm/hydra/pm/pmiserv/common.h index 5423fb9dd8c..77653683029 100644 --- a/src/pm/hydra/pm/pmiserv/common.h +++ b/src/pm/hydra/pm/pmiserv/common.h @@ -26,6 +26,7 @@ struct HYD_pmcd_pmi_kvs_pair { struct HYD_pmcd_pmi_kvs { char kvsname[PMI_MAXKVSLEN]; /* Name of this kvs */ struct HYD_pmcd_pmi_kvs_pair *key_pair; + struct HYD_pmcd_pmi_kvs_pair *tail; }; struct HYD_pmcd_hdr { From 0176491c0c6ed1dab3eab3b29f5adcb0547101ae Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Fri, 20 Jan 2017 16:34:27 +0900 Subject: [PATCH 10/14] pm/hydra: Add hash table for KVS get cache Signed-off-by: Pavan Balaji --- src/pm/hydra/pm/pmiserv/pmip_pmi_v1.c | 61 ++++++++++++++------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/pm/hydra/pm/pmiserv/pmip_pmi_v1.c b/src/pm/hydra/pm/pmiserv/pmip_pmi_v1.c index a18806d834e..c605b84b956 100644 --- a/src/pm/hydra/pm/pmiserv/pmip_pmi_v1.c +++ b/src/pm/hydra/pm/pmiserv/pmip_pmi_v1.c @@ -9,6 +9,7 @@ #include "bsci.h" #include "demux.h" #include "topo.h" +#include "mpl_uthash.h" #define debug(...) \ { \ @@ -28,11 +29,15 @@ static struct { int keyval_len; } cache_put; -static struct { - char **key; - char **val; - int keyval_len; -} cache_get; +struct cache_elem { + char *key; + char *val; + MPL_UT_hash_handle hh; +}; + +static struct cache_elem *cache_get = NULL, *hash_get = NULL; + +static int num_elems = 0; static HYD_status send_cmd_upstream(const char *start, int fd, int num_args, char *args[]) { @@ -173,7 +178,6 @@ static HYD_status fn_init(int fd, char *args[]) for (i = 0; i < CACHE_PUT_KEYVAL_MAXLEN + 1; i++) cache_put.keyval[i] = NULL; cache_put.keyval_len = 0; - cache_get.keyval_len = 0; global_init = 0; } @@ -381,6 +385,7 @@ static HYD_status fn_get(int fd, char *args[]) char *cmd, *key, *val; struct HYD_pmcd_token *tokens; int token_count, i; + struct cache_elem *found = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); @@ -405,19 +410,12 @@ static HYD_status fn_get(int fd, char *args[]) MPL_free(cmd); } else { - val = NULL; - for (i = 0; i < cache_get.keyval_len; i++) { - if (!strcmp(cache_get.key[i], key)) { - val = cache_get.val[i]; - break; - } - } - - if (val) { + MPL_HASH_FIND_STR(hash_get, key, found); + if (found) { HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, MPL_strdup("cmd=get_result rc="), status); HYD_STRING_STASH(stash, MPL_strdup("0 msg=success value="), status); - HYD_STRING_STASH(stash, MPL_strdup(val), status); + HYD_STRING_STASH(stash, MPL_strdup(found->val), status); HYD_STRING_STASH(stash, MPL_strdup("\n"), status); HYD_STRING_SPIT(stash, cmd, status); @@ -502,16 +500,20 @@ static HYD_status fn_keyval_cache(int fd, char *args[]) /* allocate a larger space for the cached keyvals, copy over the * older keyvals and add the new ones in */ - HYDU_REALLOC_OR_JUMP(cache_get.key, char **, - (cache_get.keyval_len + token_count) * sizeof(char *), status); - HYDU_REALLOC_OR_JUMP(cache_get.val, char **, - (cache_get.keyval_len + token_count) * sizeof(char *), status); - - for (i = 0; i < token_count; i++) { - cache_get.key[cache_get.keyval_len + i] = MPL_strdup(tokens[i].key); - cache_get.val[cache_get.keyval_len + i] = MPL_strdup(tokens[i].val); + MPL_HASH_CLEAR(hh, hash_get); + HYDU_REALLOC_OR_JUMP(cache_get, struct cache_elem *, + (sizeof(struct cache_elem) * (num_elems + token_count)), status); + for (i = 0; i < num_elems; i++) { + struct cache_elem *elem = cache_get + i; + MPL_HASH_ADD_STR(hash_get, key, elem); + } + for (; i < num_elems + token_count; i++) { + struct cache_elem *elem = cache_get + i; + elem->key = MPL_strdup(tokens[i - num_elems].key); + elem->val = MPL_strdup(tokens[i - num_elems].val); + MPL_HASH_ADD_STR(hash_get, key, elem); } - cache_get.keyval_len += token_count; + num_elems += token_count; fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); @@ -595,12 +597,11 @@ static HYD_status fn_finalize(int fd, char *args[]) if (finalize_count == HYD_pmcd_pmip.local.proxy_process_count) { /* All processes have finalized */ - for (i = 0; i < cache_get.keyval_len; i++) { - MPL_free(cache_get.key[i]); - MPL_free(cache_get.val[i]); + for (i = 0; i < num_elems; i++) { + MPL_free((cache_get + i)->key); + MPL_free((cache_get + i)->val); } - MPL_free(cache_get.key); - MPL_free(cache_get.val); + MPL_free(cache_get); } fn_exit: From baf40040b34f492fcb7f3d60c4a80853a146e0d5 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Mon, 30 Jan 2017 10:31:31 -0600 Subject: [PATCH 11/14] pm/hydra: Store proxy count in process group structure Not having access to the number of proxies in a process group meant we had to count them by traversing the list at certain times. This made for an inefficient barrier algorithm that counted the number of proxies each time it received a proxy command. Instead, we store the number of proxies in the pg struct so we can compare it against the current barrier count. Signed-off-by: Pavan Balaji --- src/pm/hydra/include/hydra.h | 1 + src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c | 7 +------ src/pm/hydra/utils/alloc/alloc.c | 2 ++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pm/hydra/include/hydra.h b/src/pm/hydra/include/hydra.h index 13735e7f8d1..4176c821679 100644 --- a/src/pm/hydra/include/hydra.h +++ b/src/pm/hydra/include/hydra.h @@ -315,6 +315,7 @@ struct HYD_exec { struct HYD_pg { int pgid; struct HYD_proxy *proxy_list; + int proxy_count; int pg_process_count; int barrier_count; diff --git a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c index 4c41999d25b..3e6e8111d74 100644 --- a/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c +++ b/src/pm/hydra/pm/pmiserv/pmiserv_pmi_v1.c @@ -136,7 +136,6 @@ static HYD_status bcast_keyvals(int fd, int pid) static HYD_status fn_barrier_in(int fd, int pid, int pgid, char *args[]) { struct HYD_proxy *proxy, *tproxy; - int proxy_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); @@ -144,12 +143,8 @@ static HYD_status fn_barrier_in(int fd, int pid, int pgid, char *args[]) proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); - proxy_count = 0; - for (tproxy = proxy->pg->proxy_list; tproxy; tproxy = tproxy->next) - proxy_count++; - proxy->pg->barrier_count++; - if (proxy->pg->barrier_count == proxy_count) { + if (proxy->pg->barrier_count == proxy->pg->proxy_count) { proxy->pg->barrier_count = 0; bcast_keyvals(fd, pid); diff --git a/src/pm/hydra/utils/alloc/alloc.c b/src/pm/hydra/utils/alloc/alloc.c index d204ceb9402..1338c12ae41 100644 --- a/src/pm/hydra/utils/alloc/alloc.c +++ b/src/pm/hydra/utils/alloc/alloc.c @@ -136,6 +136,7 @@ void HYDU_init_pg(struct HYD_pg *pg, int pgid) { pg->pgid = pgid; pg->proxy_list = NULL; + pg->proxy_count = 0; pg->pg_process_count = 0; pg->barrier_count = 0; pg->spawner_pg = NULL; @@ -480,6 +481,7 @@ HYD_status HYDU_create_proxy_list(struct HYD_exec *exec_list, struct HYD_node *n for (proxy = pg->proxy_list, i = 0; proxy; proxy = proxy->next, i++) proxy->proxy_id = i; + pg->proxy_count = i; fn_exit: HYDU_FUNC_EXIT(); From 721085b0487edfc7545596740ee5dadfe47bed46 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Mon, 30 Jan 2017 12:02:49 -0600 Subject: [PATCH 12/14] pm/hydra: Add proxy hash for fast lookup Use a hash based lookup for proxies based on the control fd. Avoid a list traversal, which will be costly when the number of proxies is high. Signed-off-by: Pavan Balaji --- src/pm/hydra/include/hydra.h | 3 +++ src/pm/hydra/include/hydra_server.h | 3 +++ src/pm/hydra/pm/pmiserv/pmiserv_cb.c | 2 ++ src/pm/hydra/pm/pmiserv/pmiserv_pmi.c | 8 ++------ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/pm/hydra/include/hydra.h b/src/pm/hydra/include/hydra.h index 4176c821679..4b56e685b1f 100644 --- a/src/pm/hydra/include/hydra.h +++ b/src/pm/hydra/include/hydra.h @@ -15,6 +15,7 @@ #include "hydra_config.h" #include "mpl.h" +#include "mpl_uthash.h" extern char *HYD_dbg_prefix; @@ -370,6 +371,8 @@ struct HYD_proxy { int control_fd; struct HYD_proxy *next; + + MPL_UT_hash_handle hh; }; /* Global user parameters */ diff --git a/src/pm/hydra/include/hydra_server.h b/src/pm/hydra/include/hydra_server.h index 2299d2b8166..4965a63d74f 100644 --- a/src/pm/hydra/include/hydra_server.h +++ b/src/pm/hydra/include/hydra_server.h @@ -38,6 +38,9 @@ struct HYD_server_info_s { /* Process groups */ struct HYD_pg pg_list; + /* Hash for fast proxy lookup */ + struct HYD_proxy *proxy_hash; + /* Cleanup */ int cmd_pipe[2]; diff --git a/src/pm/hydra/pm/pmiserv/pmiserv_cb.c b/src/pm/hydra/pm/pmiserv/pmiserv_cb.c index acc8007bf91..594cc042919 100644 --- a/src/pm/hydra/pm/pmiserv/pmiserv_cb.c +++ b/src/pm/hydra/pm/pmiserv/pmiserv_cb.c @@ -84,6 +84,7 @@ static HYD_status cleanup_proxy(struct HYD_proxy *proxy) status = HYDT_dmx_deregister_fd(proxy->control_fd); HYDU_ERR_POP(status, "error deregistering fd\n"); close(proxy->control_fd); + MPL_HASH_DEL(HYD_server_info.proxy_hash, proxy); /* Reset the control fd, so when the fd is reused, we don't * find the wrong proxy */ @@ -470,6 +471,7 @@ HYD_status HYD_pmcd_pmiserv_proxy_init_cb(int fd, HYD_event_t events, void *user /* This will be the control socket for this proxy */ proxy->control_fd = fd; + MPL_HASH_ADD_INT(HYD_server_info.proxy_hash, control_fd, proxy); /* Send out the executable information */ status = send_exec_info(proxy); diff --git a/src/pm/hydra/pm/pmiserv/pmiserv_pmi.c b/src/pm/hydra/pm/pmiserv/pmiserv_pmi.c index 372e4289a5a..75fcf5b3978 100644 --- a/src/pm/hydra/pm/pmiserv/pmiserv_pmi.c +++ b/src/pm/hydra/pm/pmiserv/pmiserv_pmi.c @@ -14,15 +14,11 @@ struct HYD_pmcd_pmi_publish *HYD_pmcd_pmi_publish_list = NULL; struct HYD_proxy *HYD_pmcd_pmi_find_proxy(int fd) { - struct HYD_pg *pg; struct HYD_proxy *proxy; - for (pg = &HYD_server_info.pg_list; pg; pg = pg->next) - for (proxy = pg->proxy_list; proxy; proxy = proxy->next) - if (proxy->control_fd == fd) - return proxy; + MPL_HASH_FIND_INT(HYD_server_info.proxy_hash, &fd, proxy); - return NULL; + return proxy; } HYD_status HYD_pmcd_pmi_finalize(void) From 062efb43e65c0b073a54764121007cbb565896b5 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Wed, 1 Feb 2017 08:18:21 -0500 Subject: [PATCH 13/14] CH4/OFI: Use shm barrier for local sync in init When we are coordinating the shared business cards, there is no need for global synchronization. A shm barrier for local synchronization is sufficient. Signed-off-by: Pavan Balaji --- src/mpid/ch4/netmod/ofi/ofi_init.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.h b/src/mpid/ch4/netmod/ofi/ofi_init.h index fd242c4f644..82240d79066 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.h +++ b/src/mpid/ch4/netmod/ofi/ofi_init.h @@ -784,7 +784,9 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, (valS, "OFI", (char *) &table[i * MPIDI_Global.addrnamelen], MPIDI_Global.addrnamelen, &maxlen), buscard_len); } - PMI_Barrier(); + mpi_errno = MPIDU_shm_barrier(barrier, num_local); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); /* -------------------------------- */ /* Table is constructed. Map it */ @@ -797,7 +799,9 @@ static inline int MPIDI_NM_mpi_init_hook(int rank, } MPL_free(mapped_table); - PMI_Barrier(); + mpi_errno = MPIDU_shm_barrier(barrier, num_local); + if (mpi_errno) + MPIR_ERR_POP(mpi_errno); MPIDU_shm_seg_destroy(&memory, num_local); } From d2d4d3b2e843521e862799a5130b3d00997cd62f Mon Sep 17 00:00:00 2001 From: Pavan Balaji Date: Fri, 10 Feb 2017 08:46:58 -0600 Subject: [PATCH 14/14] pm/hydra: Retry timed out socket connections Signed-off-by: Ken Raffenetti --- src/pm/hydra/utils/sock/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pm/hydra/utils/sock/sock.c b/src/pm/hydra/utils/sock/sock.c index fb40caf9ff1..4305642071d 100644 --- a/src/pm/hydra/utils/sock/sock.c +++ b/src/pm/hydra/utils/sock/sock.c @@ -150,7 +150,7 @@ HYD_status HYDU_sock_connect(const char *host, uint16_t port, int *fd, int retri retry_count = 0; do { ret = connect(*fd, (struct sockaddr *) &sa, sizeof(struct sockaddr_in)); - if (ret < 0 && errno == ECONNREFUSED) { + if (ret < 0 && (errno == ECONNREFUSED || errno == ETIMEDOUT)) { /* connection error; increase retry count and delay */ retry_count++; if (retry_count > retries)