diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 64c99ded9d..3ec2eb7afb 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -200,26 +200,24 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, FI_ASYNC_IOV | FI_RX_CQ_DATA; hints->ep_attr->type = FI_EP_RDM; - // FI_ADDR_CXI is also available - // FI_ADDR_CXI_COMPAT is the one that MPICH chooses - // FI_ADDR_OPX is sneakily used to denote FI_ADDR_CXI_COMPAT - hints->addr_format = FI_ADDR_OPX; - - // The following differ from the SST configuration in ADIOS2 - hints->domain_attr->mr_mode = FI_MR_ENDPOINT; // - hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; // - hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; // - - // Authentication is needed - // TODO: the first ID in SLINGSHOT_SVC_IDS is chosen, but we should rather - // choose the one corresponding with the FABRIC_IFACE - // example: - // SLINGSHOT_SVC_IDS=5,5,5,5 - // SLINGSHOT_VNIS=1310,1271 - // SLINGSHOT_DEVICES=cxi0,cxi1,cxi2,cxi3 - // FABRIC_IFACE=cxi2 (user specified) - if(fabric->cxi_auth_key) + uint32_t fi_version; + if (fabric->cxi_auth_key) { + fi_version = FI_VERSION(1, 11); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; + hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + // Authentication is needed + // TODO: the first ID in SLINGSHOT_SVC_IDS is chosen, but we should + // rather choose the one corresponding with the FABRIC_IFACE + // example: + // SLINGSHOT_SVC_IDS=5,5,5,5 + // SLINGSHOT_VNIS=1310,1271 + // SLINGSHOT_DEVICES=cxi0,cxi1,cxi2,cxi3 + // FABRIC_IFACE=cxi2 (user specified) + hints->ep_attr->auth_key = malloc(sizeof(struct cxi_auth_key)); memcpy(hints->ep_attr->auth_key, fabric->cxi_auth_key, sizeof(struct cxi_auth_key)); @@ -230,6 +228,14 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, sizeof(struct cxi_auth_key)); hints->domain_attr->auth_key_size = sizeof(struct cxi_auth_key); } + else + { + fi_version = FI_VERSION(1, 5); + + hints->domain_attr->mr_mode = FI_MR_BASIC; + hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + } if (Params->DataInterface) { @@ -243,7 +249,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, fabric->info = NULL; pthread_mutex_lock(&fabric_mutex); - fi_getinfo(FI_VERSION(1,11), NULL, NULL, 0, hints, &info); + fi_getinfo(fi_version, NULL, NULL, 0, hints, &info); pthread_mutex_unlock(&fabric_mutex); if (!info) { @@ -269,7 +275,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, } if ((((strcmp(prov_name, "verbs") == 0) && info->src_addr) || (strcmp(prov_name, "gni") == 0) || - (strcmp(prov_name, "psm2") == 0)) && + (strcmp(prov_name, "psm2") == 0) || + (strcmp(prov_name, "cxi") == 0)) && (!useinfo || !ifname || (strcmp(useinfo->domain_attr->name, ifname) != 0))) { @@ -280,7 +287,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, useinfo = info; } else if (((strstr(prov_name, "verbs") && info->src_addr) || - strstr(prov_name, "gni") || strstr(prov_name, "psm2")) && + strstr(prov_name, "gni") || strstr(prov_name, "psm2") || + strstr(prov_name, "cxi")) && !useinfo) { Svcs->verbose(CP_Stream, DPTraceVerbose, @@ -359,16 +367,23 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, fabric->addr_len = info->src_addrlen; /* - * FI_MR_ALLOCATED and FI_MR_ENDPOINT are required for the CXI provider. - * FI_MR_LOCAL and FI_MR_PROV_KEY are for compatibility - * with the rest of the legacy SST-libfabric implementation (where mr_mode - * used to be FI_MR_BASIC which is equivalent to FI_MR_VIRT_ADDR | - * FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL) + * The libfabric data-plane of SST was originally programmed to use + * FI_MR_BASIC as mr_mode, which is equivalent to + * FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL. + * + * However, HPE's CXI provider requires two changes to that: + * (1) It does not support FI_MR_VIRT_ADDR. + * (2) It requires use of FI_MR_ENDPOINT. + * + * So we propagate the bit value currently contained in the mr_mode + * for these flags. */ - info->domain_attr->mr_mode = - FI_MR_ALLOCATED | FI_MR_ENDPOINT | FI_MR_PROV_KEY | FI_MR_LOCAL; + info->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | + FI_MR_LOCAL | + (FI_MR_ENDPOINT & info->domain_attr->mr_mode) | + (FI_MR_VIRT_ADDR & info->domain_attr->mr_mode); - fabric->mr_virt_addr = 0; + fabric->mr_virt_addr = info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 1 : 0; #ifdef SST_HAVE_CRAY_DRC if (strstr(info->fabric_attr->prov_name, "gni") && fabric->auth_key) @@ -2026,15 +2041,36 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, FI_ASYNC_IOV | FI_RX_CQ_DATA; hints->ep_attr->type = FI_EP_RDM; - // FI_ADDR_CXI is also available - // FI_ADDR_CXI_COMPAT is the one that MPICH chooses - // FI_ADDR_OPX is sneakily used to denote FI_ADDR_CXI_COMPAT - hints->addr_format = FI_ADDR_OPX; + char const *vni_env_str = getenv("SLINGSHOT_VNIS"); - // The following differ from the SST configuration in ADIOS2 - hints->domain_attr->mr_mode = FI_MR_ENDPOINT; // - hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; // - hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; // + uint32_t fi_version; + if (vni_env_str) + { + // try fishing for the CXI provider + Svcs->verbose(CP_Stream, DPSummaryVerbose, + "RDMA Dataplane trying to check for an available CXI " + "provider since environment variable SLINGSHOT_VNIS is " + "defined (value: '%s').\n", + vni_env_str); + fi_version = FI_VERSION(1, 11); + + hints->domain_attr->mr_mode = FI_MR_ENDPOINT; + hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; + hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + } + else + { + Svcs->verbose(CP_Stream, DPSummaryVerbose, + "RDMA Dataplane trying to check for an available non-CXI " + "provider since environment variable SLINGSHOT_VNIS is " + "not defined.\n"); + + fi_version = FI_VERSION(1, 5); + + hints->domain_attr->mr_mode = FI_MR_BASIC; + hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + } if (Params->DataInterface) { @@ -2052,7 +2088,7 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, } pthread_mutex_lock(&fabric_mutex); - fi_getinfo(FI_VERSION(1, 11), NULL, NULL, 0, hints, &info); + fi_getinfo(fi_version, NULL, NULL, 0, hints, &info); pthread_mutex_unlock(&fabric_mutex); fi_freeinfo(hints);