Skip to content

Commit

Permalink
DAOS-13151 client: cache and reuse the attach info for the default sy…
Browse files Browse the repository at this point in the history
…stem (#14172)

improve the daos_init() and pool_connect() process to reuse the attach info
instead of doing agent drpc upcalls multiple times.

Required-githooks: true

Change-Id: I404ca66ba986f9e0284738875347da5e057a534a
Signed-off-by: Mohamad Chaarawi <[email protected]>
  • Loading branch information
mchaarawi authored and mjmac committed May 6, 2024
1 parent a518c0e commit 7bad6f7
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 76 deletions.
14 changes: 11 additions & 3 deletions src/client/api/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ daos_init(void)
{
struct d_fault_attr_t *d_fault_init;
struct d_fault_attr_t *d_fault_mem = NULL;
struct d_fault_attr_t d_fault_mem_saved;
struct d_fault_attr_t d_fault_mem_saved;
int rc;

D_MUTEX_LOCK(&module_lock);
Expand Down Expand Up @@ -188,19 +188,24 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_agent, rc);

/** get and cache attach info of default system */
rc = dc_mgmt_cache_attach_info(NULL);
if (rc != 0)
D_GOTO(out_job, rc);

/**
* get CaRT configuration (see mgmtModule.handleGetAttachInfo for the
* handling of NULL system names)
*/
rc = dc_mgmt_net_cfg(NULL);
if (rc != 0)
D_GOTO(out_job, rc);
D_GOTO(out_attach, rc);

/** set up event queue */
rc = daos_eq_lib_init();
if (rc != 0) {
D_ERROR("failed to initialize eq_lib: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_job, rc);
D_GOTO(out_attach, rc);
}

/**
Expand Down Expand Up @@ -256,6 +261,8 @@ daos_init(void)
pl_fini();
out_eq:
daos_eq_lib_fini();
out_attach:
dc_mgmt_drop_attach_info();
out_job:
dc_job_fini();
out_agent:
Expand Down Expand Up @@ -313,6 +320,7 @@ daos_fini(void)
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_mgmt_drop_attach_info();
dc_agent_fini();
dc_job_fini();

Expand Down
9 changes: 3 additions & 6 deletions src/engine/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ static char modules[MAX_MODULE_OPTIONS + 1];
static unsigned int nr_threads;

/** DAOS system name (corresponds to crt group ID) */
char daos_sysname[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME;
char *daos_sysname = DAOS_DEFAULT_SYS_NAME;

/** Storage node hostname */
char dss_hostname[DSS_HOSTNAME_MAX_LEN];
Expand Down Expand Up @@ -1031,16 +1031,13 @@ parse(int argc, char **argv)
rc = arg_strtoul(optarg, &dss_core_offset, "\"-f\"");
break;
case 'g': {
size_t sys_len = strnlen(optarg, DAOS_SYS_NAME_MAX + 1);

if (sys_len > DAOS_SYS_NAME_MAX) {
if (strnlen(optarg, DAOS_SYS_NAME_MAX + 1) > DAOS_SYS_NAME_MAX) {
printf("DAOS system name must be at most %d bytes\n",
DAOS_SYS_NAME_MAX);
rc = -DER_INVAL;
break;
}
memcpy(daos_sysname, optarg, sys_len);
daos_sysname[sys_len] = '\0';
daos_sysname = optarg;
break;
}
case 's':
Expand Down
27 changes: 14 additions & 13 deletions src/include/daos/mgmt.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,20 @@ int dc_mgmt_pool_find(struct dc_mgmt_sys *sys, const char *label,
int dc_mgmt_notify_pool_connect(struct dc_pool *pool);
int dc_mgmt_notify_pool_disconnect(struct dc_pool *pool);
int dc_mgmt_notify_exit(void);
int dc_mgmt_net_get_num_srv_ranks(void);

int
dc_mgmt_net_get_num_srv_ranks(void);
int dc_mgmt_get_sys_info(const char *sys, struct daos_sys_info **info);
void dc_mgmt_put_sys_info(struct daos_sys_info *info);

void
dc_mgmt_put_sys_info(struct daos_sys_info *info);
int
dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid);

int dc_get_attach_info(const char *name, bool all_ranks,
struct dc_mgmt_sys_info *info,
Mgmt__GetAttachInfoResp **respp);

void dc_put_attach_info(struct dc_mgmt_sys_info *info,
Mgmt__GetAttachInfoResp *resp);

dc_get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info,
Mgmt__GetAttachInfoResp **respp);
void
dc_put_attach_info(struct dc_mgmt_sys_info *info, Mgmt__GetAttachInfoResp *resp);
int
dc_mgmt_cache_attach_info(const char *name);
void
dc_mgmt_drop_attach_info(void);
int
dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid);
#endif
2 changes: 1 addition & 1 deletion src/include/daos_srv/daos_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#define ADDR_STR_MAX_LEN 128

/** DAOS system name (corresponds to crt group ID) */
extern char daos_sysname[];
extern char *daos_sysname;

/** number of target (XS set) per engine */
extern unsigned int dss_tgt_nr;
Expand Down
2 changes: 1 addition & 1 deletion src/mgmt/cli_debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dc_debug_set_params(tse_task_t *task)
int rc;

args = dc_task_get_args(task);
rc = dc_mgmt_sys_attach(args->grp, &cp_arg.sys);
rc = dc_mgmt_sys_attach(args->grp, &cp_arg.sys);
if (rc != 0) {
D_ERROR("failed to attach to grp %s, rc "DF_RC"\n", args->grp,
DP_RC(rc));
Expand Down
111 changes: 59 additions & 52 deletions src/mgmt/cli_mgmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@

char agent_sys_name[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME;

static struct dc_mgmt_sys_info info_g;
static Mgmt__GetAttachInfoResp *resp_g;

int
dc_cp(tse_task_t *task, void *data)
{
Expand Down Expand Up @@ -224,6 +227,12 @@ dc_put_attach_info(struct dc_mgmt_sys_info *info, Mgmt__GetAttachInfoResp *resp)
return put_attach_info(info, resp);
}

void
dc_mgmt_drop_attach_info(void)
{
return put_attach_info(&info_g, resp_g);
}

/*
* Get the attach info (i.e., rank URIs) for name. To avoid duplicating the
* rank URIs, we return the GetAttachInfo response directly. Callers are
Expand All @@ -235,9 +244,9 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info,
{
struct drpc_alloc alloc = PROTO_ALLOCATOR_INIT(alloc);
struct drpc *ctx;
Mgmt__GetAttachInfoReq req = MGMT__GET_ATTACH_INFO_REQ__INIT;
Mgmt__GetAttachInfoResp *resp;
Mgmt__GetAttachInfoReq req = MGMT__GET_ATTACH_INFO_REQ__INIT;
uint8_t *reqb;
Mgmt__GetAttachInfoResp *resp;
size_t reqb_size;
Drpc__Call *dreq;
Drpc__Response *dresp;
Expand Down Expand Up @@ -286,8 +295,7 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info,
rc = -DER_MISC;
goto out_dresp;
}
resp = mgmt__get_attach_info_resp__unpack(&alloc.alloc, dresp->body.len,
dresp->body.data);
resp = mgmt__get_attach_info_resp__unpack(&alloc.alloc, dresp->body.len, dresp->body.data);
if (alloc.oom)
D_GOTO(out_dresp, rc = -DER_NOMEM);
if (resp == NULL) {
Expand Down Expand Up @@ -331,12 +339,20 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info,
}

int
dc_get_attach_info(const char *name, bool all_ranks,
struct dc_mgmt_sys_info *info,
Mgmt__GetAttachInfoResp **respp) {
dc_get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info,
Mgmt__GetAttachInfoResp **respp)
{
return get_attach_info(name, all_ranks, info, respp);
}

int
dc_mgmt_cache_attach_info(const char *name)
{
if (name != NULL && strcmp(name, agent_sys_name) != 0)
return -DER_INVAL;
return get_attach_info(name, true, &info_g, &resp_g);
}

static void
free_rank_uris(struct daos_rank_uri *uris, uint32_t nr_uris)
{
Expand Down Expand Up @@ -517,19 +533,14 @@ print_mgmt_net_env()
*/
int dc_mgmt_net_cfg(const char *name)
{
int rc;
char buf[SYS_INFO_BUF_SIZE];
char *crt_timeout;
char *ofi_interface;
char *ofi_domain;
char *cli_srx_set;
struct dc_mgmt_sys_info info;
Mgmt__GetAttachInfoResp *resp;

/* Query the agent for the CaRT network configuration parameters */
rc = get_attach_info(name, true /* all_ranks */, &info, &resp);
if (rc != 0)
return rc;
int rc;
char buf[SYS_INFO_BUF_SIZE];
char *crt_timeout;
char *ofi_interface;
char *ofi_domain;
char *cli_srx_set;
struct dc_mgmt_sys_info *info = &info_g;
Mgmt__GetAttachInfoResp *resp = resp_g;

if (resp->client_net_hint != NULL && resp->client_net_hint->n_env_vars > 0) {
int i;
Expand Down Expand Up @@ -559,18 +570,18 @@ int dc_mgmt_net_cfg(const char *name)
g_num_serv_ranks = resp->n_rank_uris;
D_INFO("Setting number of server ranks to %d\n", g_num_serv_ranks);
/* These two are always set */
rc = d_setenv("CRT_PHY_ADDR_STR", info.provider, 1);
rc = d_setenv("CRT_PHY_ADDR_STR", info->provider, 1);
if (rc != 0)
D_GOTO(cleanup, rc = d_errno2der(errno));

sprintf(buf, "%d", info.crt_ctx_share_addr);
sprintf(buf, "%d", info->crt_ctx_share_addr);
rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1);
if (rc != 0)
D_GOTO(cleanup, rc = d_errno2der(errno));

/* If the server has set this, the client must use the same value. */
if (info.srv_srx_set != -1) {
sprintf(buf, "%d", info.srv_srx_set);
if (info->srv_srx_set != -1) {
sprintf(buf, "%d", info->srv_srx_set);
rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1);
if (rc != 0)
D_GOTO(cleanup, rc = d_errno2der(errno));
Expand All @@ -590,7 +601,7 @@ int dc_mgmt_net_cfg(const char *name)
/* Allow client env overrides for these three */
d_agetenv_str(&crt_timeout, "CRT_TIMEOUT");
if (!crt_timeout) {
sprintf(buf, "%d", info.crt_timeout);
sprintf(buf, "%d", info->crt_timeout);
rc = d_setenv("CRT_TIMEOUT", buf, 1);
if (rc != 0)
D_GOTO(cleanup, rc = d_errno2der(errno));
Expand All @@ -603,7 +614,7 @@ int dc_mgmt_net_cfg(const char *name)
d_agetenv_str(&ofi_interface, "OFI_INTERFACE");
d_agetenv_str(&ofi_domain, "OFI_DOMAIN");
if (!ofi_interface) {
rc = d_setenv("OFI_INTERFACE", info.interface, 1);
rc = d_setenv("OFI_INTERFACE", info->interface, 1);
if (rc != 0) {
d_freeenv_str(&ofi_domain);
D_GOTO(cleanup, rc = d_errno2der(errno));
Expand All @@ -617,7 +628,7 @@ int dc_mgmt_net_cfg(const char *name)
D_WARN("Ignoring OFI_DOMAIN '%s' because OFI_INTERFACE is not set; using "
"automatic configuration instead\n", ofi_domain);

rc = d_setenv("OFI_DOMAIN", info.domain, 1);
rc = d_setenv("OFI_DOMAIN", info->domain, 1);
if (rc != 0) {
d_freeenv_str(&ofi_domain);
D_GOTO(cleanup, rc = d_errno2der(errno));
Expand All @@ -635,39 +646,24 @@ int dc_mgmt_net_cfg(const char *name)
print_mgmt_net_env();

cleanup:
put_attach_info(&info, resp);

return rc;
}

int dc_mgmt_net_cfg_check(const char *name)
{
int rc;
char *cli_srx_set;
struct dc_mgmt_sys_info info;
Mgmt__GetAttachInfoResp *resp;

/* Query the agent for the CaRT network configuration parameters */
rc = get_attach_info(name, true /* all_ranks */, &info, &resp);
if (rc != 0)
return rc;

/* Client may not set it if the server hasn't. */
if (info.srv_srx_set == -1) {
if (info_g.srv_srx_set == -1) {
d_agetenv_str(&cli_srx_set, "FI_OFI_RXM_USE_SRX");
if (cli_srx_set) {
D_ERROR("Client set FI_OFI_RXM_USE_SRX to %s, "
"but server is unset!\n", cli_srx_set);
d_freeenv_str(&cli_srx_set);
rc = -DER_INVAL;
goto out;
return -DER_INVAL;
}
}
rc = 0;

out:
put_attach_info(&info, resp);
return rc;
return 0;
}

static int send_monitor_request(struct dc_pool *pool, int request_type)
Expand Down Expand Up @@ -860,7 +856,8 @@ attach(const char *name, struct dc_mgmt_sys **sysp)
{
struct dc_mgmt_sys *sys;
crt_group_t *group;
Mgmt__GetAttachInfoResp *resp;
Mgmt__GetAttachInfoResp *resp;
bool need_free_resp = false;
int rc;

D_DEBUG(DB_MGMT, "attaching to system '%s'\n", name);
Expand Down Expand Up @@ -888,21 +885,32 @@ attach(const char *name, struct dc_mgmt_sys **sysp)
goto out;
}

rc = get_attach_info(name, true /* all_ranks */, &sys->sy_info, &resp);
if (rc != 0)
goto err_sys;
if (strcmp(name, agent_sys_name) != 0 || !resp_g) {
need_free_resp = true;
rc = get_attach_info(name, true /* all_ranks */, &sys->sy_info, &resp);
if (rc != 0)
goto err_sys;
} else {
resp = resp_g;
rc = fill_sys_info(resp, &sys->sy_info);
if (rc != 0)
goto err_sys;
}

rc = attach_group(name, &sys->sy_info, resp, &sys->sy_group);
if (rc != 0)
goto err_info;

free_get_attach_info_resp(resp);
if (need_free_resp)
free_get_attach_info_resp(resp);
out:
*sysp = sys;
return 0;

err_info:
put_attach_info(&sys->sy_info, resp);
d_rank_list_free(sys->sy_info.ms_ranks);
if (need_free_resp)
free_get_attach_info_resp(resp);
err_sys:
D_FREE(sys);
err:
Expand Down Expand Up @@ -973,7 +981,6 @@ dc_mgmt_sys_attach(const char *name, struct dc_mgmt_sys **sysp)
{
if (name == NULL)
name = agent_sys_name;

return sys_attach(name, sysp);
}

Expand Down

0 comments on commit 7bad6f7

Please sign in to comment.