From a518c0ecc6df9e726a4ce4b9d1d04ea40d888dd4 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Mon, 22 Apr 2024 11:12:44 -0500 Subject: [PATCH 1/2] DAOS-15655 control: fix support for non default system name (#14170) Signed-off-by: Mohamad Chaarawi --- src/control/cmd/daos/pool.go | 10 ++++------ src/control/cmd/daos/util.go | 5 +---- src/engine/init.c | 17 ++++++++++------- src/include/daos_srv/daos_engine.h | 3 +++ src/mgmt/cli_mgmt.c | 12 +++++++++++- src/pool/srv_cli.c | 2 +- 6 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/control/cmd/daos/pool.go b/src/control/cmd/daos/pool.go index aab4fb9c4a7..e8e7462e302 100644 --- a/src/control/cmd/daos/pool.go +++ b/src/control/cmd/daos/pool.go @@ -15,7 +15,6 @@ import ( "github.com/google/uuid" "github.com/pkg/errors" - "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/cmd/dmg/pretty" "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/proto/convert" @@ -94,12 +93,11 @@ func (cmd *poolBaseCmd) PoolID() ui.LabelOrUUIDFlag { func (cmd *poolBaseCmd) connectPool(flags C.uint) error { sysName := cmd.SysName - if sysName == "" { - sysName = build.DefaultSystemName + var cSysName *C.char + if sysName != "" { + cSysName := C.CString(sysName) + defer freeString(cSysName) } - cSysName := C.CString(sysName) - defer freeString(cSysName) - var rc C.int switch { case cmd.PoolID().HasLabel(): diff --git a/src/control/cmd/daos/util.go b/src/control/cmd/daos/util.go index a3af4c82bcc..e9334c0f110 100644 --- a/src/control/cmd/daos/util.go +++ b/src/control/cmd/daos/util.go @@ -17,7 +17,6 @@ import ( "github.com/google/uuid" "github.com/pkg/errors" - "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/lib/daos" "github.com/daos-stack/daos/src/control/logging" @@ -42,6 +41,7 @@ init_op_vals(struct cmd_args_s *ap) ap->o_op = -1; ap->fs_op = -1; ap->sh_op = -1; + ap->sysname = NULL; } void @@ -184,8 +184,6 @@ func freeCmdArgs(ap *C.struct_cmd_args_s) { return } - freeString(ap.sysname) - C.free(unsafe.Pointer(ap.dfs_path)) C.free(unsafe.Pointer(ap.dfs_prefix)) @@ -213,7 +211,6 @@ func allocCmdArgs(log logging.Logger) (ap *C.struct_cmd_args_s, cleanFn func(), // allocate the struct using C memory to avoid any issues with Go GC ap = (*C.struct_cmd_args_s)(C.calloc(1, C.sizeof_struct_cmd_args_s)) C.init_op_vals(ap) - ap.sysname = C.CString(build.DefaultSystemName) ctx, cancel := context.WithCancel(context.Background()) outStream, outCleanup, err := createWriteStream(ctx, log.Info) diff --git a/src/engine/init.c b/src/engine/init.c index d639456eeb1..90f3986d146 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -43,7 +43,7 @@ static char modules[MAX_MODULE_OPTIONS + 1]; static unsigned int nr_threads; /** DAOS system name (corresponds to crt group ID) */ -static char *daos_sysname = DAOS_DEFAULT_SYS_NAME; +char daos_sysname[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME; /** Storage node hostname */ char dss_hostname[DSS_HOSTNAME_MAX_LEN]; @@ -1030,16 +1030,19 @@ parse(int argc, char **argv) case 'f': rc = arg_strtoul(optarg, &dss_core_offset, "\"-f\""); break; - case 'g': - if (strnlen(optarg, DAOS_SYS_NAME_MAX + 1) > - DAOS_SYS_NAME_MAX) { - printf("DAOS system name must be at most " - "%d bytes\n", DAOS_SYS_NAME_MAX); + case 'g': { + size_t sys_len = strnlen(optarg, DAOS_SYS_NAME_MAX + 1); + + if (sys_len > DAOS_SYS_NAME_MAX) { + printf("DAOS system name must be at most %d bytes\n", + DAOS_SYS_NAME_MAX); rc = -DER_INVAL; break; } - daos_sysname = optarg; + memcpy(daos_sysname, optarg, sys_len); + daos_sysname[sys_len] = '\0'; break; + } case 's': dss_storage_path = optarg; break; diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 116c486e943..5c894d1fd0d 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -31,6 +31,9 @@ /* Standard max length of addresses e.g. URI, PCI */ #define ADDR_STR_MAX_LEN 128 +/** DAOS system name (corresponds to crt group ID) */ +extern char daos_sysname[]; + /** number of target (XS set) per engine */ extern unsigned int dss_tgt_nr; diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index 2eff852eb9f..adac5d6cbf4 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -26,6 +26,8 @@ #include #include +char agent_sys_name[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME; + int dc_cp(tse_task_t *task, void *data) { @@ -304,6 +306,14 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, rc = fill_sys_info(resp, info); if (rc != 0) goto out_resp; + + /** set the agent system info to be the default one */ + if (name == NULL) { + if (copy_str(agent_sys_name, resp->sys)) { + rc = -DER_INVAL; + goto out_resp; + } + } *respp = resp; out_resp: @@ -962,7 +972,7 @@ int dc_mgmt_sys_attach(const char *name, struct dc_mgmt_sys **sysp) { if (name == NULL) - name = DAOS_DEFAULT_SYS_NAME; + name = agent_sys_name; return sys_attach(name, sysp); } diff --git a/src/pool/srv_cli.c b/src/pool/srv_cli.c index dbbaea834cf..995e16728b0 100644 --- a/src/pool/srv_cli.c +++ b/src/pool/srv_cli.c @@ -62,7 +62,7 @@ dsc_pool_open(uuid_t pool_uuid, uuid_t poh_uuid, unsigned int flags, pool->dp_capas = flags; /** attach to the server group and initialize rsvc_client */ - rc = dc_mgmt_sys_attach(NULL, &pool->dp_sys); + rc = dc_mgmt_sys_attach(daos_sysname, &pool->dp_sys); if (rc != 0) D_GOTO(out, rc); From 7bad6f732d9ff4a96a76dd221fcbe44ac9ff76a6 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Sun, 28 Apr 2024 22:21:51 -0500 Subject: [PATCH 2/2] DAOS-13151 client: cache and reuse the attach info for the default system (#14172) improve the daos_init() and pool_connect() process to reuse the attach info instead of doing agent drpc upcalls multiple times. Required-githooks: true Change-Id: I404ca66ba986f9e0284738875347da5e057a534a Signed-off-by: Mohamad Chaarawi --- src/client/api/init.c | 14 +++- src/engine/init.c | 9 +-- src/include/daos/mgmt.h | 27 +++---- src/include/daos_srv/daos_engine.h | 2 +- src/mgmt/cli_debug.c | 2 +- src/mgmt/cli_mgmt.c | 111 +++++++++++++++-------------- 6 files changed, 89 insertions(+), 76 deletions(-) diff --git a/src/client/api/init.c b/src/client/api/init.c index f574169d8c7..40e7aa5e0d9 100644 --- a/src/client/api/init.c +++ b/src/client/api/init.c @@ -139,7 +139,7 @@ daos_init(void) { struct d_fault_attr_t *d_fault_init; struct d_fault_attr_t *d_fault_mem = NULL; - struct d_fault_attr_t d_fault_mem_saved; + struct d_fault_attr_t d_fault_mem_saved; int rc; D_MUTEX_LOCK(&module_lock); @@ -188,19 +188,24 @@ daos_init(void) if (rc != 0) D_GOTO(out_agent, rc); + /** get and cache attach info of default system */ + rc = dc_mgmt_cache_attach_info(NULL); + if (rc != 0) + D_GOTO(out_job, rc); + /** * get CaRT configuration (see mgmtModule.handleGetAttachInfo for the * handling of NULL system names) */ rc = dc_mgmt_net_cfg(NULL); if (rc != 0) - D_GOTO(out_job, rc); + D_GOTO(out_attach, rc); /** set up event queue */ rc = daos_eq_lib_init(); if (rc != 0) { D_ERROR("failed to initialize eq_lib: "DF_RC"\n", DP_RC(rc)); - D_GOTO(out_job, rc); + D_GOTO(out_attach, rc); } /** @@ -256,6 +261,8 @@ daos_init(void) pl_fini(); out_eq: daos_eq_lib_fini(); +out_attach: + dc_mgmt_drop_attach_info(); out_job: dc_job_fini(); out_agent: @@ -313,6 +320,7 @@ daos_fini(void) DF_RC"\n", DP_RC(rc)); dc_tm_fini(); + dc_mgmt_drop_attach_info(); dc_agent_fini(); dc_job_fini(); diff --git a/src/engine/init.c b/src/engine/init.c index 90f3986d146..a72130a44a5 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -43,7 +43,7 @@ static char modules[MAX_MODULE_OPTIONS + 1]; static unsigned int nr_threads; /** DAOS system name (corresponds to crt group ID) */ -char daos_sysname[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME; +char *daos_sysname = DAOS_DEFAULT_SYS_NAME; /** Storage node hostname */ char dss_hostname[DSS_HOSTNAME_MAX_LEN]; @@ -1031,16 +1031,13 @@ parse(int argc, char **argv) rc = arg_strtoul(optarg, &dss_core_offset, "\"-f\""); break; case 'g': { - size_t sys_len = strnlen(optarg, DAOS_SYS_NAME_MAX + 1); - - if (sys_len > DAOS_SYS_NAME_MAX) { + if (strnlen(optarg, DAOS_SYS_NAME_MAX + 1) > DAOS_SYS_NAME_MAX) { printf("DAOS system name must be at most %d bytes\n", DAOS_SYS_NAME_MAX); rc = -DER_INVAL; break; } - memcpy(daos_sysname, optarg, sys_len); - daos_sysname[sys_len] = '\0'; + daos_sysname = optarg; break; } case 's': diff --git a/src/include/daos/mgmt.h b/src/include/daos/mgmt.h index eee326c761b..8572b18b8b2 100644 --- a/src/include/daos/mgmt.h +++ b/src/include/daos/mgmt.h @@ -67,19 +67,20 @@ int dc_mgmt_pool_find(struct dc_mgmt_sys *sys, const char *label, int dc_mgmt_notify_pool_connect(struct dc_pool *pool); int dc_mgmt_notify_pool_disconnect(struct dc_pool *pool); int dc_mgmt_notify_exit(void); -int dc_mgmt_net_get_num_srv_ranks(void); - +int + dc_mgmt_net_get_num_srv_ranks(void); int dc_mgmt_get_sys_info(const char *sys, struct daos_sys_info **info); -void dc_mgmt_put_sys_info(struct daos_sys_info *info); - +void +dc_mgmt_put_sys_info(struct daos_sys_info *info); int - dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid); - -int dc_get_attach_info(const char *name, bool all_ranks, - struct dc_mgmt_sys_info *info, - Mgmt__GetAttachInfoResp **respp); - -void dc_put_attach_info(struct dc_mgmt_sys_info *info, - Mgmt__GetAttachInfoResp *resp); - +dc_get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, + Mgmt__GetAttachInfoResp **respp); +void +dc_put_attach_info(struct dc_mgmt_sys_info *info, Mgmt__GetAttachInfoResp *resp); +int +dc_mgmt_cache_attach_info(const char *name); +void +dc_mgmt_drop_attach_info(void); +int +dc_mgmt_tm_register(const char *sys, const char *jobid, key_t shm_key, uid_t *owner_uid); #endif diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index 5c894d1fd0d..a7588e36e3f 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -32,7 +32,7 @@ #define ADDR_STR_MAX_LEN 128 /** DAOS system name (corresponds to crt group ID) */ -extern char daos_sysname[]; +extern char *daos_sysname; /** number of target (XS set) per engine */ extern unsigned int dss_tgt_nr; diff --git a/src/mgmt/cli_debug.c b/src/mgmt/cli_debug.c index 3a8e0a14490..fa44dd27684 100644 --- a/src/mgmt/cli_debug.c +++ b/src/mgmt/cli_debug.c @@ -33,7 +33,7 @@ dc_debug_set_params(tse_task_t *task) int rc; args = dc_task_get_args(task); - rc = dc_mgmt_sys_attach(args->grp, &cp_arg.sys); + rc = dc_mgmt_sys_attach(args->grp, &cp_arg.sys); if (rc != 0) { D_ERROR("failed to attach to grp %s, rc "DF_RC"\n", args->grp, DP_RC(rc)); diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index adac5d6cbf4..bc5da348369 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -28,6 +28,9 @@ char agent_sys_name[DAOS_SYS_NAME_MAX + 1] = DAOS_DEFAULT_SYS_NAME; +static struct dc_mgmt_sys_info info_g; +static Mgmt__GetAttachInfoResp *resp_g; + int dc_cp(tse_task_t *task, void *data) { @@ -224,6 +227,12 @@ dc_put_attach_info(struct dc_mgmt_sys_info *info, Mgmt__GetAttachInfoResp *resp) return put_attach_info(info, resp); } +void +dc_mgmt_drop_attach_info(void) +{ + return put_attach_info(&info_g, resp_g); +} + /* * Get the attach info (i.e., rank URIs) for name. To avoid duplicating the * rank URIs, we return the GetAttachInfo response directly. Callers are @@ -235,9 +244,9 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, { struct drpc_alloc alloc = PROTO_ALLOCATOR_INIT(alloc); struct drpc *ctx; - Mgmt__GetAttachInfoReq req = MGMT__GET_ATTACH_INFO_REQ__INIT; - Mgmt__GetAttachInfoResp *resp; + Mgmt__GetAttachInfoReq req = MGMT__GET_ATTACH_INFO_REQ__INIT; uint8_t *reqb; + Mgmt__GetAttachInfoResp *resp; size_t reqb_size; Drpc__Call *dreq; Drpc__Response *dresp; @@ -286,8 +295,7 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, rc = -DER_MISC; goto out_dresp; } - resp = mgmt__get_attach_info_resp__unpack(&alloc.alloc, dresp->body.len, - dresp->body.data); + resp = mgmt__get_attach_info_resp__unpack(&alloc.alloc, dresp->body.len, dresp->body.data); if (alloc.oom) D_GOTO(out_dresp, rc = -DER_NOMEM); if (resp == NULL) { @@ -331,12 +339,20 @@ get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, } int -dc_get_attach_info(const char *name, bool all_ranks, - struct dc_mgmt_sys_info *info, - Mgmt__GetAttachInfoResp **respp) { +dc_get_attach_info(const char *name, bool all_ranks, struct dc_mgmt_sys_info *info, + Mgmt__GetAttachInfoResp **respp) +{ return get_attach_info(name, all_ranks, info, respp); } +int +dc_mgmt_cache_attach_info(const char *name) +{ + if (name != NULL && strcmp(name, agent_sys_name) != 0) + return -DER_INVAL; + return get_attach_info(name, true, &info_g, &resp_g); +} + static void free_rank_uris(struct daos_rank_uri *uris, uint32_t nr_uris) { @@ -517,19 +533,14 @@ print_mgmt_net_env() */ int dc_mgmt_net_cfg(const char *name) { - int rc; - char buf[SYS_INFO_BUF_SIZE]; - char *crt_timeout; - char *ofi_interface; - char *ofi_domain; - char *cli_srx_set; - struct dc_mgmt_sys_info info; - Mgmt__GetAttachInfoResp *resp; - - /* Query the agent for the CaRT network configuration parameters */ - rc = get_attach_info(name, true /* all_ranks */, &info, &resp); - if (rc != 0) - return rc; + int rc; + char buf[SYS_INFO_BUF_SIZE]; + char *crt_timeout; + char *ofi_interface; + char *ofi_domain; + char *cli_srx_set; + struct dc_mgmt_sys_info *info = &info_g; + Mgmt__GetAttachInfoResp *resp = resp_g; if (resp->client_net_hint != NULL && resp->client_net_hint->n_env_vars > 0) { int i; @@ -559,18 +570,18 @@ int dc_mgmt_net_cfg(const char *name) g_num_serv_ranks = resp->n_rank_uris; D_INFO("Setting number of server ranks to %d\n", g_num_serv_ranks); /* These two are always set */ - rc = d_setenv("CRT_PHY_ADDR_STR", info.provider, 1); + rc = d_setenv("CRT_PHY_ADDR_STR", info->provider, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); - sprintf(buf, "%d", info.crt_ctx_share_addr); + sprintf(buf, "%d", info->crt_ctx_share_addr); rc = d_setenv("CRT_CTX_SHARE_ADDR", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); /* If the server has set this, the client must use the same value. */ - if (info.srv_srx_set != -1) { - sprintf(buf, "%d", info.srv_srx_set); + if (info->srv_srx_set != -1) { + sprintf(buf, "%d", info->srv_srx_set); rc = d_setenv("FI_OFI_RXM_USE_SRX", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -590,7 +601,7 @@ int dc_mgmt_net_cfg(const char *name) /* Allow client env overrides for these three */ d_agetenv_str(&crt_timeout, "CRT_TIMEOUT"); if (!crt_timeout) { - sprintf(buf, "%d", info.crt_timeout); + sprintf(buf, "%d", info->crt_timeout); rc = d_setenv("CRT_TIMEOUT", buf, 1); if (rc != 0) D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -603,7 +614,7 @@ int dc_mgmt_net_cfg(const char *name) d_agetenv_str(&ofi_interface, "OFI_INTERFACE"); d_agetenv_str(&ofi_domain, "OFI_DOMAIN"); if (!ofi_interface) { - rc = d_setenv("OFI_INTERFACE", info.interface, 1); + rc = d_setenv("OFI_INTERFACE", info->interface, 1); if (rc != 0) { d_freeenv_str(&ofi_domain); D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -617,7 +628,7 @@ int dc_mgmt_net_cfg(const char *name) D_WARN("Ignoring OFI_DOMAIN '%s' because OFI_INTERFACE is not set; using " "automatic configuration instead\n", ofi_domain); - rc = d_setenv("OFI_DOMAIN", info.domain, 1); + rc = d_setenv("OFI_DOMAIN", info->domain, 1); if (rc != 0) { d_freeenv_str(&ofi_domain); D_GOTO(cleanup, rc = d_errno2der(errno)); @@ -635,39 +646,24 @@ int dc_mgmt_net_cfg(const char *name) print_mgmt_net_env(); cleanup: - put_attach_info(&info, resp); - return rc; } int dc_mgmt_net_cfg_check(const char *name) { - int rc; char *cli_srx_set; - struct dc_mgmt_sys_info info; - Mgmt__GetAttachInfoResp *resp; - - /* Query the agent for the CaRT network configuration parameters */ - rc = get_attach_info(name, true /* all_ranks */, &info, &resp); - if (rc != 0) - return rc; /* Client may not set it if the server hasn't. */ - if (info.srv_srx_set == -1) { + if (info_g.srv_srx_set == -1) { d_agetenv_str(&cli_srx_set, "FI_OFI_RXM_USE_SRX"); if (cli_srx_set) { D_ERROR("Client set FI_OFI_RXM_USE_SRX to %s, " "but server is unset!\n", cli_srx_set); d_freeenv_str(&cli_srx_set); - rc = -DER_INVAL; - goto out; + return -DER_INVAL; } } - rc = 0; - -out: - put_attach_info(&info, resp); - return rc; + return 0; } static int send_monitor_request(struct dc_pool *pool, int request_type) @@ -860,7 +856,8 @@ attach(const char *name, struct dc_mgmt_sys **sysp) { struct dc_mgmt_sys *sys; crt_group_t *group; - Mgmt__GetAttachInfoResp *resp; + Mgmt__GetAttachInfoResp *resp; + bool need_free_resp = false; int rc; D_DEBUG(DB_MGMT, "attaching to system '%s'\n", name); @@ -888,21 +885,32 @@ attach(const char *name, struct dc_mgmt_sys **sysp) goto out; } - rc = get_attach_info(name, true /* all_ranks */, &sys->sy_info, &resp); - if (rc != 0) - goto err_sys; + if (strcmp(name, agent_sys_name) != 0 || !resp_g) { + need_free_resp = true; + rc = get_attach_info(name, true /* all_ranks */, &sys->sy_info, &resp); + if (rc != 0) + goto err_sys; + } else { + resp = resp_g; + rc = fill_sys_info(resp, &sys->sy_info); + if (rc != 0) + goto err_sys; + } rc = attach_group(name, &sys->sy_info, resp, &sys->sy_group); if (rc != 0) goto err_info; - free_get_attach_info_resp(resp); + if (need_free_resp) + free_get_attach_info_resp(resp); out: *sysp = sys; return 0; err_info: - put_attach_info(&sys->sy_info, resp); + d_rank_list_free(sys->sy_info.ms_ranks); + if (need_free_resp) + free_get_attach_info_resp(resp); err_sys: D_FREE(sys); err: @@ -973,7 +981,6 @@ dc_mgmt_sys_attach(const char *name, struct dc_mgmt_sys **sysp) { if (name == NULL) name = agent_sys_name; - return sys_attach(name, sysp); }