Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-8331 client: Add client side metrics #14030

Merged
merged 19 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,15 @@ This file lists the environment variables used in CaRT.
. CRT_TEST_CONT
When set to 1, orterun does not automatically shut down other servers when
one server is shutdown. Used in cart internal testing.

. D_CLIENT_METRICS_ENABLE
When set to 1, client-side metrics are collected on each DAOS client; they
can be retrieved by running daos_metrics -j job_id on that client.

. D_CLIENT_METRICS_RETAIN
When set to 1, client-side metrics are retained after the job exits, i.e.
they can still be retrieved by daos_metrics once the job has finished.

. D_CLIENT_METRICS_DUMP_PATH
Sets the path of the file to which each client dumps its client-side metrics
when the job exits.
78 changes: 42 additions & 36 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,48 @@ struct crt_plugin_gdata crt_plugin_gdata;
static bool g_prov_settings_applied[CRT_PROV_COUNT];

/* List of the environment variables used in CaRT */
/* Names of the environment variables used in CaRT; entry order is unchanged. */
static const char *crt_env_names[] = {
	/* provider / addressing */
	"D_PROVIDER", "D_INTERFACE", "D_DOMAIN", "D_PORT", "CRT_PHY_ADDR_STR",
	/* logging and debug masks */
	"D_LOG_STDERR_IN_LOG", "D_LOG_SIZE", "D_LOG_FILE", "D_LOG_FILE_APPEND_PID",
	"D_LOG_MASK", "DD_MASK", "DD_STDERR", "DD_SUBSYS",
	/* remaining names, in their original order */
	"CRT_TIMEOUT", "CRT_ATTACH_INFO_PATH",
	"OFI_PORT", "OFI_INTERFACE", "OFI_DOMAIN",
	"CRT_CREDIT_EP_CTX", "CRT_CTX_SHARE_ADDR", "CRT_CTX_NUM",
	"D_FI_CONFIG", "FI_UNIVERSE_SIZE", "CRT_ENABLE_MEM_PIN", "FI_OFI_RXM_USE_SRX",
	"D_LOG_FLUSH", "CRT_MRC_ENABLE", "CRT_SECONDARY_PROVIDER",
	"D_PROVIDER_AUTH_KEY", "D_PORT_AUTO_ADJUST", "D_POLL_TIMEOUT",
	"D_LOG_FILE_APPEND_RANK", "D_QUOTA_RPCS", "D_POST_INIT", "D_POST_INCR",
	"DAOS_SIGNAL_REGISTER",
};
/*
 * Names of the environment variables used in CaRT.
 * Entry order is unchanged; the comments below only group the names visually.
 */
static const char *crt_env_names[] = {
	/* provider / addressing */
	"D_PROVIDER",
	"D_INTERFACE",
	"D_DOMAIN",
	"D_PORT",
	"CRT_PHY_ADDR_STR",

	/* logging and debug masks */
	"D_LOG_STDERR_IN_LOG",
	"D_LOG_SIZE",
	"D_LOG_FILE",
	"D_LOG_FILE_APPEND_PID",
	"D_LOG_MASK",
	"DD_MASK",
	"DD_STDERR",
	"DD_SUBSYS",

	/* remaining CaRT, OFI and D_* names, in their original order */
	"CRT_TIMEOUT",
	"CRT_ATTACH_INFO_PATH",
	"OFI_PORT",
	"OFI_INTERFACE",
	"OFI_DOMAIN",
	"CRT_CREDIT_EP_CTX",
	"CRT_CTX_SHARE_ADDR",
	"CRT_CTX_NUM",
	"D_FI_CONFIG",
	"FI_UNIVERSE_SIZE",
	"CRT_ENABLE_MEM_PIN",
	"FI_OFI_RXM_USE_SRX",
	"D_LOG_FLUSH",
	"CRT_MRC_ENABLE",
	"CRT_SECONDARY_PROVIDER",
	"D_PROVIDER_AUTH_KEY",
	"D_PORT_AUTO_ADJUST",
	"D_POLL_TIMEOUT",
	"D_LOG_FILE_APPEND_RANK",
	"D_QUOTA_RPCS",
	"D_POST_INIT",
	"D_POST_INCR",
	"DAOS_SIGNAL_REGISTER",

	/* client side metrics */
	"D_CLIENT_METRICS_ENABLE",
	"D_CLIENT_METRICS_RETAIN",
	"D_CLIENT_METRICS_DUMP_PATH",
};

static void
crt_lib_init(void) __attribute__((__constructor__));
Expand Down
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c']


def scons():
Expand Down
14 changes: 13 additions & 1 deletion src/client/api/init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metrics.h>
#if BUILD_PIPELINE
#include <daos/pipeline.h>
#endif
Expand Down Expand Up @@ -227,6 +228,13 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_pl, rc);

/** set up client telemetry */
rc = dc_tm_init();
if (rc != 0) {
/* should not be fatal */
DL_WARN(rc, "failed to initialize client telemetry");
}

/** set up pool */
rc = dc_pool_init();
if (rc != 0)
Expand Down Expand Up @@ -260,6 +268,7 @@ daos_init(void)
out_pool:
dc_pool_fini();
out_mgmt:
dc_tm_fini();
dc_mgmt_fini();
out_pl:
pl_fini();
Expand Down Expand Up @@ -309,6 +318,8 @@ daos_fini(void)
D_GOTO(unlock, rc);
}

/** clean up all registered per-module metrics */
daos_metrics_fini();
#if BUILD_PIPELINE
dc_pipeline_fini();
#endif
Expand All @@ -322,6 +333,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
216 changes: 216 additions & 0 deletions src/client/api/metrics.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
/*
* (C) Copyright 2020-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
#define D_LOGFAC DD_FAC(client)

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/utsname.h>
#include <sys/shm.h>
#include <daos/common.h>
#include <daos/job.h>
#include <daos/tls.h>
#include <daos/metrics.h>
#include <daos/mgmt.h>
#include <gurt/telemetry_common.h>
#include <gurt/telemetry_consumer.h>
#include <gurt/telemetry_producer.h>

/* Count handed to MAX_IDS_SIZE() when the client telemetry root is sized. */
#define INIT_JOB_NUM 1024
/* Set by dc_tm_init() when client metrics are enabled; cleared there on failure. */
bool daos_client_metric = false;
/* Read by dc_tm_init(); when true, D_TM_RETAIN_SHMEM is added to the init flags. */
bool daos_client_metric_retain = false;

/*
 * num multiplied by D_TM_METRIC_SIZE. The argument is parenthesized so that
 * an expression such as MAX_IDS_SIZE(a + b) scales the whole sum.
 */
#define MAX_IDS_SIZE(num) ((num) * D_TM_METRIC_SIZE)
/*
 * The client side metrics structure looks like
 * root/job_id/pid/....
 */

/**
 * Change the owning uid of an existing System V shared memory segment.
 *
 * \param[in] key        IPC key used to look the segment up with shmget().
 * \param[in] new_owner  uid stored into the segment's shm_perm.uid.
 *
 * \return 0 on success; otherwise the errno of the failing call converted
 *         with daos_errno2der().
 */
static int
shm_chown(key_t key, uid_t new_owner)
{
	struct shmid_ds	shmid_ds;
	int		shmid;
	int		saved_errno;

	/* No IPC_CREAT flag: this only looks up an already existing segment. */
	shmid = shmget(key, 0, 0);
	if (shmid < 0) {
		/* Capture errno before logging; the log call may overwrite it. */
		saved_errno = errno;
		D_ERROR("shmget(0x%x) failed: %s (%d)\n", key, strerror(saved_errno), saved_errno);
		return daos_errno2der(saved_errno);
	}

	/* Fetch the current descriptor so that only the uid field is modified. */
	if (shmctl(shmid, IPC_STAT, &shmid_ds) < 0) {
		saved_errno = errno;
		D_ERROR("shmctl(0x%x, IPC_STAT) failed: %s (%d)\n", shmid, strerror(saved_errno),
			saved_errno);
		return daos_errno2der(saved_errno);
	}

	shmid_ds.shm_perm.uid = new_owner;
	if (shmctl(shmid, IPC_SET, &shmid_ds) < 0) {
		saved_errno = errno;
		D_ERROR("shmctl(0x%x, IPC_SET) failed: %s (%d)\n", shmid, strerror(saved_errno),
			saved_errno);
		return daos_errno2der(saved_errno);
	}

	return 0;
}

static int
init_managed_root(const char *name, pid_t pid, int flags)
{
uid_t agent_uid;
key_t key;
int rc;

/* Set the key based on our pid so that it can be easily found. */
key = pid - D_TM_SHARED_MEMORY_KEY;
mjmac marked this conversation as resolved.
Show resolved Hide resolved
rc = d_tm_init_with_name(key, MAX_IDS_SIZE(INIT_JOB_NUM), flags, name);
if (rc != 0) {
DL_ERROR(rc, "failed to initialize root for %s.", name);
return rc;
}

/* Request that the agent adds our segment into the tree. */
rc = dc_mgmt_tm_register(NULL, dc_jobid, pid, &agent_uid);
if (rc != 0) {
DL_ERROR(rc, "client telemetry setup failed.");
return rc;
}

/* Change ownership of the segment so that the agent can manage it. */
D_INFO("setting shm segment 0x%x to be owned by uid %d\n", pid, agent_uid);
rc = shm_chown(pid, agent_uid);
if (rc != 0) {
DL_ERROR(rc, "failed to chown shm segment.");
return rc;
}

return 0;
}

int
dc_tm_init(void)
{
struct d_tm_node_t *started_at;
pid_t pid = getpid();
int metrics_tag;
char root_name[D_TM_MAX_NAME_LEN];
int rc;

d_getenv_bool(DAOS_CLIENT_METRICS_ENABLE, &daos_client_metric);
mjmac marked this conversation as resolved.
Show resolved Hide resolved
if (!daos_client_metric && d_isenv_def(DAOS_CLIENT_METRICS_DUMP_PATH))
daos_client_metric = true;

if (!daos_client_metric)
return 0;

D_INFO("Setting up client telemetry for %s/%d\n", dc_jobid, pid);

rc = dc_tls_key_create();
if (rc)
D_GOTO(out, rc);

metrics_tag = D_TM_OPEN_OR_CREATE | D_TM_MULTIPLE_WRITER_LOCK;
d_getenv_bool(DAOS_CLIENT_METRICS_RETAIN, &daos_client_metric_retain);
if (daos_client_metric_retain)
metrics_tag |= D_TM_RETAIN_SHMEM;

snprintf(root_name, sizeof(root_name), "%d", pid);
rc = init_managed_root(root_name, pid, metrics_tag);
if (rc != 0) {
DL_ERROR(rc, "failed to initialize client telemetry");
D_GOTO(out, rc);
}

rc = d_tm_add_metric(&started_at, D_TM_TIMESTAMP, "Timestamp of client startup", NULL,
"started_at");
if (rc != 0) {
DL_ERROR(rc, "add metric started_at failed.");
D_GOTO(out, rc);
}

d_tm_record_timestamp(started_at);
out:
if (rc != 0) {
daos_client_metric = false;
d_tm_fini();
}

return rc;
}

/**
 * Callback handed to d_tm_iterate() by dump_tm_file(): prints one node with
 * d_tm_print_node(). \a arg is the FILE stream to print to.
 */
static void
iter_dump(struct d_tm_context *ctx, struct d_tm_node_t *node, int level, char *path, int format,
	  int opt_fields, void *arg)
{
	FILE *stream = (FILE *)arg;

	d_tm_print_node(ctx, node, level, path, format, opt_fields, stream);
}

/**
 * Write this process's client metrics to a file in CSV format.
 *
 * The metrics are looked up under "<dc_jobid>/<pid>" in the telemetry region
 * opened with d_tm_open(DC_TM_JOB_ROOT_ID).
 *
 * \param[in] dump_path  Path of the file to create or truncate and write.
 *
 * \return 0 on success, -DER_INVAL if the file cannot be opened, -DER_NOMEM
 *         if d_tm_open() fails, -DER_NONEXIST if no metrics are found for
 *         this process, or the converted errno if closing the file fails.
 */
static int
dump_tm_file(const char *dump_path)
{
	struct d_tm_context	*ctx;
	struct d_tm_node_t	*root;
	char			 dirname[D_TM_MAX_NAME_LEN] = {0};
	uint32_t		 filter;
	FILE			*dump_file;
	int			 saved_errno;
	int			 rc = 0;

	dump_file = fopen(dump_path, "w+");
	if (dump_file == NULL) {
		/* Capture errno before logging; the log call may overwrite it. */
		saved_errno = errno;
		D_ERROR("cannot open %s: %s (%d)\n", dump_path, strerror(saved_errno),
			saved_errno);
		return -DER_INVAL;
	}

	/* Metric types passed to d_tm_iterate() for inclusion in the dump. */
	filter = D_TM_COUNTER | D_TM_DURATION | D_TM_TIMESTAMP | D_TM_MEMINFO |
		 D_TM_TIMER_SNAPSHOT | D_TM_GAUGE | D_TM_STATS_GAUGE;

	ctx = d_tm_open(DC_TM_JOB_ROOT_ID);
	if (ctx == NULL)
		D_GOTO(close, rc = -DER_NOMEM);

	/* Same "%d" pid formatting that dc_tm_init() uses for the root name. */
	snprintf(dirname, sizeof(dirname), "%s/%d", dc_jobid, getpid());
	root = d_tm_find_metric(ctx, dirname);
	if (root == NULL) {
		/* Report through the log; library code should not write to stdout. */
		D_ERROR("No metrics found at: '%s'\n", dirname);
		D_GOTO(close_ctx, rc = -DER_NONEXIST);
	}

	d_tm_print_field_descriptors(0, dump_file);

	d_tm_iterate(ctx, root, 0, filter, NULL, D_TM_CSV, 0, iter_dump, dump_file);

close_ctx:
	d_tm_close(&ctx);
close:
	/* fclose() flushes buffered output, so a failure here means lost data. */
	if (fclose(dump_file) != 0 && rc == 0) {
		saved_errno = errno;
		D_ERROR("cannot close %s: %s (%d)\n", dump_path, strerror(saved_errno),
			saved_errno);
		rc = daos_errno2der(saved_errno);
	}
	return rc;
}

/**
 * Tear down client side telemetry. Does nothing when daos_client_metric is
 * false. If DAOS_CLIENT_METRICS_DUMP_PATH yields a path, the metrics are
 * dumped to it first; the TLS state and the telemetry library are then
 * finalized whether or not the dump happened.
 */
void
dc_tm_fini()
{
	char *path;

	if (!daos_client_metric)
		return;

	if (d_agetenv_str(&path, DAOS_CLIENT_METRICS_DUMP_PATH) == 0) {
		if (path != NULL) {
			D_INFO("dump path is %s\n", path);
			/* Best effort: the result of the dump is not checked. */
			dump_tm_file(path);
		}
		d_freeenv_str(&path);
	}

	dc_tls_fini();
	dc_tls_key_delete();

	d_tm_fini();
}
3 changes: 2 additions & 1 deletion src/common/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ COMMON_FILES = ['debug.c', 'mem.c', 'fail_loc.c', 'lru.c',
'acl_api.c', 'acl_util.c', 'acl_principal.c', 'cont_props.c',
'dedup.c', 'profile.c', 'compression.c', 'compression_isal.c',
'compression_qat.c', 'multihash.c', 'multihash_isal.c',
'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c']
'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c',
'tls.c', 'metrics.c']


def build_daos_common(denv, client):
Expand Down
Loading
Loading