Skip to content

Commit

Permalink
DAOS-14561 vos: Add garbage collection metrics (#13244)
Browse files Browse the repository at this point in the history
* Add some metrics for garbage collection
* Add total metrics for vos space, so we can infer free space
* Add aggregation failure counter

Also adds a small change to the Go builder for
handling versions with a -suffix.

Required-githooks: true
Change-Id: I098ec9ddce846758f669fd8d7dba7a0a5b44ef2d

Co-authored-by: Michael MacDonald <[email protected]>
Change-Id: I97502b0503fbe0b5bc57d7b3cee514264b885fa3
Signed-off-by: Jeff Olivier <[email protected]>
Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
jolivier23 and mjmac committed Dec 13, 2023
1 parent 75c62d9 commit aa7d66b
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 10 deletions.
2 changes: 2 additions & 0 deletions site_scons/site_tools/go_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def _check_go_version(context):

# go version go1.2.3 Linux/amd64
go_version = out.split(' ')[2].replace('go', '')
if '-' in go_version:
go_version = go_version.split('-')[0]
if len([x for x, y in
zip(go_version.split('.'), MIN_GO_VERSION.split('.'))
if int(x) < int(y)]) > 0:
Expand Down
2 changes: 1 addition & 1 deletion src/vos/tests/vts_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ gc_key_test(void **state)
int rc;

rc = gc_key_run(args);
assert_int_equal(rc, 0);
assert_rc_equal(rc, 0);
}

static int
Expand Down
18 changes: 16 additions & 2 deletions src/vos/vos_aggregate.c
Original file line number Diff line number Diff line change
Expand Up @@ -2358,7 +2358,12 @@ vos_aggregate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry,
}

if (rc < 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

D_ERROR("VOS aggregation failed: "DF_RC"\n", DP_RC(rc));
if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);

return rc;
}

Expand Down Expand Up @@ -2431,7 +2436,11 @@ vos_aggregate_post_cb(daos_handle_t ih, vos_iter_entry_t *entry,
inc_agg_counter(agg_param, type, AGG_OP_DEL);
rc = 0;
} else if (rc != 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

D_ERROR("VOS aggregation failed: %d\n", rc);
if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);

/*
* -DER_TX_BUSY error indicates current ilog aggregation
Expand All @@ -2442,8 +2451,6 @@ vos_aggregate_post_cb(daos_handle_t ih, vos_iter_entry_t *entry,
* orphan the current entry due to incarnation log semantics.
*/
if (rc == -DER_TX_BUSY) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

agg_param->ap_in_progress = 1;
rc = 0;
switch (type) {
Expand Down Expand Up @@ -2736,6 +2743,13 @@ vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
free_agg_data:
D_FREE(ad);

if (rc < 0) {
struct vos_agg_metrics *vam = agg_cont2metrics(cont);

if (vam && vam->vam_fail_count)
d_tm_inc_counter(vam->vam_fail_count, 1);
}

return rc;
}

Expand Down
21 changes: 21 additions & 0 deletions src/vos/vos_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,12 @@ vos_metrics_alloc(const char *path, int tgt_id)
if (rc)
D_WARN("Failed to create 'merged_size' telemetry : "DF_RC"\n", DP_RC(rc));

/* VOS aggregation failed */
rc = d_tm_add_metric(&vam->vam_fail_count, D_TM_COUNTER, "aggregation failures", NULL,
"%s/%s/fail_count/tgt_%u", path, VOS_AGG_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'fail_count' telemetry : "DF_RC"\n", DP_RC(rc));

/* Metrics related to VOS checkpointing */
vos_chkpt_metrics_init(&vp_metrics->vp_chkpt_metrics, path, tgt_id);

Expand All @@ -768,6 +774,21 @@ vos_metrics_alloc(const char *path, int tgt_id)
if (rc)
D_WARN("Failed to create 'nvme_used' telemetry : "DF_RC"\n", DP_RC(rc));

/* VOS space SCM total metric */
rc = d_tm_add_metric(&vsm->vsm_scm_total, D_TM_GAUGE, "SCM space total", "bytes",
"%s/%s/scm_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'scm_total' telemetry : " DF_RC "\n", DP_RC(rc));

/* VOS space NVME total metric */
rc = d_tm_add_metric(&vsm->vsm_nvme_total, D_TM_GAUGE, "NVME space total", "bytes",
"%s/%s/nvme_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
D_WARN("Failed to create 'nvme_total' telemetry : " DF_RC "\n", DP_RC(rc));

/** garbage collection metrics */
vos_gc_metrics_init(&vp_metrics->vp_gc_metrics, path, tgt_id);

/* Initialize the vos_space_metrics timeout counter */
vsm->vsm_last_update_ts = 0;

Expand Down
126 changes: 121 additions & 5 deletions src/vos/vos_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,33 @@ gc_get_container(struct vos_pool *pool)
return cont;
}

/**
 * Roll the GC statistics of the slice that just finished into the long-lived
 * totals, then clear them for the next slice.
 *
 * The per-slice counters (pool->vp_gc_stat) are added to the cumulative
 * counters (pool->vp_gc_stat_global) and, when the pool has metrics attached,
 * to the matching GC telemetry counters. The per-slice counters are zeroed
 * before returning.
 *
 * \param[in,out] pool	Pool whose GC statistics are published and reset.
 */
static void
gc_update_stats(struct vos_pool *pool)
{
	struct vos_gc_stat	*slice = &pool->vp_gc_stat;
	struct vos_gc_stat	*total = &pool->vp_gc_stat_global;

	/* Telemetry is optional: a pool may have no metrics attached. */
	if (pool->vp_metrics != NULL) {
		struct vos_gc_metrics	*vgm = &pool->vp_metrics->vp_gc_metrics;

		d_tm_inc_counter(vgm->vgm_cont_del, slice->gs_conts);
		d_tm_inc_counter(vgm->vgm_obj_del, slice->gs_objs);
		d_tm_inc_counter(vgm->vgm_dkey_del, slice->gs_dkeys);
		d_tm_inc_counter(vgm->vgm_akey_del, slice->gs_akeys);
		d_tm_inc_counter(vgm->vgm_ev_del, slice->gs_recxs);
		d_tm_inc_counter(vgm->vgm_sv_del, slice->gs_singvs);
	}

	/* Cumulative totals are maintained whether or not telemetry is enabled. */
	total->gs_conts  += slice->gs_conts;
	total->gs_objs   += slice->gs_objs;
	total->gs_dkeys  += slice->gs_dkeys;
	total->gs_akeys  += slice->gs_akeys;
	total->gs_recxs  += slice->gs_recxs;
	total->gs_singvs += slice->gs_singvs;

	/* Start the next slice from zero so nothing is counted twice. */
	memset(slice, 0, sizeof(*slice));
}

/**
* Run garbage collector for a pool, it returns if all @credits are consumed
* or there is nothing to be reclaimed.
Expand All @@ -671,7 +698,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)

if (pool->vp_dying) {
*empty_ret = true;
return 0;
D_GOTO(done, rc = 0);
}

/* take an extra ref to avoid concurrent container destroy/free */
Expand All @@ -684,7 +711,8 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
DP_UUID(pool->vp_id), DP_RC(rc));
if (cont != NULL)
vos_cont_decref(cont);
return rc;
*empty_ret = false;
goto done;
}

*empty_ret = false;
Expand Down Expand Up @@ -781,6 +809,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
if (cont != NULL)
vos_cont_decref(cont);

done:
gc_update_stats(pool);

return rc;
}

Expand Down Expand Up @@ -916,7 +947,7 @@ gc_have_pool(struct vos_pool *pool)
static void
gc_log_pool(struct vos_pool *pool)
{
struct vos_gc_stat *stat = &pool->vp_gc_stat;
struct vos_gc_stat *stat = &pool->vp_gc_stat_global;

D_DEBUG(DB_TRACE,
"Pool="DF_UUID", GC reclaimed:\n"
Expand Down Expand Up @@ -1106,6 +1137,9 @@ int
vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),
void *yield_arg)
{
struct d_tm_node_t *duration = NULL;
struct d_tm_node_t *tight = NULL;
struct d_tm_node_t *slack = NULL;
struct vos_pool *pool = vos_hdl2pool(poh);
struct vos_tls *tls = vos_tls_get(pool->vp_sysdb);
struct vos_gc_param param;
Expand All @@ -1131,24 +1165,44 @@ vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),

tls->vtl_gc_running++;

if (pool->vp_metrics != NULL) {
duration = pool->vp_metrics->vp_gc_metrics.vgm_duration;
slack = pool->vp_metrics->vp_gc_metrics.vgm_slack_cnt;
tight = pool->vp_metrics->vp_gc_metrics.vgm_tight_cnt;
}

while (1) {
int creds = param.vgc_credits;

d_tm_mark_duration_start(duration, D_TM_CLOCK_THREAD_CPUTIME);
if (creds == GC_CREDS_TIGHT)
d_tm_inc_counter(tight, 1);
else
d_tm_inc_counter(slack, 1);

if (credits > 0 && (credits - total) < creds)
creds = credits - total;

total += creds;
rc = vos_gc_pool_tight(poh, &creds);

if (rc) {
D_ERROR("GC pool failed: " DF_RC "\n", DP_RC(rc));
d_tm_mark_duration_end(duration);
break;
}
total -= creds; /* subtract the remaining credits */
if (creds != 0)
if (creds != 0) {
d_tm_mark_duration_end(duration);
break; /* reclaimed everything */
}

if (credits > 0 && total >= credits)
if (credits > 0 && total >= credits) {
d_tm_mark_duration_end(duration);
break; /* consumed all credits */
}

d_tm_mark_duration_end(duration);

if (vos_gc_yield(&param)) {
D_DEBUG(DB_TRACE, "GC pool run aborted\n");
Expand Down Expand Up @@ -1199,3 +1253,65 @@ vos_flush_pool(daos_handle_t poh, bool force, uint32_t nr_flush, uint32_t *nr_fl

return rc;
}

#define VOS_GC_DIR "vos_gc"

/*
 * Register one GC telemetry node at "<path>/VOS_GC_DIR/<name>/tgt_<tgt_id>".
 * A failure is only reported with a warning. Expands inside
 * vos_gc_metrics_init() and uses its 'rc', 'path' and 'tgt_id'. 'name' must be
 * a string literal because it is pasted into the format strings.
 */
#define GC_METRIC_ADD(node, type, desc, name)                                                      \
	do {                                                                                       \
		rc = d_tm_add_metric((node), (type), (desc), NULL, "%s/%s/" name "/tgt_%u", path,  \
				     VOS_GC_DIR, tgt_id);                                          \
		if (rc)                                                                            \
			D_WARN("Failed to create '" name "' telemetry: " DF_RC "\n", DP_RC(rc));   \
	} while (0)

/**
 * Create the garbage collection telemetry nodes for one target.
 *
 * Each node is registered independently. A node that cannot be created is
 * reported with a warning and the remaining nodes are still attempted.
 *
 * \param[out] vgm	GC metrics structure receiving the node pointers.
 * \param[in]  path	Leading component of the telemetry path.
 * \param[in]  tgt_id	Target ID used as the trailing path component.
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vgm, const char *path, int tgt_id)
{
	int rc;

	/* Thread CPU time of each GC slice */
	GC_METRIC_ADD(&vgm->vgm_duration, D_TM_DURATION | D_TM_CLOCK_THREAD_CPUTIME,
		      "GC slice duration", "duration");

	/* Number of items reclaimed, one counter per item type */
	GC_METRIC_ADD(&vgm->vgm_cont_del, D_TM_COUNTER, "GC containers deleted", "cont_del");
	GC_METRIC_ADD(&vgm->vgm_obj_del, D_TM_COUNTER, "GC objects deleted", "obj_del");
	GC_METRIC_ADD(&vgm->vgm_dkey_del, D_TM_COUNTER, "GC dkeys deleted", "dkey_del");
	GC_METRIC_ADD(&vgm->vgm_akey_del, D_TM_COUNTER, "GC akeys deleted", "akey_del");
	GC_METRIC_ADD(&vgm->vgm_ev_del, D_TM_COUNTER, "GC ev deleted", "ev_del");
	GC_METRIC_ADD(&vgm->vgm_sv_del, D_TM_COUNTER, "GC sv deleted", "sv_del");

	/* Number of GC slices run in slack and in tight mode */
	GC_METRIC_ADD(&vgm->vgm_slack_cnt, D_TM_COUNTER, "GC slack mode count", "slack_cnt");
	GC_METRIC_ADD(&vgm->vgm_tight_cnt, D_TM_COUNTER, "GC tight mode count", "tight_cnt");
}

#undef GC_METRIC_ADD
20 changes: 20 additions & 0 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,19 @@ struct vos_agg_metrics {
struct d_tm_node_t *vam_del_ev; /* Deleted EV records */
struct d_tm_node_t *vam_merge_recs; /* Total merged EV records */
struct d_tm_node_t *vam_merge_size; /* Total merged size */
struct d_tm_node_t *vam_fail_count; /* Aggregation failed */
};

/*
 * VOS garbage collection telemetry nodes, registered by vos_gc_metrics_init().
 * The *_del counters are advanced from the per-slice GC statistics each time
 * gc_update_stats() runs.
 */
struct vos_gc_metrics {
struct d_tm_node_t *vgm_duration; /* Thread CPU time of each GC slice (one pass of the vos_gc_pool() loop) */
struct d_tm_node_t *vgm_cont_del; /* Containers reclaimed (gs_conts) */
struct d_tm_node_t *vgm_obj_del; /* Objects reclaimed (gs_objs) */
struct d_tm_node_t *vgm_dkey_del; /* Dkeys reclaimed (gs_dkeys) */
struct d_tm_node_t *vgm_akey_del; /* Akeys reclaimed (gs_akeys) */
struct d_tm_node_t *vgm_ev_del; /* EV records reclaimed (gs_recxs) */
struct d_tm_node_t *vgm_sv_del; /* SV records reclaimed (gs_singvs) */
struct d_tm_node_t *vgm_slack_cnt; /* GC slices run with credits other than GC_CREDS_TIGHT */
struct d_tm_node_t *vgm_tight_cnt; /* GC slices run with GC_CREDS_TIGHT credits */
};

/*
Expand All @@ -200,10 +213,14 @@ struct vos_chkpt_metrics {
};

void vos_chkpt_metrics_init(struct vos_chkpt_metrics *vc_metrics, const char *path, int tgt_id);
/**
 * Register the garbage collection telemetry nodes of \a vc_metrics, each at
 * "<path>/vos_gc/<metric>/tgt_<tgt_id>". A node that cannot be created is only
 * reported with a warning; no error is returned to the caller.
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vc_metrics, const char *path, int tgt_id);

/*
 * VOS pool space telemetry. Reporting both "used" and "total" lets a consumer
 * derive the free space.
 */
struct vos_space_metrics {
struct d_tm_node_t *vsm_scm_used; /* SCM space used */
struct d_tm_node_t *vsm_nvme_used; /* NVMe space used */
struct d_tm_node_t *vsm_scm_total; /* SCM space total in bytes; gauge set from the pool's pd_scm_sz */
struct d_tm_node_t *vsm_nvme_total; /* NVMe space total in bytes; gauge set from the pool's pd_nvme_sz */
/*
 * Rate-limits vos_space_update_metrics(): refreshes are skipped until
 * VOS_SPACE_METRICS_INTV has elapsed past this value. It starts at 0, and
 * the total gauges are set while it is still 0.
 * NOTE(review): presumably holds the coarse time of the last refresh - confirm.
 */
uint64_t vsm_last_update_ts; /* Timeout counter */
};

Expand All @@ -219,6 +236,7 @@ struct vos_rh_metrics {
struct vos_pool_metrics {
void *vp_vea_metrics;
struct vos_agg_metrics vp_agg_metrics;
struct vos_gc_metrics vp_gc_metrics;
struct vos_space_metrics vp_space_metrics;
struct vos_chkpt_metrics vp_chkpt_metrics;
struct vos_rh_metrics vp_rh_metrics;
Expand Down Expand Up @@ -255,6 +273,8 @@ struct vos_pool {
/** btr handle for the container table */
daos_handle_t vp_cont_th;
/** GC statistics of this pool */
struct vos_gc_stat vp_gc_stat_global;
/** GC per slice statistics of this pool */
struct vos_gc_stat vp_gc_stat;
/** link chain on vos_tls::vtl_gc_pools */
d_list_t vp_gc_link;
Expand Down
4 changes: 2 additions & 2 deletions src/vos/vos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -1494,7 +1494,7 @@ vos_pool_query(daos_handle_t poh, vos_pool_info_t *pinfo)

D_ASSERT(pinfo != NULL);
pinfo->pif_cont_nr = pool_df->pd_cont_nr;
pinfo->pif_gc_stat = pool->vp_gc_stat;
pinfo->pif_gc_stat = pool->vp_gc_stat_global;

rc = vos_space_query(pool, &pinfo->pif_space, true);
if (rc)
Expand Down Expand Up @@ -1552,7 +1552,7 @@ vos_pool_ctl(daos_handle_t poh, enum vos_pool_opc opc, void *param)
default:
return -DER_NOSYS;
case VOS_PO_CTL_RESET_GC:
memset(&pool->vp_gc_stat, 0, sizeof(pool->vp_gc_stat));
memset(&pool->vp_gc_stat_global, 0, sizeof(pool->vp_gc_stat_global));
break;
case VOS_PO_CTL_SET_POLICY:
if (param == NULL)
Expand Down
6 changes: 6 additions & 0 deletions src/vos/vos_space.c
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,12 @@ vos_space_update_metrics(struct vos_pool *pool)
if (!vpm)
return;

if (vpm->vp_space_metrics.vsm_last_update_ts == 0) {
/* Set the constant values */
d_tm_set_gauge(vpm->vp_space_metrics.vsm_scm_total, pool->vp_pool_df->pd_scm_sz);
d_tm_set_gauge(vpm->vp_space_metrics.vsm_nvme_total, pool->vp_pool_df->pd_nvme_sz);
}

now = daos_gettime_coarse();
if (now < vpm->vp_space_metrics.vsm_last_update_ts + VOS_SPACE_METRICS_INTV) {
return;
Expand Down

0 comments on commit aa7d66b

Please sign in to comment.