Skip to content

Commit

Permalink
DAOS-14561 vos: Add garbage collection metrics
Browse files Browse the repository at this point in the history
* Add some metrics for garbage collection
* Add total metrics for vos space, so we can infer free space

Required-githooks: true

Signed-off-by: Jeff Olivier <[email protected]>
  • Loading branch information
jolivier23 committed Oct 26, 2023
1 parent 2044a37 commit cc354d5
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/vos/tests/vts_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ gc_key_test(void **state)
int rc;

rc = gc_key_run(args);
assert_int_equal(rc, 0);
assert_rc_equal(rc, 0);
}

static int
Expand Down
15 changes: 15 additions & 0 deletions src/vos/vos_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,21 @@ vos_metrics_alloc(const char *path, int tgt_id)
if (rc)
D_WARN("Failed to create 'nvme_used' telemetry : "DF_RC"\n", DP_RC(rc));

/*
 * VOS space SCM total metric.  It must live under its own "scm_total" node:
 * registering it under "scm_used" would collide with the used-space gauge.
 */
rc = d_tm_add_metric(&vsm->vsm_scm_total, D_TM_GAUGE, "SCM space total", "bytes",
		     "%s/%s/scm_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
	D_WARN("Failed to create 'scm_total' telemetry : " DF_RC "\n", DP_RC(rc));

/*
 * VOS space NVMe total metric.  Same reasoning as above: use a dedicated
 * "nvme_total" node rather than the "nvme_used" path.
 */
rc = d_tm_add_metric(&vsm->vsm_nvme_total, D_TM_GAUGE, "NVMe space total", "bytes",
		     "%s/%s/nvme_total/tgt_%u", path, VOS_SPACE_DIR, tgt_id);
if (rc)
	D_WARN("Failed to create 'nvme_total' telemetry : " DF_RC "\n", DP_RC(rc));

/** garbage collection metrics */
vos_gc_metrics_init(&vp_metrics->vp_gc_metrics, path, tgt_id);

/* Initialize the vos_space_metrics timeout counter */
vsm->vsm_last_update_ts = 0;

Expand Down
126 changes: 121 additions & 5 deletions src/vos/vos_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,33 @@ gc_get_container(struct vos_pool *pool)
return cont;
}

/**
 * Flush the per-slice GC statistics of @pool.
 *
 * When the pool has telemetry attached, each per-slice count is written to
 * the matching GC gauge.  The counts are then added to the pool's global GC
 * statistics and the per-slice statistics are zeroed.
 *
 * \param[in,out] pool	Pool whose vp_gc_stat is folded into vp_gc_stat_global.
 */
static void
gc_update_stats(struct vos_pool *pool)
{
	struct vos_gc_stat	*slice = &pool->vp_gc_stat;
	struct vos_gc_stat	*total = &pool->vp_gc_stat_global;

	/* Telemetry is optional: only publish when metrics were allocated */
	if (pool->vp_metrics != NULL) {
		struct vos_gc_metrics *metrics = &pool->vp_metrics->vp_gc_metrics;

		d_tm_set_gauge(metrics->vgm_cont_del, slice->gs_conts);
		d_tm_set_gauge(metrics->vgm_obj_del, slice->gs_objs);
		d_tm_set_gauge(metrics->vgm_dkey_del, slice->gs_dkeys);
		d_tm_set_gauge(metrics->vgm_akey_del, slice->gs_akeys);
		d_tm_set_gauge(metrics->vgm_ev_del, slice->gs_recxs);
		d_tm_set_gauge(metrics->vgm_sv_del, slice->gs_singvs);
	}

	/* Accumulate this slice into the running totals */
	total->gs_conts  += slice->gs_conts;
	total->gs_objs   += slice->gs_objs;
	total->gs_dkeys  += slice->gs_dkeys;
	total->gs_akeys  += slice->gs_akeys;
	total->gs_recxs  += slice->gs_recxs;
	total->gs_singvs += slice->gs_singvs;

	/* Start the next slice from a clean slate */
	memset(slice, 0, sizeof(*slice));
}

/**
* Run garbage collector for a pool, it returns if all @credits are consumed
* or there is nothing to be reclaimed.
Expand All @@ -689,7 +716,7 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)

if (pool->vp_dying) {
*empty_ret = true;
return 0;
D_GOTO(done, rc = 0);
}

/* take an extra ref to avoid concurrent container destroy/free */
Expand All @@ -702,7 +729,8 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
DP_UUID(pool->vp_id), DP_RC(rc));
if (cont != NULL)
vos_cont_decref(cont);
return rc;
*empty_ret = false;
goto done;
}

*empty_ret = false;
Expand Down Expand Up @@ -799,6 +827,9 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret)
if (cont != NULL)
vos_cont_decref(cont);

done:
gc_update_stats(pool);

return rc;
}

Expand Down Expand Up @@ -934,7 +965,7 @@ gc_have_pool(struct vos_pool *pool)
static void
gc_log_pool(struct vos_pool *pool)
{
struct vos_gc_stat *stat = &pool->vp_gc_stat;
struct vos_gc_stat *stat = &pool->vp_gc_stat_global;

D_DEBUG(DB_TRACE,
"Pool="DF_UUID", GC reclaimed:\n"
Expand Down Expand Up @@ -1124,6 +1155,9 @@ int
vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),
void *yield_arg)
{
struct d_tm_node_t *duration = NULL;
struct d_tm_node_t *tight = NULL;
struct d_tm_node_t *slack = NULL;
struct vos_pool *pool = vos_hdl2pool(poh);
struct vos_tls *tls = vos_tls_get(pool->vp_sysdb);
struct vos_gc_param param;
Expand All @@ -1149,24 +1183,44 @@ vos_gc_pool(daos_handle_t poh, int credits, int (*yield_func)(void *arg),

tls->vtl_gc_running++;

if (pool->vp_metrics != NULL) {
duration = pool->vp_metrics->vp_gc_metrics.vgm_duration;
slack = pool->vp_metrics->vp_gc_metrics.vgm_slack_cnt;
tight = pool->vp_metrics->vp_gc_metrics.vgm_tight_cnt;
}

while (1) {
int creds = param.vgc_credits;

d_tm_mark_duration_start(duration, D_TM_CLOCK_THREAD_CPUTIME);
if (creds == GC_CREDS_TIGHT)
d_tm_inc_counter(tight, 1);
else
d_tm_inc_counter(slack, 1);

if (credits > 0 && (credits - total) < creds)
creds = credits - total;

total += creds;
rc = vos_gc_pool_tight(poh, &creds);

if (rc) {
D_ERROR("GC pool failed: " DF_RC "\n", DP_RC(rc));
d_tm_mark_duration_end(duration);
break;
}
total -= creds; /* subtract the remainded credits */
if (creds != 0)
if (creds != 0) {
d_tm_mark_duration_end(duration);
break; /* reclaimed everything */
}

if (credits > 0 && total >= credits)
if (credits > 0 && total >= credits) {
d_tm_mark_duration_end(duration);
break; /* consumed all credits */
}

d_tm_mark_duration_end(duration);

if (vos_gc_yield(&param)) {
D_DEBUG(DB_TRACE, "GC pool run aborted\n");
Expand Down Expand Up @@ -1217,3 +1271,65 @@ vos_flush_pool(daos_handle_t poh, uint32_t nr_flush, uint32_t *nr_flushed)

return rc;
}

#define VOS_GC_DIR "vos_gc"

/*
 * Register one GC telemetry node at "<path>/VOS_GC_DIR/<name>/tgt_<tgt_id>"
 * and store it in vgm-><field>; a failure is only logged as a warning.
 * Relies on 'vgm', 'path' and 'tgt_id' of the enclosing function.  'name'
 * must be a string literal because it is pasted into the format strings.
 */
#define GC_METRIC_ADD(field, type, desc, name)                                             \
	do {                                                                               \
		int add_rc = d_tm_add_metric(&vgm->field, (type), desc, NULL,              \
					     "%s/%s/" name "/tgt_%u", path, VOS_GC_DIR,    \
					     tgt_id);                                      \
		if (add_rc)                                                                \
			D_WARN("Failed to create '" name "' telemetry: " DF_RC "\n",       \
			       DP_RC(add_rc));                                             \
	} while (0)

/**
 * Create the garbage collection telemetry nodes for one target.
 *
 * Every node is registered independently; a registration failure is logged
 * and does not prevent the remaining nodes from being registered.
 *
 * \param[out] vgm	GC metrics structure receiving the node pointers.
 * \param[in]  path	Leading component of each metric path.
 * \param[in]  tgt_id	Target id used for the trailing "tgt_<id>" component.
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vgm, const char *path, int tgt_id)
{
	/* GC slice duration */
	GC_METRIC_ADD(vgm_duration, D_TM_DURATION | D_TM_CLOCK_THREAD_CPUTIME,
		      "GC slice duration", "duration");

	/* Reclaimed item gauges: containers, objects, dkeys, akeys, EV and SV records */
	GC_METRIC_ADD(vgm_cont_del, D_TM_GAUGE, "GC containers deleted", "cont_del");
	GC_METRIC_ADD(vgm_obj_del, D_TM_GAUGE, "GC objects deleted", "obj_del");
	GC_METRIC_ADD(vgm_dkey_del, D_TM_GAUGE, "GC dkeys deleted", "dkey_del");
	GC_METRIC_ADD(vgm_akey_del, D_TM_GAUGE, "GC akeys deleted", "akey_del");
	GC_METRIC_ADD(vgm_ev_del, D_TM_GAUGE, "GC ev deleted", "ev_del");
	GC_METRIC_ADD(vgm_sv_del, D_TM_GAUGE, "GC sv deleted", "sv_del");

	/* GC slack / tight mode run counters */
	GC_METRIC_ADD(vgm_slack_cnt, D_TM_COUNTER, "GC slack mode count", "slack_cnt");
	GC_METRIC_ADD(vgm_tight_cnt, D_TM_COUNTER, "GC tight mode count", "tight_cnt");
}

#undef GC_METRIC_ADD
19 changes: 19 additions & 0 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,18 @@ struct vos_agg_metrics {
struct d_tm_node_t *vam_merge_size; /* Total merged size */
};

/*
 * VOS Pool metrics for garbage collection activity.
 *
 * All nodes are created by vos_gc_metrics_init(). The "*_del" gauges are set
 * from the per-slice GC statistics each time they are flushed, so they hold
 * the counts of the most recently flushed slice.
 */
struct vos_gc_metrics {
struct d_tm_node_t *vgm_duration; /* Duration of each GC slice (thread CPU-time clock) */
struct d_tm_node_t *vgm_cont_del; /* Containers reclaimed in the last slice */
struct d_tm_node_t *vgm_obj_del; /* Objects reclaimed in the last slice */
struct d_tm_node_t *vgm_dkey_del; /* Dkeys reclaimed in the last slice */
struct d_tm_node_t *vgm_akey_del; /* Akeys reclaimed in the last slice */
struct d_tm_node_t *vgm_ev_del; /* EV records reclaimed in the last slice */
struct d_tm_node_t *vgm_sv_del; /* SV records reclaimed in the last slice */
struct d_tm_node_t *vgm_slack_cnt; /* Counter of GC slices run in slack mode */
struct d_tm_node_t *vgm_tight_cnt; /* Counter of GC slices run in tight mode */
};

/*
* VOS Pool metrics for checkpoint activity.
*/
Expand All @@ -200,10 +212,14 @@ struct vos_chkpt_metrics {
};

void vos_chkpt_metrics_init(struct vos_chkpt_metrics *vc_metrics, const char *path, int tgt_id);
/**
 * Create the garbage collection telemetry nodes for one target.
 *
 * Registration failures are logged as warnings; nothing is returned.
 *
 * \param[out] vgm	GC metrics structure receiving the node pointers.
 * \param[in]  path	Leading component of each metric path.
 * \param[in]  tgt_id	Target id appended to each metric path as "tgt_<id>".
 */
void
vos_gc_metrics_init(struct vos_gc_metrics *vgm, const char *path, int tgt_id);

/*
 * VOS Pool metrics for space usage.
 *
 * The gauges are registered with "bytes" as their unit. The two "total"
 * gauges are set from the pool's pd_scm_sz / pd_nvme_sz while
 * vsm_last_update_ts is still 0, i.e. before the first timed update.
 */
struct vos_space_metrics {
struct d_tm_node_t *vsm_scm_used; /* SCM space used */
struct d_tm_node_t *vsm_nvme_used; /* NVMe space used */
struct d_tm_node_t *vsm_scm_total; /* SCM space total */
struct d_tm_node_t *vsm_nvme_total; /* NVMe space total */
uint64_t vsm_last_update_ts; /* Time of last update (0 = never); throttles updates to one per VOS_SPACE_METRICS_INTV */
};

Expand All @@ -219,6 +235,7 @@ struct vos_rh_metrics {
struct vos_pool_metrics {
void *vp_vea_metrics;
struct vos_agg_metrics vp_agg_metrics;
struct vos_gc_metrics vp_gc_metrics;
struct vos_space_metrics vp_space_metrics;
struct vos_chkpt_metrics vp_chkpt_metrics;
struct vos_rh_metrics vp_rh_metrics;
Expand Down Expand Up @@ -255,6 +272,8 @@ struct vos_pool {
/** btr handle for the container table */
daos_handle_t vp_cont_th;
/** GC statistics of this pool */
struct vos_gc_stat vp_gc_stat_global;
/** GC per slice statistics of this pool */
struct vos_gc_stat vp_gc_stat;
/** link chain on vos_tls::vtl_gc_pools */
d_list_t vp_gc_link;
Expand Down
4 changes: 2 additions & 2 deletions src/vos/vos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -1504,7 +1504,7 @@ vos_pool_query(daos_handle_t poh, vos_pool_info_t *pinfo)

D_ASSERT(pinfo != NULL);
pinfo->pif_cont_nr = pool_df->pd_cont_nr;
pinfo->pif_gc_stat = pool->vp_gc_stat;
pinfo->pif_gc_stat = pool->vp_gc_stat_global;

rc = vos_space_query(pool, &pinfo->pif_space, true);
if (rc)
Expand Down Expand Up @@ -1562,7 +1562,7 @@ vos_pool_ctl(daos_handle_t poh, enum vos_pool_opc opc, void *param)
default:
return -DER_NOSYS;
case VOS_PO_CTL_RESET_GC:
memset(&pool->vp_gc_stat, 0, sizeof(pool->vp_gc_stat));
memset(&pool->vp_gc_stat_global, 0, sizeof(pool->vp_gc_stat_global));
break;
case VOS_PO_CTL_SET_POLICY:
if (param == NULL)
Expand Down
6 changes: 6 additions & 0 deletions src/vos/vos_space.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,12 @@ vos_space_update_metrics(struct vos_pool *pool)
if (!vpm)
return;

if (vpm->vp_space_metrics.vsm_last_update_ts == 0) {
/* Set the constant values */
d_tm_set_gauge(vpm->vp_space_metrics.vsm_scm_total, pool->vp_pool_df->pd_scm_sz);
d_tm_set_gauge(vpm->vp_space_metrics.vsm_nvme_total, pool->vp_pool_df->pd_nvme_sz);
}

now = daos_gettime_coarse();
if (now < vpm->vp_space_metrics.vsm_last_update_ts + VOS_SPACE_METRICS_INTV) {
return;
Expand Down

0 comments on commit cc354d5

Please sign in to comment.