Skip to content

Commit

Permalink
Merge pull request collectd#4177 from eero-t/log-metrics-option
Browse files Browse the repository at this point in the history
[Collectd 6] gpu_sysman: add "LogMetrics" option
  • Loading branch information
eero-t authored Mar 14, 2024
2 parents a3a7b39 + ce8a26c commit 8ed19ba
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 28 deletions.
12 changes: 12 additions & 0 deletions src/collectd.conf.pod
Original file line number Diff line number Diff line change
Expand Up @@ -3777,6 +3777,18 @@ If enabled, plugin logs at start some information about plugin
settings, all the GPUs detected through Sysman API, and enables
"pci_dev" PCI device ID label for the metrics.

=item B<LogMetrics>

If enabled, all metric values are also printed to the collectd log
(standard output by default). This can be useful for local real-time
monitoring / debugging of specific GPU metric values, as one does not
need to enable any write plugins.

Output is most readable when only one metric type and one "MetricsOutput"
variant are enabled, and collectd (container) sees only a single GPU.
Alternatively, one could grep the output for the relevant GPU, metric
type and its output variant, but that adds delay to the output.

=item B<MetricsOutput>

Set of "base", "rate", and "ratio" strings, separated by comma, colon,
Expand Down
86 changes: 72 additions & 14 deletions src/gpu_sysman.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* collectd - src/gpu_sysman.c
*
* Copyright(c) 2020-2023 Intel Corporation. All rights reserved.
* Copyright(c) 2020-2024 Intel Corporation. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -40,6 +40,7 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <level_zero/ze_api.h>
#include <level_zero/zes_api.h>
Expand All @@ -55,6 +56,7 @@
#endif

#include "collectd.h"
/* comment avoiding local clang-format conflict with collectd CI one */
#include "plugin.h"
#include "utils/common/common.h"

Expand Down Expand Up @@ -141,6 +143,7 @@ static gpu_device_t *gpus;
static uint32_t gpu_count;
static struct {
bool gpuinfo;
bool logmetrics;
gpu_disable_t disabled;
output_t output;
uint32_t samples;
Expand All @@ -159,8 +162,9 @@ static struct {
#define KEY_DISABLE_TEMP "DisableTemperature"
#define KEY_DISABLE_THROTTLE "DisableThrottleTime"

#define KEY_METRICS_OUTPUT "MetricsOutput"
#define KEY_LOG_GPU_INFO "LogGpuInfo"
#define KEY_LOG_METRICS "LogMetrics"
#define KEY_METRICS_OUTPUT "MetricsOutput"
#define KEY_SAMPLES "Samples"
#define MAX_SAMPLES 64

Expand Down Expand Up @@ -767,19 +771,73 @@ static int gpu_init(void) {
return gpu_config_init(count);
}

/* Return the member of 'value' selected by metric 'type', converted to
 * double by the function return type.
 *
 * METRIC_TYPE_UNTYPED (and any type value not listed below) has no
 * matching member, so it triggers an assert. When asserts are compiled
 * out (NDEBUG), 0.0 is returned instead of running off the end of
 * a non-void function, which would be undefined behavior as callers
 * use the returned value.
 */
static double metric2double(metric_type_t type, value_t value) {
  /* intentionally no 'default' label, so that compiler can still warn
   * about metric types missing from the switch */
  switch (type) {
  case METRIC_TYPE_GAUGE:
    return value.gauge;
  case METRIC_TYPE_COUNTER:
    return value.counter;
  case METRIC_TYPE_COUNTER_FP:
    return value.counter_fp;
  case METRIC_TYPE_UP_DOWN:
    return value.up_down;
  case METRIC_TYPE_UP_DOWN_FP:
    return value.up_down_fp;
  case METRIC_TYPE_UNTYPED:
    break;
  }
  assert(0);
  /* reached only when asserts are disabled */
  return 0.0;
}

/* Add device labels to all metrics in given metric family and submit family to
* collectd. Resets metric family after dispatch */
* collectd, and log the metric if metric logging is enabled.
* Resets metric family after dispatch */
static void gpu_submit(gpu_device_t *gpu, metric_family_t *fam) {
metric_t *m = fam->metric.ptr;
struct timespec ts;
/* cdtime() is not monotonic */
clock_gettime(CLOCK_MONOTONIC, &ts);

const char *pci_bdf = gpu->pci_bdf;
/* logmetrics readability: skip common BDF address prefix */
if (strncmp(pci_bdf, "0000:", 5) == 0) {
pci_bdf += 5;
}

const char *name = fam->name;
/* logmetrics readability: skip common metric prefix */
if (strncmp(name, METRIC_PREFIX, strlen(METRIC_PREFIX)) == 0) {
name += strlen(METRIC_PREFIX);
}

for (size_t i = 0; i < fam->metric.num; i++) {
metric_label_set(m + i, "pci_bdf", gpu->pci_bdf);
metric_t *m = fam->metric.ptr + i;

/* log metric values in addition to dispatching them? */
if (config.logmetrics) {
const char *type = "<type>";
char *labels[] = {"direction", "location", "type"};
for (size_t i = 0; i < STATIC_ARRAY_SIZE(labels); i++) {
char const *l = metric_label_get(m, labels[i]);
if (l != NULL) {
type = l;
break;
}
}
INFO("[%7ld.%03ld] %s: %s / %s [%ld]: %.3f", ts.tv_sec,
ts.tv_nsec / 1000000, pci_bdf, name, type, i,
metric2double(fam->type, m->value));
}

/* add extra per-metric labels */
metric_label_set(m, "pci_bdf", gpu->pci_bdf);
if (gpu->dev_file) {
metric_label_set(m + i, "dev_file", gpu->dev_file);
metric_label_set(m, "dev_file", gpu->dev_file);
}
if (gpu->pci_dev) {
metric_label_set(m + i, "pci_dev", gpu->pci_dev);
metric_label_set(m, "pci_dev", gpu->pci_dev);
}
}

int status = plugin_dispatch_metric_family(fam);
if (status != 0) {
ERROR(PLUGIN_NAME ": gpu_submit(%s, %s) failed: %s", gpu->pci_bdf,
Expand Down Expand Up @@ -2518,6 +2576,8 @@ static int gpu_config_parse(const char *key, const char *value) {
config.disabled.throttle = IS_TRUE(value);
} else if (strcasecmp(key, KEY_LOG_GPU_INFO) == 0) {
config.gpuinfo = IS_TRUE(value);
} else if (strcasecmp(key, KEY_LOG_METRICS) == 0) {
config.logmetrics = IS_TRUE(value);
} else if (strcasecmp(key, KEY_METRICS_OUTPUT) == 0) {
config.output = 0;
static const char delim[] = ",:/ ";
Expand Down Expand Up @@ -2571,13 +2631,11 @@ static int gpu_config_parse(const char *key, const char *value) {
void module_register(void) {
/* NOTE: key strings *must* be static */
static const char *config_keys[] = {
KEY_DISABLE_ENGINE, KEY_DISABLE_ENGINE_SINGLE,
KEY_DISABLE_FABRIC, KEY_DISABLE_FREQ,
KEY_DISABLE_MEM, KEY_DISABLE_MEMBW,
KEY_DISABLE_POWER, KEY_DISABLE_RAS,
KEY_DISABLE_RAS_SEPARATE, KEY_DISABLE_TEMP,
KEY_DISABLE_THROTTLE, KEY_METRICS_OUTPUT,
KEY_LOG_GPU_INFO, KEY_SAMPLES};
KEY_DISABLE_ENGINE, KEY_DISABLE_ENGINE_SINGLE, KEY_DISABLE_FABRIC,
KEY_DISABLE_FREQ, KEY_DISABLE_MEM, KEY_DISABLE_MEMBW,
KEY_DISABLE_POWER, KEY_DISABLE_RAS, KEY_DISABLE_RAS_SEPARATE,
KEY_DISABLE_TEMP, KEY_DISABLE_THROTTLE, KEY_METRICS_OUTPUT,
KEY_LOG_GPU_INFO, KEY_LOG_METRICS, KEY_SAMPLES};
const int config_keys_num = STATIC_ARRAY_SIZE(config_keys);

plugin_register_config(PLUGIN_NAME, gpu_config_parse, config_keys,
Expand Down
17 changes: 3 additions & 14 deletions src/gpu_sysman_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -809,19 +809,6 @@ static void compose_name(char *buf, size_t bufsize, const char *name,
assert(len < bufsize);
}

/* Return the member of 'value' selected by metric 'type' as a double.
 *
 * Only counter and gauge types are handled; any other type triggers
 * an assert. When asserts are compiled out (NDEBUG), 0.0 is returned
 * instead of running off the end of a non-void function, which would
 * be undefined behavior as the caller uses the returned value.
 */
static double get_value(metric_type_t type, value_t value) {
  switch (type) {
  case METRIC_TYPE_COUNTER:
    return value.counter;
  case METRIC_TYPE_GAUGE:
    return value.gauge;
  default:
    assert(0);
  }
  /* reached only when asserts are disabled */
  return 0.0;
}

/* matches constructed metric names against validation array ones and
* updates the values accordingly
*/
Expand All @@ -833,7 +820,7 @@ int plugin_dispatch_metric_family(metric_family_t const *fam) {
metric_t *metric = fam->metric.ptr;

for (size_t m = 0; m < fam->metric.num; m++) {
double value = get_value(fam->type, metric[m].value);
double value = metric2double(fam->type, metric[m].value);
compose_name(name, sizeof(name), fam->name, &metric[m]);
if (globs.verbose & VERBOSE_METRICS) {
fprintf(stderr, "METRIC: %s: %g\n", name, value);
Expand Down Expand Up @@ -1425,6 +1412,7 @@ int main(int argc, const char **argv) {

assert(registry.config("DisableSeparateErrors", "false") == 0);
set_verbose(VERBOSE_CALLS_METRICS, VERBOSE_METRICS_NORMAL);
assert(registry.config("LogMetrics", "true") == 0);
assert(registry.init() == 0);

fprintf(stderr, "Query all metrics for the first time, with separate errors "
Expand All @@ -1436,6 +1424,7 @@ int main(int argc, const char **argv) {
assert(globs.warnings == 0);
/* per-time counters do not report on first round */
assert(validate_and_reset_saved_metrics(1, 0) > 0);
assert(registry.config("LogMetrics", "false") == 0);
fprintf(stderr, "metrics query round 1: PASS\n\n");

api_calls = globs.api_calls;
Expand Down

0 comments on commit 8ed19ba

Please sign in to comment.