diff --git a/src/client/api/metrics.c b/src/client/api/metrics.c index a4a78a6e25c..5cee0326280 100644 --- a/src/client/api/metrics.c +++ b/src/client/api/metrics.c @@ -27,9 +27,10 @@ bool daos_client_metric_retain; int dc_tm_init(void) { - int metrics_tag; - pid_t pid; - int rc; + struct d_tm_node_t *started_at; + int metrics_tag; + pid_t pid; + int rc; d_getenv_bool(DAOS_CLIENT_METRICS_ENABLE, &daos_client_metric); if (!daos_client_metric) @@ -64,10 +65,18 @@ dc_tm_init(void) rc = d_tm_add_ephemeral_dir(NULL, MAX_IDS_SIZE(INIT_JOB_NUM), "%s/%u", dc_jobid, pid); if (rc != 0) { - DL_ERROR(rc, "add metric %s/%u failed.\n", dc_jobid, pid); + DL_ERROR(rc, "add metric %s/%u failed.", dc_jobid, pid); D_GOTO(out, rc); } + rc = d_tm_add_metric(&started_at, D_TM_TIMESTAMP, "Timestamp of client startup", NULL, + "%s/%u/%s", dc_jobid, pid, "started_at"); + if (rc != 0) { + DL_ERROR(rc, "add metric %s/%u/started_at failed.", dc_jobid, pid); + D_GOTO(out, rc); + } + + d_tm_record_timestamp(started_at); out: if (rc) d_tm_fini(); diff --git a/src/common/SConscript b/src/common/SConscript index 05004e51509..0eec057198d 100644 --- a/src/common/SConscript +++ b/src/common/SConscript @@ -11,6 +11,7 @@ COMMON_FILES = ['debug.c', 'mem.c', 'fail_loc.c', 'lru.c', 'cipher.c', 'cipher_isal.c', 'qat.c', 'fault_domain.c', 'tls.c', 'metrics.c'] + def build_daos_common(denv, client): """ Building non-pmem version for client's common lib""" benv = denv.Clone() diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go index ad8fc40924d..6e62a725f1a 100644 --- a/src/control/cmd/daos_agent/config.go +++ b/src/control/cmd/daos_agent/config.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -54,6 +54,13 @@ type Config struct { DisableAutoEvict bool `yaml:"disable_auto_evict,omitempty"` ExcludeFabricIfaces common.StringSet `yaml:"exclude_fabric_ifaces,omitempty"` FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"` + TelemetryPort int `yaml:"telemetry_port,omitempty"` + TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"` +} + +// TelemetryEnabled returns true if client telemetry collection/export is enabled. +func (c *Config) TelemetryEnabled() bool { + return c.TelemetryPort > 0 } // NUMAFabricConfig defines a list of fabric interfaces that belong to a NUMA @@ -88,6 +95,10 @@ func LoadConfig(cfgPath string) (*Config, error) { return nil, fmt.Errorf("invalid system name: %s", cfg.SystemName) } + if cfg.TelemetryRetain > 0 && cfg.TelemetryPort == 0 { + return nil, errors.New("telemetry_retain requires telemetry_port") + } + return cfg, nil } diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index 0dbdf4fc645..7193cf6f151 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,6 +8,7 @@ package main import ( "context" + "fmt" "net" "strings" "sync" @@ -22,6 +23,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/hardware" "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" + "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" ) @@ -36,17 +38,20 @@ type fabricScanFn func(ctx context.Context, providers ...string) (*NUMAFabric, e // NewInfoCache creates a new InfoCache with appropriate parameters set. 
func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryInvoker, cfg *Config) *InfoCache { ic := &InfoCache{ - log: log, - ignoreIfaces: cfg.ExcludeFabricIfaces, - client: client, - cache: cache.NewItemCache(log), - getAttachInfo: control.GetAttachInfo, - fabricScan: getFabricScanFn(log, cfg, hwprov.DefaultFabricScanner(log)), - netIfaces: net.Interfaces, - devClassGetter: hwprov.DefaultNetDevClassProvider(log), - devStateGetter: hwprov.DefaultNetDevStateProvider(log), + log: log, + ignoreIfaces: cfg.ExcludeFabricIfaces, + client: client, + cache: cache.NewItemCache(log), + getAttachInfoCb: control.GetAttachInfo, + fabricScan: getFabricScanFn(log, cfg, hwprov.DefaultFabricScanner(log)), + netIfaces: net.Interfaces, + devClassGetter: hwprov.DefaultNetDevClassProvider(log), + devStateGetter: hwprov.DefaultNetDevStateProvider(log), } + ic.clientTelemetryEnabled.Store(cfg.TelemetryEnabled()) + ic.clientTelemetryRetain.Store(cfg.TelemetryRetain > 0) + if cfg.DisableCache { ic.DisableAttachInfoCache() ic.DisableFabricCache() @@ -198,12 +203,14 @@ type InfoCache struct { cache *cache.ItemCache fabricCacheDisabled atm.Bool attachInfoCacheDisabled atm.Bool + clientTelemetryEnabled atm.Bool + clientTelemetryRetain atm.Bool - getAttachInfo getAttachInfoFn - fabricScan fabricScanFn - netIfaces func() ([]net.Interface, error) - devClassGetter hardware.NetDevClassProvider - devStateGetter hardware.NetDevStateProvider + getAttachInfoCb getAttachInfoFn + fabricScan fabricScanFn + netIfaces func() ([]net.Interface, error) + devClassGetter hardware.NetDevClassProvider + devStateGetter hardware.NetDevStateProvider client control.UnaryInvoker attachInfoRefresh time.Duration @@ -292,6 +299,41 @@ func (c *InfoCache) EnableStaticFabricCache(ctx context.Context, nf *NUMAFabric) c.EnableFabricCache() } +func (c *InfoCache) getAttachInfo(ctx context.Context, rpcClient control.UnaryInvoker, req *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { + if c == 
nil { + return nil, errors.New("InfoCache is nil") + } + if c.getAttachInfoCb == nil { + return nil, errors.New("getAttachInfoFn is nil") + } + + resp, err := c.getAttachInfoCb(ctx, rpcClient, req) + if err != nil { + return nil, err + } + c.adjustAttachInfo(resp) + return resp, nil +} + +// adjustAttachInfo performs any necessary adjustments to the attach info +// before returning it. +func (c *InfoCache) adjustAttachInfo(resp *control.GetAttachInfoResp) { + if c == nil || resp == nil { + return + } + + if c.clientTelemetryEnabled.IsTrue() { + resp.ClientNetHint.EnvVars = append(resp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsEnabledEnv), + ) + if c.clientTelemetryRetain.IsTrue() { + resp.ClientNetHint.EnvVars = append(resp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsRetainEnv), + ) + } + } +} + // GetAttachInfo fetches the attach info from the cache, and refreshes if necessary. func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.GetAttachInfoResp, error) { if c == nil { @@ -308,7 +350,8 @@ func (c *InfoCache) GetAttachInfo(ctx context.Context, sys string) (*control.Get } createItem := func() (cache.Item, error) { c.log.Debugf("cache miss for %s", sysAttachInfoKey(sys)) - return newCachedAttachInfo(c.attachInfoRefresh, sys, c.client, c.getAttachInfo), nil + cai := newCachedAttachInfo(c.attachInfoRefresh, sys, c.client, c.getAttachInfo) + return cai, nil } item, release, err := c.cache.GetOrCreate(ctx, sysAttachInfoKey(sys), createItem) diff --git a/src/control/cmd/daos_agent/infocache_test.go b/src/control/cmd/daos_agent/infocache_test.go index 54571d006a7..e86c44bfc0c 100644 --- a/src/control/cmd/daos_agent/infocache_test.go +++ b/src/control/cmd/daos_agent/infocache_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,20 +8,23 @@ package main import ( "context" + "fmt" "net" "testing" "time" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/pkg/errors" + "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/cache" "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/pkg/errors" ) type testInfoCacheParams struct { @@ -32,6 +35,8 @@ type testInfoCacheParams struct { mockNetDevStateGetter hardware.NetDevStateProvider disableFabricCache bool disableAttachInfoCache bool + enableClientTelemetry bool + retainClientTelemetry bool ctlInvoker control.Invoker cachedItems []cache.Item } @@ -43,16 +48,19 @@ func newTestInfoCache(t *testing.T, log logging.Logger, params testInfoCachePara } ic := &InfoCache{ - log: log, - getAttachInfo: params.mockGetAttachInfo, - fabricScan: params.mockScanFabric, - devClassGetter: params.mockNetDevClassGetter, - devStateGetter: params.mockNetDevStateGetter, - netIfaces: params.mockNetIfaces, - client: params.ctlInvoker, - cache: c, + log: log, + getAttachInfoCb: params.mockGetAttachInfo, + fabricScan: params.mockScanFabric, + devClassGetter: params.mockNetDevClassGetter, + devStateGetter: params.mockNetDevStateGetter, + netIfaces: params.mockNetIfaces, + client: params.ctlInvoker, + cache: c, } + ic.clientTelemetryEnabled.Store(params.enableClientTelemetry) + ic.clientTelemetryRetain.Store(params.retainClientTelemetry) + if ic.netIfaces == nil { ic.netIfaces = func() 
([]net.Interface, error) { return []net.Interface{ @@ -714,6 +722,14 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { NetDevClass: uint32(hardware.Ether), }, } + telemEnabledResp := copyGetAttachInfoResp(ctlResp) + telemEnabledResp.ClientNetHint.EnvVars = append(telemEnabledResp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsEnabledEnv), + ) + telemRetainedResp := copyGetAttachInfoResp(telemEnabledResp) + telemRetainedResp.ClientNetHint.EnvVars = append(telemRetainedResp.ClientNetHint.EnvVars, + fmt.Sprintf("%s=1", telemetry.ClientMetricsRetainEnv), + ) for name, tc := range map[string]struct { getInfoCache func(logging.Logger) *InfoCache @@ -734,7 +750,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { disableAttachInfoCache: true, }) }, - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expRemote: true, }, @@ -748,11 +764,45 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { expErr: errors.New("mock remote"), expRemote: true, }, + "cache disabled; client telemetry enabled": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + disableAttachInfoCache: true, + enableClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemEnabledResp, + expRemote: true, + }, + "cache enabled; client telemetry enabled": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + enableClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemEnabledResp, + expRemote: true, + expCached: true, + }, + "cache enabled; client telemetry enabled; client telemetry retained": { + getInfoCache: func(l logging.Logger) *InfoCache { + return newTestInfoCache(t, l, testInfoCacheParams{ + enableClientTelemetry: true, + retainClientTelemetry: true, + }) + }, + remoteResp: copyGetAttachInfoResp(ctlResp), + expResp: telemRetainedResp, + 
expRemote: true, + expCached: true, + }, "enabled but empty": { getInfoCache: func(l logging.Logger) *InfoCache { return newTestInfoCache(t, l, testInfoCacheParams{}) }, - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expRemote: true, expCached: true, @@ -772,7 +822,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { fetch: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { return nil, errors.New("shouldn't call cached remote") }, - lastResponse: ctlResp, + lastResponse: copyGetAttachInfoResp(ctlResp), cacheItem: cacheItem{lastCached: time.Now()}, system: "test", }) @@ -790,7 +840,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { fetch: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { return nil, errors.New("shouldn't call cached remote") }, - lastResponse: ctlResp, + lastResponse: copyGetAttachInfoResp(ctlResp), cacheItem: cacheItem{lastCached: time.Now()}, system: build.DefaultSystemName, }) @@ -814,7 +864,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { return ic }, system: "somethingelse", - remoteResp: ctlResp, + remoteResp: copyGetAttachInfoResp(ctlResp), expResp: ctlResp, expCached: true, expRemote: true, @@ -831,7 +881,7 @@ func TestAgent_InfoCache_GetAttachInfo(t *testing.T) { calledRemote := false if ic != nil { - ic.getAttachInfo = func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { + ic.getAttachInfoCb = func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { calledRemote = true return tc.remoteResp, tc.remoteErr } diff --git a/src/control/cmd/daos_agent/start.go b/src/control/cmd/daos_agent/start.go index cb5505234d5..0e8e6ca2016 100644 --- a/src/control/cmd/daos_agent/start.go +++ b/src/control/cmd/daos_agent/start.go @@ -1,5 +1,5 @@ // 
-// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -127,6 +127,16 @@ func (cmd *startCmd) Execute(_ []string) error { } cmd.Debugf("dRPC socket server started: %s", time.Since(drpcSrvStart)) + if cmd.cfg.TelemetryEnabled() { + telemetryStart := time.Now() + shutdown, err := startPrometheusExporter(ctx, cmd, cmd.cfg) + if err != nil { + return errors.Wrap(err, "unable to start prometheus exporter") + } + defer shutdown() + cmd.Debugf("telemetry exporter started: %s", time.Since(telemetryStart)) + } + cmd.Debugf("startup complete in %s", time.Since(startedAt)) cmd.Infof("%s (pid %d) listening on %s", versionString(), os.Getpid(), sockPath) if err := systemd.Ready(); err != nil && err != systemd.ErrSdNotifyNoSocket { diff --git a/src/control/cmd/daos_agent/telemetry.go b/src/control/cmd/daos_agent/telemetry.go new file mode 100644 index 00000000000..5bce4c7e64c --- /dev/null +++ b/src/control/cmd/daos_agent/telemetry.go @@ -0,0 +1,36 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package main + +import ( + "context" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/telemetry/promexp" + "github.com/daos-stack/daos/src/control/logging" +) + +func startPrometheusExporter(ctx context.Context, log logging.Logger, cfg *Config) (func(), error) { + expCfg := &promexp.ExporterConfig{ + Port: cfg.TelemetryPort, + Title: "DAOS Client Telemetry", + Register: func(ctx context.Context, log logging.Logger) error { + c, err := promexp.NewClientCollector(ctx, log, &promexp.CollectorOpts{ + RetainDuration: cfg.TelemetryRetain, + }) + if err != nil { + return err + } + prometheus.MustRegister(c) + + return nil + }, + } + + return promexp.StartExporter(ctx, log, expCfg) +} diff --git a/src/control/common/test/utils.go b/src/control/common/test/utils.go index 81c17facecd..f7cc72ef72a 100644 --- a/src/control/common/test/utils.go +++ b/src/control/common/test/utils.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2022 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -26,6 +26,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "golang.org/x/sys/unix" "google.golang.org/protobuf/testing/protocmp" + + "github.com/daos-stack/daos/src/control/logging" ) // AssertTrue asserts b is true @@ -412,3 +414,14 @@ func Context(t *testing.T) context.Context { t.Cleanup(cancel) return ctx } + +// MustLogContext returns a context containing the supplied logger. +// Canceled when the test is done. 
+func MustLogContext(t *testing.T, log logging.Logger) context.Context { + t.Helper() + ctx, err := logging.ToContext(Context(t), log) + if err != nil { + t.Fatal(err) + } + return ctx +} diff --git a/src/control/lib/telemetry/promexp/client.go b/src/control/lib/telemetry/promexp/client.go new file mode 100644 index 00000000000..273fea53037 --- /dev/null +++ b/src/control/lib/telemetry/promexp/client.go @@ -0,0 +1,168 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "regexp" + "strconv" + "strings" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + // ClientCollector is a metrics collector for DAOS client metrics. + ClientCollector struct { + metricsCollector + } +) + +func extractClientLabels(log logging.Logger, in string) (labels labelMap, name string) { + log.Tracef("in: %q", in) + + labels = make(labelMap) + compsIdx := 0 + comps := strings.Split(in, string(telemetry.PathSep)) + if len(comps) == 0 { + return labels, "" + } + + if strings.HasPrefix(comps[compsIdx], "ID") { + if len(comps) == 1 { + return labels, "" + } + compsIdx++ + } + + for i, label := range []string{"job", "pid", "tid"} { + if i > 0 { + // After jobid, we should have a pid and/or tid, and + // then move on to the engine labels. + _, err := strconv.Atoi(comps[compsIdx]) + if err != nil { + break + } + } + + if len(comps) == compsIdx+1 { + // If we have a weird path ending on a pid or tid, treat it + // as empty of labels. 
+ if _, err := strconv.Atoi(comps[compsIdx]); err == nil && i > 0 { + return labelMap{}, "" + } + return labels, comps[compsIdx] + } + labels[label] = comps[compsIdx] + compsIdx++ + } + + var engLabels labelMap + engLabels, name = extractEngineLabels(log, strings.Join(comps[compsIdx:], string(telemetry.PathSep))) + for k, v := range engLabels { + labels[k] = v + } + + return +} + +func newClientMetric(log logging.Logger, m telemetry.Metric) *sourceMetric { + labels, name := extractClientLabels(log, m.FullPath()) + baseName := "client_" + name + + return newSourceMetric(log, m, baseName, labels) +} + +// newClientSource creates a new MetricSource for client metrics. +func newClientSource(parent context.Context) (*MetricSource, func(), error) { + ctx, err := telemetry.InitClientRoot(parent) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to init telemetry") + } + + cleanupFn := func() { + telemetry.Detach(ctx) + } + + return &MetricSource{ + ctx: ctx, + enabled: atm.NewBool(true), + tmSchema: telemetry.NewSchema(), + smSchema: newSourceMetricSchema(newClientMetric), + }, cleanupFn, nil +} + +// NewClientCollector creates a new ClientCollector instance. 
+func NewClientCollector(ctx context.Context, log logging.Logger, opts *CollectorOpts) (*ClientCollector, error) { + source, cleanup, err := newClientSource(ctx) + if err != nil { + return nil, errors.Wrap(err, "failed to create client source") + } + + if opts == nil { + opts = defaultCollectorOpts() + } + + go func() { + <-ctx.Done() + cleanup() + }() + + if opts.RetainDuration != 0 { + log.Debugf("pruning client metrics every %s", opts.RetainDuration) + + go func() { + pruneTicker := time.NewTicker(opts.RetainDuration) + defer pruneTicker.Stop() + + for { + select { + case <-ctx.Done(): return + case <-pruneTicker.C: + source.PruneSegments(log, opts.RetainDuration) + } + } + }() + } + + c := &ClientCollector{ + metricsCollector: metricsCollector{ + log: log, + summary: prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: "client", + Subsystem: "exporter", + Name: "scrape_duration_seconds", + Help: "daos_client_exporter: Duration of a scrape job.", + }, + []string{"source", "result"}, + ), + collectFn: func(ch chan *sourceMetric) { + source.Collect(log, ch) + }, + }, + } + + for _, pat := range opts.Ignores { + re, err := regexp.Compile(pat) + if err != nil { + return nil, errors.Wrapf(err, "failed to compile %q", pat) + } + c.ignoredMetrics = append(c.ignoredMetrics, re) + } + + return c, nil +} diff --git a/src/control/lib/telemetry/promexp/client_test.go b/src/control/lib/telemetry/promexp/client_test.go new file mode 100644 index 00000000000..ea60ceba0d3 --- /dev/null +++ b/src/control/lib/telemetry/promexp/client_test.go @@ -0,0 +1,158 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package promexp + +import ( + "fmt" + "regexp" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestPromExp_extractClientLabels(t *testing.T) { + shmID := 256 + jobID := "testJob" + pid := "12345" + tid := "67890" + + testPath := func(suffix string) string { + return fmt.Sprintf("ID: %d/%s/%s/%s/%s", shmID, jobID, pid, tid, suffix) + } + + for name, tc := range map[string]struct { + input string + expName string + expLabels labelMap + }{ + "empty": { + expLabels: labelMap{}, + }, + "ID stripped": { + input: "ID: 123", + expLabels: labelMap{}, + }, + "weird truncation": { + input: "ID: 123/jobbo/6783/90", + expLabels: labelMap{}, + }, + "active update ops": { + input: testPath("io/ops/update/active"), + expName: "io_ops_update_active", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "tid": tid, + }, + }, + "fetch latency 1MB": { + input: testPath("io/latency/fetch/1MB"), + expName: "io_latency_fetch", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "tid": tid, + "size": "1MB", + }, + }, + "started_at": { + input: fmt.Sprintf("ID: %d/%s/%s/started_at", shmID, jobID, pid), + expName: "started_at", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + }, + }, + "pool ops": { + input: fmt.Sprintf("ID: %d/%s/%s/pool/%s/ops/foo", shmID, jobID, pid, test.MockPoolUUID(1)), + expName: "pool_ops_foo", + expLabels: labelMap{ + "job": jobID, + "pid": pid, + "pool": test.MockPoolUUID(1).String(), + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + labels, name := extractClientLabels(log, tc.input) + + test.AssertEqual(t, name, tc.expName, "") + if diff := 
cmp.Diff(labels, tc.expLabels); diff != "" { + t.Errorf("labels mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestPromExp_NewClientCollector(t *testing.T) { + for name, tc := range map[string]struct { + opts *CollectorOpts + expErr error + expResult *ClientCollector + }{ + "defaults": { + expResult: &ClientCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + }, + }, + }, + "opts with ignores": { + opts: &CollectorOpts{Ignores: []string{"one", "two"}}, + expResult: &ClientCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + ignoredMetrics: []*regexp.Regexp{ + regexp.MustCompile("one"), + regexp.MustCompile("two"), + }, + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + result, err := NewClientCollector(test.Context(t), log, tc.opts) + + test.CmpErr(t, tc.expErr, err) + + cmpOpts := []cmp.Option{ + cmpopts.IgnoreUnexported(MetricSource{}), + cmpopts.IgnoreUnexported(prometheus.SummaryVec{}), + cmpopts.IgnoreUnexported(prometheus.MetricVec{}), + cmpopts.IgnoreUnexported(regexp.Regexp{}), + cmp.AllowUnexported(ClientCollector{}), + cmp.AllowUnexported(metricsCollector{}), + cmp.FilterPath(func(p cmp.Path) bool { + // Ignore a few specific fields + return (strings.HasSuffix(p.String(), "log") || + strings.HasSuffix(p.String(), "sourceMutex") || + strings.HasSuffix(p.String(), "cleanupSource") || + strings.HasSuffix(p.String(), "collectFn")) + }, cmp.Ignore()), + } + if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff != "" { + t.Fatalf("(-want, +got)\n%s", diff) + } + }) + } +} diff --git a/src/control/lib/telemetry/promexp/collector.go b/src/control/lib/telemetry/promexp/collector.go index d57f1cf1687..33dfd925dfd 100644 --- a/src/control/lib/telemetry/promexp/collector.go +++ 
b/src/control/lib/telemetry/promexp/collector.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,441 +7,34 @@ // +build linux // +build amd64 arm64 -// - package promexp import ( - "context" - "fmt" "regexp" - "strings" - "sync" - "unicode" + "time" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/daos-stack/daos/src/control/lib/atm" "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" ) type ( - Collector struct { - log logging.Logger - summary *prometheus.SummaryVec - ignoredMetrics []*regexp.Regexp - sources []*EngineSource - cleanupSource map[uint32]func() - sourceMutex sync.RWMutex // To protect sources - } - + // CollectorOpts contains options for the metrics collector. CollectorOpts struct { - Ignores []string - } - - EngineSource struct { - ctx context.Context - tmMutex sync.RWMutex // To protect telemetry collection - Index uint32 - Rank uint32 - enabled atm.Bool - tmSchema *telemetry.Schema - rmSchema rankMetricSchema + Ignores []string + RetainDuration time.Duration } - rankMetricSchema struct { - mu sync.Mutex - rankMetrics map[string]*rankMetric - seen map[string]struct{} + metricsCollector struct { + log logging.Logger + summary *prometheus.SummaryVec + ignoredMetrics []*regexp.Regexp + collectFn func(ch chan *sourceMetric) } ) -func (s *rankMetricSchema) Prune() { - s.mu.Lock() - defer s.mu.Unlock() - - for id := range s.rankMetrics { - if _, found := s.seen[id]; !found { - delete(s.rankMetrics, id) - } - } - s.seen = make(map[string]struct{}) -} - -func (s *rankMetricSchema) add(log logging.Logger, rank uint32, metric telemetry.Metric) (rm *rankMetric) { - s.mu.Lock() - defer s.mu.Unlock() - - id := metric.FullPath() - s.seen[id] = struct{}{} - - var found bool - if rm, found = s.rankMetrics[id]; !found 
{ - rm = newRankMetric(log, rank, metric) - s.rankMetrics[id] = rm - } else { - rm.resetVecs() - } - - return -} - -func NewEngineSource(parent context.Context, idx uint32, rank uint32) (*EngineSource, func(), error) { - ctx, err := telemetry.Init(parent, idx) - if err != nil { - return nil, nil, errors.Wrap(err, "failed to init telemetry") - } - - cleanupFn := func() { - telemetry.Detach(ctx) - } - - return &EngineSource{ - ctx: ctx, - Index: idx, - Rank: rank, - enabled: atm.NewBool(true), - tmSchema: telemetry.NewSchema(), - rmSchema: rankMetricSchema{ - rankMetrics: make(map[string]*rankMetric), - seen: make(map[string]struct{}), - }, - }, cleanupFn, nil -} - -func defaultCollectorOpts() *CollectorOpts { - return &CollectorOpts{} -} - -func NewCollector(log logging.Logger, opts *CollectorOpts, sources ...*EngineSource) (*Collector, error) { - if opts == nil { - opts = defaultCollectorOpts() - } - - c := &Collector{ - log: log, - sources: sources, - cleanupSource: make(map[uint32]func()), - summary: prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: "engine", - Subsystem: "exporter", - Name: "scrape_duration_seconds", - Help: "daos_exporter: Duration of a scrape job.", - }, - []string{"source", "result"}, - ), - } - - for _, pat := range opts.Ignores { - re, err := regexp.Compile(pat) - if err != nil { - return nil, errors.Wrapf(err, "failed to compile %q", pat) - } - c.ignoredMetrics = append(c.ignoredMetrics, re) - } - - return c, nil -} - -type labelMap map[string]string - -func (lm labelMap) keys() (keys []string) { - for label := range lm { - keys = append(keys, label) - } - - return -} - -func sanitizeMetricName(in string) string { - return strings.Map(func(r rune) rune { - switch { - // Valid names for Prometheus are limited to: - case r >= 'a' && r <= 'z': // lowercase letters - case r >= 'A' && r <= 'Z': // uppercase letters - case unicode.IsDigit(r): // digits - default: // sanitize any other character - return '_' - } - - return r - }, 
strings.TrimLeft(in, "/")) -} - -func matchLabel(labels labelMap, input, match, label string) bool { - if !strings.HasPrefix(input, match) { - return false - } - - splitStr := strings.SplitN(input, "_", 2) - if len(splitStr) == 2 { - labels[label] = splitStr[1] - return true - } - return false -} - -func appendName(cur, name string) string { - if cur == "" { - return name - } - return cur + "_" + name -} - -// extractLabels takes a "/"-separated DAOS metric name in order to -// create a normalized Prometheus name and label map. -// -// NB: Prometheus metric names should follow best practices as -// outlined at https://prometheus.io/docs/practices/naming/ -// -// In particular, a metric name should describe the measurement, -// not the entity the measurement is about. In other words, if 4 -// different entities share the same measurement, then there should -// be a single metric with a label that distinguishes between -// individual measurement values. -// -// Good: pool_started_at {pool="00000000-1111-2222-3333-4444444444"} -// Bad: pool_00000000_1111_2222_3333_4444444444_started_at -func extractLabels(in string) (labels labelMap, name string) { - labels = make(labelMap) - compsIdx := 0 - comps := strings.Split(in, string(telemetry.PathSep)) - if len(comps) == 0 { - return labels, in - } - - if strings.HasPrefix(comps[compsIdx], "ID") { - if len(comps) == 1 { - return labels, "" - } - compsIdx++ - } - - switch comps[compsIdx] { - case "pool": - name = "pool" - compsIdx++ - labels["pool"] = comps[compsIdx] - compsIdx++ - switch comps[compsIdx] { - case "ops": - compsIdx++ - name += "_ops_" + comps[compsIdx] - compsIdx++ - } - case "io": - name = "io" - compsIdx++ - switch comps[compsIdx] { - case "latency": - compsIdx++ - name += "_latency_" + comps[compsIdx] - compsIdx++ - labels["size"] = comps[compsIdx] - compsIdx++ - case "ops": - compsIdx++ - name += "_ops_" + comps[compsIdx] - compsIdx++ - default: - name += "_" + comps[compsIdx] - compsIdx++ - } - case "net": 
- compsIdx++ - if comps[compsIdx] == "uri" { - compsIdx++ - name = "net_uri_" + comps[compsIdx] - compsIdx++ - break - } - - name = "net" - labels["provider"] = comps[compsIdx] - compsIdx++ - case "nvme": - name = "nvme" - compsIdx++ - labels["device"] = comps[compsIdx] - compsIdx++ - } - - for { - if len(comps) == compsIdx { - break - } - - switch { - case matchLabel(labels, comps[compsIdx], "tgt_", "target"): - compsIdx++ - case matchLabel(labels, comps[compsIdx], "xs_", "xstream"): - compsIdx++ - case matchLabel(labels, comps[compsIdx], "ctx_", "context"): - compsIdx++ - default: - name = appendName(name, comps[compsIdx]) - compsIdx++ - } - } - - name = sanitizeMetricName(name) - return -} - -func (es *EngineSource) Collect(log logging.Logger, ch chan<- *rankMetric) { - if es == nil { - log.Error("nil engine source") - return - } - if !es.IsEnabled() { - return - } - if ch == nil { - log.Error("nil channel") - return - } - - es.tmMutex.RLock() - defer es.tmMutex.RUnlock() - - metrics := make(chan telemetry.Metric) - go func() { - if err := telemetry.CollectMetrics(es.ctx, es.tmSchema, metrics); err != nil { - log.Errorf("failed to collect metrics for engine rank %d: %s", es.Rank, err) - return - } - es.tmSchema.Prune() - }() - - for metric := range metrics { - ch <- es.rmSchema.add(log, es.Rank, metric) - } - es.rmSchema.Prune() -} - -// IsEnabled checks if the engine source is enabled. -func (es *EngineSource) IsEnabled() bool { - return es.enabled.IsTrue() -} - -// Enable enables the engine source. -func (es *EngineSource) Enable() { - es.enabled.SetTrue() -} - -// Disable disables the engine source. 
-func (es *EngineSource) Disable() { - es.enabled.SetFalse() -} - -type gvMap map[string]*prometheus.GaugeVec - -func (m gvMap) add(name, help string, labels labelMap) { - if _, found := m[name]; !found { - gv := prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: name, - Help: help, - }, labels.keys()) - m[name] = gv - } -} - -func (m gvMap) set(name string, value float64, labels labelMap) error { - gv, found := m[name] - if !found { - return errors.Errorf("gauge vector %s not found", name) - } - gv.With(prometheus.Labels(labels)).Set(value) - - return nil -} - -type cvMap map[string]*prometheus.CounterVec - -func (m cvMap) add(name, help string, labels labelMap) { - if _, found := m[name]; !found { - cv := prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: name, - Help: help, - }, labels.keys()) - m[name] = cv - } -} - -func (m cvMap) set(name string, value float64, labels labelMap) error { - cv, found := m[name] - if !found { - return errors.Errorf("counter vector %s not found", name) - } - cv.With(prometheus.Labels(labels)).Add(value) - - return nil -} - -type rankMetric struct { - rank uint32 - metric telemetry.Metric - baseName string - labels labelMap - gvm gvMap - cvm cvMap -} - -func (rm *rankMetric) collect(ch chan<- prometheus.Metric) { - for _, gv := range rm.gvm { - gv.Collect(ch) - } - for _, cv := range rm.cvm { - cv.Collect(ch) - } -} - -func (rm *rankMetric) resetVecs() { - for _, gv := range rm.gvm { - gv.Reset() - } - for _, cv := range rm.cvm { - cv.Reset() - } -} - -func newRankMetric(log logging.Logger, rank uint32, m telemetry.Metric) *rankMetric { - rm := &rankMetric{ - metric: m, - rank: rank, - gvm: make(gvMap), - cvm: make(cvMap), - } - - var name string - rm.labels, name = extractLabels(m.FullPath()) - rm.labels["rank"] = fmt.Sprintf("%d", rm.rank) - rm.baseName = "engine_" + name - - desc := m.Desc() - - switch rm.metric.Type() { - case telemetry.MetricTypeGauge, telemetry.MetricTypeTimestamp, - telemetry.MetricTypeSnapshot: - 
rm.gvm.add(rm.baseName, desc, rm.labels) - case telemetry.MetricTypeStatsGauge, telemetry.MetricTypeDuration: - rm.gvm.add(rm.baseName, desc, rm.labels) - for _, ms := range getMetricStats(rm.baseName, rm.metric) { - rm.gvm.add(ms.name, ms.desc, rm.labels) - } - case telemetry.MetricTypeCounter: - rm.cvm.add(rm.baseName, desc, rm.labels) - default: - log.Errorf("[%s]: metric type %d not supported", name, rm.metric.Type()) - } - - return rm -} - -func (c *Collector) isIgnored(name string) bool { +func (c *metricsCollector) isIgnored(name string) bool { for _, re := range c.ignoredMetrics { // TODO: We may want to look into removing the use of regexp here // in favor of a less-flexible but more efficient approach. @@ -454,105 +47,7 @@ func (c *Collector) isIgnored(name string) bool { return false } -type metricStat struct { - name string - desc string - value float64 -} - -func getMetricStats(baseName string, m telemetry.Metric) (stats []*metricStat) { - ms, ok := m.(telemetry.StatsMetric) - if !ok { - return - } - - for name, s := range map[string]struct { - fn func() float64 - desc string - }{ - "min": { - fn: ms.FloatMin, - desc: " (min value)", - }, - "max": { - fn: ms.FloatMax, - desc: " (max value)", - }, - "mean": { - fn: ms.Mean, - desc: " (mean)", - }, - "stddev": { - fn: ms.StdDev, - desc: " (std dev)", - }, - } { - stats = append(stats, &metricStat{ - name: baseName + "_" + name, - desc: m.Desc() + s.desc, - value: s.fn(), - }) - } - - return -} - -// AddSource adds an EngineSource to the Collector. -func (c *Collector) AddSource(es *EngineSource, cleanup func()) { - if es == nil { - c.log.Error("attempted to add nil EngineSource") - return - } - - c.sourceMutex.Lock() - defer c.sourceMutex.Unlock() - - // If we attempt to add a duplicate, remove the old one. 
- c.removeSourceNoLock(es.Index) - - c.sources = append(c.sources, es) - if cleanup != nil { - c.cleanupSource[es.Index] = cleanup - } -} - -// RemoveSource removes an EngineSource with a given index from the Collector. -func (c *Collector) RemoveSource(engineIdx uint32) { - c.sourceMutex.Lock() - defer c.sourceMutex.Unlock() - - c.removeSourceNoLock(engineIdx) -} - -func (c *Collector) removeSourceNoLock(engineIdx uint32) { - for i, es := range c.sources { - if es.Index == engineIdx { - es.Disable() - c.sources = append(c.sources[:i], c.sources[i+1:]...) - - // Ensure that EngineSource isn't collecting during cleanup - es.tmMutex.Lock() - if cleanup, found := c.cleanupSource[engineIdx]; found && cleanup != nil { - cleanup() - } - es.tmMutex.Unlock() - delete(c.cleanupSource, engineIdx) - break - } - } -} - -func (c *Collector) getSources() []*EngineSource { - c.sourceMutex.RLock() - defer c.sourceMutex.RUnlock() - - sourceCopy := make([]*EngineSource, len(c.sources)) - _ = copy(sourceCopy, c.sources) - return sourceCopy -} - -// Collect collects metrics from all EngineSources. 
-func (c *Collector) Collect(ch chan<- prometheus.Metric) { +func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) { if c == nil { return } @@ -560,49 +55,51 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { c.log.Error("passed a nil channel") return } + if c.collectFn == nil { + c.log.Error("collectFn is nil") + return + } - rankMetrics := make(chan *rankMetric) - go func(sources []*EngineSource) { - for _, source := range sources { - source.Collect(c.log, rankMetrics) - } - close(rankMetrics) - }(c.getSources()) + sourceMetrics := make(chan *sourceMetric) + go func() { + c.collectFn(sourceMetrics) + close(sourceMetrics) + }() - for rm := range rankMetrics { - if c.isIgnored(rm.baseName) { + for sm := range sourceMetrics { + if c.isIgnored(sm.baseName) { continue } var err error - switch rm.metric.Type() { + switch sm.metric.Type() { case telemetry.MetricTypeGauge, telemetry.MetricTypeTimestamp, telemetry.MetricTypeSnapshot: - err = rm.gvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels) + err = sm.gvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels) case telemetry.MetricTypeStatsGauge, telemetry.MetricTypeDuration: - if err = rm.gvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels); err != nil { + if err = sm.gvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels); err != nil { break } - for _, ms := range getMetricStats(rm.baseName, rm.metric) { - if err = rm.gvm.set(ms.name, ms.value, rm.labels); err != nil { + for _, ms := range getMetricStats(sm.baseName, sm.metric) { + if err = sm.gvm.set(ms.name, ms.value, sm.labels); err != nil { break } } case telemetry.MetricTypeCounter: - err = rm.cvm.set(rm.baseName, rm.metric.FloatValue(), rm.labels) + err = sm.cvm.set(sm.baseName, sm.metric.FloatValue(), sm.labels) default: - c.log.Errorf("[%s]: metric type %d not supported", rm.baseName, rm.metric.Type()) + c.log.Errorf("[%s]: metric type %d not supported", sm.baseName, sm.metric.Type()) } if err != nil { - c.log.Errorf("[%s]: %s", 
rm.baseName, err) + c.log.Errorf("[%s]: %s", sm.baseName, err) continue } - rm.collect(ch) + sm.collect(ch) } } -func (c *Collector) Describe(ch chan<- *prometheus.Desc) { +func (c *metricsCollector) Describe(ch chan<- *prometheus.Desc) { c.summary.Describe(ch) } diff --git a/src/control/lib/telemetry/promexp/engine.go b/src/control/lib/telemetry/promexp/engine.go new file mode 100644 index 00000000000..d97226e7b56 --- /dev/null +++ b/src/control/lib/telemetry/promexp/engine.go @@ -0,0 +1,271 @@ +// +// (C) Copyright 2021-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "fmt" + "regexp" + "strings" + "sync" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + + "github.com/daos-stack/daos/src/control/lib/atm" + "github.com/daos-stack/daos/src/control/lib/telemetry" + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + // EngineCollector collects metrics from DAOS Engine sources. + EngineCollector struct { + metricsCollector + sources []*EngineSource + cleanupSource map[uint32]func() + sourceMutex sync.RWMutex // To protect sources + } + + // EngineSource provides metrics for a single DAOS Engine. + EngineSource struct { + MetricSource + Index uint32 + Rank uint32 + } +) + +// NewEngineSource initializes a new metrics source for a DAOS Engine. 
+func NewEngineSource(parent context.Context, idx uint32, rank uint32) (*EngineSource, func(), error) { + ctx, err := telemetry.Init(parent, idx) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to init telemetry") + } + + cleanupFn := func() { + telemetry.Detach(ctx) + } + + return &EngineSource{ + MetricSource: MetricSource{ + ctx: ctx, + enabled: atm.NewBool(true), + tmSchema: telemetry.NewSchema(), + smSchema: newSourceMetricSchema(func(l logging.Logger, m telemetry.Metric) *sourceMetric { + return newRankMetric(l, rank, m) + }), + }, + Index: idx, + Rank: rank, + }, cleanupFn, nil +} + +// NewEngineCollector initializes a new collector for DAOS Engine sources. +func NewEngineCollector(log logging.Logger, opts *CollectorOpts, sources ...*EngineSource) (*EngineCollector, error) { + if opts == nil { + opts = defaultCollectorOpts() + } + + c := &EngineCollector{ + metricsCollector: metricsCollector{ + log: log, + summary: prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: "engine", + Subsystem: "exporter", + Name: "scrape_duration_seconds", + Help: "daos_exporter: Duration of a scrape job.", + }, + []string{"source", "result"}, + ), + }, + sources: sources, + cleanupSource: make(map[uint32]func()), + } + + c.collectFn = func(metrics chan *sourceMetric) { + for _, source := range c.getSources() { + source.Collect(c.log, metrics) + } + } + + for _, pat := range opts.Ignores { + re, err := regexp.Compile(pat) + if err != nil { + return nil, errors.Wrapf(err, "failed to compile %q", pat) + } + c.ignoredMetrics = append(c.ignoredMetrics, re) + } + + return c, nil +} + +// extractEngineLabels takes a "/"-separated DAOS metric name in order to +// create a normalized Prometheus name and label map. +// +// NB: Prometheus metric names should follow best practices as +// outlined at https://prometheus.io/docs/practices/naming/ +// +// In particular, a metric name should describe the measurement, +// not the entity the measurement is about. 
In other words, if 4 +// different entities share the same measurement, then there should +// be a single metric with a label that distinguishes between +// individual measurement values. +// +// Good: pool_started_at {pool="00000000-1111-2222-3333-4444444444"} +// Bad: pool_00000000_1111_2222_3333_4444444444_started_at +func extractEngineLabels(log logging.Logger, in string) (labels labelMap, name string) { + log.Tracef("in: %q", in) + + labels = make(labelMap) + compsIdx := 0 + comps := strings.Split(in, string(telemetry.PathSep)) + if len(comps) == 0 { + return labels, "" + } + + if strings.HasPrefix(comps[compsIdx], "ID") { + if len(comps) == 1 { + return labels, "" + } + compsIdx++ + } + + switch comps[compsIdx] { + case "pool": + name = "pool" + compsIdx++ + labels["pool"] = comps[compsIdx] + compsIdx++ + switch comps[compsIdx] { + case "ops": + compsIdx++ + name += "_ops_" + comps[compsIdx] + compsIdx++ + } + case "io": + name = "io" + compsIdx++ + switch comps[compsIdx] { + case "latency": + compsIdx++ + name += "_latency_" + comps[compsIdx] + compsIdx++ + labels["size"] = comps[compsIdx] + compsIdx++ + case "ops": + compsIdx++ + name += "_ops_" + comps[compsIdx] + compsIdx++ + default: + name += "_" + comps[compsIdx] + compsIdx++ + } + case "net": + compsIdx++ + if comps[compsIdx] == "uri" { + compsIdx++ + name = "net_uri_" + comps[compsIdx] + compsIdx++ + break + } + + name = "net" + labels["provider"] = comps[compsIdx] + compsIdx++ + case "nvme": + name = "nvme" + compsIdx++ + labels["device"] = comps[compsIdx] + compsIdx++ + } + + for { + if len(comps) == compsIdx { + break + } + + switch { + case matchLabel(labels, comps[compsIdx], "tgt_", "target"): + compsIdx++ + case matchLabel(labels, comps[compsIdx], "xs_", "xstream"): + compsIdx++ + case matchLabel(labels, comps[compsIdx], "ctx_", "context"): + compsIdx++ + default: + name = appendName(name, comps[compsIdx]) + compsIdx++ + } + } + + name = sanitizeMetricName(name) + return +} + +func 
newRankMetric(log logging.Logger, rank uint32, m telemetry.Metric) *sourceMetric { + labels, name := extractEngineLabels(log, m.FullPath()) + baseName := "engine_" + name + labels["rank"] = fmt.Sprintf("%d", rank) + + return newSourceMetric(log, m, baseName, labels) +} + +// AddSource adds an EngineSource to the Collector. +func (c *EngineCollector) AddSource(es *EngineSource, cleanup func()) { + if es == nil { + c.log.Error("attempted to add nil EngineSource") + return + } + + c.sourceMutex.Lock() + defer c.sourceMutex.Unlock() + + // If we attempt to add a duplicate, remove the old one. + c.removeSourceNoLock(es.Index) + + c.sources = append(c.sources, es) + if cleanup != nil { + c.cleanupSource[es.Index] = cleanup + } +} + +// RemoveSource removes an EngineSource with a given index from the Collector. +func (c *EngineCollector) RemoveSource(engineIdx uint32) { + c.sourceMutex.Lock() + defer c.sourceMutex.Unlock() + + c.removeSourceNoLock(engineIdx) +} + +func (c *EngineCollector) removeSourceNoLock(engineIdx uint32) { + for i, es := range c.sources { + if es.Index == engineIdx { + es.Disable() + c.sources = append(c.sources[:i], c.sources[i+1:]...) 
+ + // Ensure that EngineSource isn't collecting during cleanup + es.tmMutex.Lock() + if cleanup, found := c.cleanupSource[engineIdx]; found && cleanup != nil { + cleanup() + } + es.tmMutex.Unlock() + delete(c.cleanupSource, engineIdx) + break + } + } +} + +func (c *EngineCollector) getSources() []*EngineSource { + c.sourceMutex.RLock() + defer c.sourceMutex.RUnlock() + + sourceCopy := make([]*EngineSource, len(c.sources)) + _ = copy(sourceCopy, c.sources) + return sourceCopy +} diff --git a/src/control/lib/telemetry/promexp/collector_test.go b/src/control/lib/telemetry/promexp/engine_test.go similarity index 88% rename from src/control/lib/telemetry/promexp/collector_test.go rename to src/control/lib/telemetry/promexp/engine_test.go index 7dbe295d627..44924a6fcd7 100644 --- a/src/control/lib/telemetry/promexp/collector_test.go +++ b/src/control/lib/telemetry/promexp/engine_test.go @@ -1,12 +1,7 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build linux && (amd64 || arm64) -// +build linux -// +build amd64 arm64 - // package promexp @@ -62,7 +57,10 @@ func TestPromexp_NewEngineSource(t *testing.T) { test.CmpErr(t, tc.expErr, err) - if diff := cmp.Diff(tc.expResult, result, cmpopts.IgnoreUnexported(EngineSource{})); diff != "" { + cmpOpts := cmp.Options{ + cmpopts.IgnoreUnexported(MetricSource{}), + } + if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff != "" { t.Fatalf("(-want, +got)\n%s", diff) } @@ -155,31 +153,20 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { for name, tc := range map[string]struct { es *EngineSource - resultChan chan *rankMetric + resultChan chan *sourceMetric expMetrics telemetry.TestMetricsMap }{ - "nil source": { - resultChan: make(chan *rankMetric), - }, "nil channel": { es: validSrc, }, - "bad source": { - es: &EngineSource{ - ctx: test.Context(t), - Rank: 123, - Index: testIdx + 1, - }, - resultChan: make(chan *rankMetric), - }, "success": { es: validSrc, - resultChan: make(chan *rankMetric), + resultChan: make(chan *sourceMetric), expMetrics: realMetrics, }, "disabled": { es: disabledSrc, - resultChan: make(chan *rankMetric), + resultChan: make(chan *sourceMetric), expMetrics: telemetry.TestMetricsMap{}, }, } { @@ -189,7 +176,7 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { go tc.es.Collect(log, tc.resultChan) - gotMetrics := []*rankMetric{} + gotMetrics := []*sourceMetric{} for { done := false select { @@ -206,7 +193,7 @@ func TestPromExp_EngineSource_Collect(t *testing.T) { test.AssertEqual(t, len(tc.expMetrics), len(gotMetrics), "wrong number of metrics returned") for _, got := range gotMetrics { - test.AssertEqual(t, testRank, got.rank, "wrong rank") + test.AssertEqual(t, fmt.Sprintf("%d", testRank), got.labels["rank"], "wrong rank") expM, ok := tc.expMetrics[got.metric.Type()] if !ok { t.Fatalf("metric type %d not expected", got.metric.Type()) @@ -220,7 +207,7 @@ func 
TestPromExp_EngineSource_Collect(t *testing.T) { } } -func TestPromExp_NewCollector(t *testing.T) { +func TestPromExp_NewEngineCollector(t *testing.T) { testSrc := []*EngineSource{ { Rank: 1, @@ -234,20 +221,24 @@ func TestPromExp_NewCollector(t *testing.T) { sources []*EngineSource opts *CollectorOpts expErr error - expResult *Collector + expResult *EngineCollector }{ "no sources": { - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, }, }, }, "defaults": { sources: testSrc, - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, }, sources: testSrc, }, @@ -255,15 +246,17 @@ func TestPromExp_NewCollector(t *testing.T) { "opts with ignores": { sources: testSrc, opts: &CollectorOpts{Ignores: []string{"one", "two"}}, - expResult: &Collector{ - summary: &prometheus.SummaryVec{ - MetricVec: &prometheus.MetricVec{}, + expResult: &EngineCollector{ + metricsCollector: metricsCollector{ + summary: &prometheus.SummaryVec{ + MetricVec: &prometheus.MetricVec{}, + }, + ignoredMetrics: []*regexp.Regexp{ + regexp.MustCompile("one"), + regexp.MustCompile("two"), + }, }, sources: testSrc, - ignoredMetrics: []*regexp.Regexp{ - regexp.MustCompile("one"), - regexp.MustCompile("two"), - }, }, }, "bad regexp in ignores": { @@ -276,21 +269,23 @@ func TestPromExp_NewCollector(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - result, err := NewCollector(log, tc.opts, tc.sources...) + result, err := NewEngineCollector(log, tc.opts, tc.sources...) 
test.CmpErr(t, tc.expErr, err) cmpOpts := []cmp.Option{ - cmpopts.IgnoreUnexported(EngineSource{}), + cmpopts.IgnoreUnexported(MetricSource{}), cmpopts.IgnoreUnexported(prometheus.SummaryVec{}), cmpopts.IgnoreUnexported(prometheus.MetricVec{}), cmpopts.IgnoreUnexported(regexp.Regexp{}), - cmp.AllowUnexported(Collector{}), + cmp.AllowUnexported(EngineCollector{}), + cmp.AllowUnexported(metricsCollector{}), cmp.FilterPath(func(p cmp.Path) bool { // Ignore a few specific fields return (strings.HasSuffix(p.String(), "log") || strings.HasSuffix(p.String(), "sourceMutex") || - strings.HasSuffix(p.String(), "cleanupSource")) + strings.HasSuffix(p.String(), "cleanupSource") || + strings.HasSuffix(p.String(), "collectFn")) }, cmp.Ignore()), } if diff := cmp.Diff(tc.expResult, result, cmpOpts...); diff != "" { @@ -338,7 +333,7 @@ func TestPromExp_Collector_Prune(t *testing.T) { } defer cleanup() - defaultCollector, err := NewCollector(log, nil, engSrc) + defaultCollector, err := NewEngineCollector(log, nil, engSrc) if err != nil { t.Fatalf("failed to create collector: %s", err.Error()) } @@ -357,12 +352,12 @@ func TestPromExp_Collector_Prune(t *testing.T) { } } - engSrc.rmSchema.mu.Lock() - for m := range engSrc.rmSchema.rankMetrics { - _, name := extractLabels(m) + engSrc.smSchema.mu.Lock() + for m := range engSrc.smSchema.sourceMetrics { + _, name := extractEngineLabels(log, m) names = append(names, name) } - engSrc.rmSchema.mu.Unlock() + engSrc.smSchema.mu.Unlock() sort.Strings(names) return @@ -373,7 +368,7 @@ func TestPromExp_Collector_Prune(t *testing.T) { for _, m := range maps { for t, m := range m { if t != telemetry.MetricTypeDirectory && t != telemetry.MetricTypeLink { - _, name := extractLabels(m.FullPath()) + _, name := extractEngineLabels(log, m.FullPath()) unique[name] = struct{}{} } } @@ -422,7 +417,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { } defer cleanup() - defaultCollector, err := NewCollector(log, nil, engSrc) + defaultCollector, err := 
NewEngineCollector(log, nil, engSrc) if err != nil { t.Fatalf("failed to create collector: %s", err.Error()) } @@ -433,7 +428,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { "engine_stats_gauge2", "engine_timer_duration", } - ignoreCollector, err := NewCollector(log, &CollectorOpts{ + ignoreCollector, err := NewEngineCollector(log, &CollectorOpts{ Ignores: ignores, }, engSrc) if err != nil { @@ -441,13 +436,10 @@ func TestPromExp_Collector_Collect(t *testing.T) { } for name, tc := range map[string]struct { - collector *Collector + collector *EngineCollector resultChan chan prometheus.Metric expMetricNames []string }{ - "nil collector": { - resultChan: make(chan prometheus.Metric), - }, "nil channel": { collector: defaultCollector, }, @@ -512,7 +504,7 @@ func TestPromExp_Collector_Collect(t *testing.T) { } } -func TestPromExp_extractLabels(t *testing.T) { +func TestPromExp_extractEngineLabels(t *testing.T) { for name, tc := range map[string]struct { input string expName string @@ -626,7 +618,10 @@ func TestPromExp_extractLabels(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - labels, name := extractLabels(tc.input) + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + labels, name := extractEngineLabels(log, tc.input) test.AssertEqual(t, name, tc.expName, "") if diff := cmp.Diff(labels, tc.expLabels); diff != "" { @@ -686,7 +681,7 @@ func TestPromExp_Collector_AddSource(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - collector, err := NewCollector(log, nil, tc.startSrc...) + collector, err := NewEngineCollector(log, nil, tc.startSrc...) if err != nil { t.Fatalf("failed to set up collector: %s", err) } @@ -789,7 +784,7 @@ func TestPromExp_Collector_RemoveSource(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - collector, err := NewCollector(log, nil, tc.startSrc...) 
+ collector, err := NewEngineCollector(log, nil, tc.startSrc...) if err != nil { t.Fatalf("failed to set up collector: %s", err) } @@ -799,7 +794,10 @@ func TestPromExp_Collector_RemoveSource(t *testing.T) { collector.RemoveSource(tc.idx) - if diff := cmp.Diff(tc.expSrc, collector.sources, cmpopts.IgnoreUnexported(EngineSource{})); diff != "" { + cmpOpts := cmp.Options{ + cmpopts.IgnoreUnexported(MetricSource{}), + } + if diff := cmp.Diff(tc.expSrc, collector.sources, cmpOpts...); diff != "" { t.Fatalf("(-want, +got)\n%s", diff) } diff --git a/src/control/lib/telemetry/promexp/httpd.go b/src/control/lib/telemetry/promexp/httpd.go new file mode 100644 index 00000000000..cdd1955629b --- /dev/null +++ b/src/control/lib/telemetry/promexp/httpd.go @@ -0,0 +1,100 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// +//go:build linux && (amd64 || arm64) +// +build linux +// +build amd64 arm64 + +package promexp + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + // RegMonFn defines a function signature for registering a Prometheus + // monitor. + RegMonFn func(context.Context, logging.Logger) error + + // ExporterConfig defines the configuration for the Prometheus exporter. + ExporterConfig struct { + Port int + Title string + Register RegMonFn + } +) + +const ( + // EngineTelemetryPort specifies the default port for engine telemetry. + EngineTelemetryPort = 9191 + // ClientTelemetryPort specifies the default port for client telemetry. + ClientTelemetryPort = 9192 +) + +// StartExporter starts the Prometheus exporter. 
+func StartExporter(ctx context.Context, log logging.Logger, cfg *ExporterConfig) (func(), error) { + if cfg == nil { + return nil, errors.New("invalid exporter config: nil config") + } + + if cfg.Port <= 0 { + return nil, errors.New("invalid exporter config: bad port") + } + + if cfg.Register == nil { + return nil, errors.New("invalid exporter config: nil register function") + } + + if err := cfg.Register(ctx, log); err != nil { + return nil, errors.Wrap(err, "failed to register client monitor") + } + + listenAddress := fmt.Sprintf("0.0.0.0:%d", cfg.Port) + + srv := http.Server{Addr: listenAddress} + http.Handle("/metrics", promhttp.HandlerFor( + prometheus.DefaultGatherer, promhttp.HandlerOpts{}, + )) + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + num, err := w.Write([]byte(fmt.Sprintf(` +