diff --git a/docs/admin/administration.md b/docs/admin/administration.md index 049e98eb81c..d4efdfca0f5 100644 --- a/docs/admin/administration.md +++ b/docs/admin/administration.md @@ -286,7 +286,7 @@ written to `$HOME/.prometheus.yml`. To start the Prometheus server with the configuration file generated by `dmg`: ``` -prometheus --config-file=$HOME/.prometheus.yml +prometheus --config.file=$HOME/.prometheus.yml ``` ## Storage Operations diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md index e318397a28d..bddd2edd1d8 100644 --- a/docs/admin/deployment.md +++ b/docs/admin/deployment.md @@ -759,6 +759,56 @@ transport_config: key: /etc/daos/certs/admin.key ``` +#### Telemetry Certificate Configuration + +The DAOS Telemetry framework has an option to use certificates to authenticate +between the server/client and the admin node. +Creating the certificates is outside the scope of DAOS; it is up to the administrator to +generate the certificates and add them to the DAOS server and client systems. + +#### Telemetry Yaml Example + +Information on the telemetry config parameters in the respective yaml files. + +```yaml +# /etc/daos/daos_server.yml (servers) +telemetry_config: + # To use telemetry in secure mode + allow_insecure: false + # Set the server telemetry endpoint port number + port: 9191 + # Server certificate for use in TLS handshakes + https_cert: /etc/daos/certs/telemetry.crt + # Key portion of Server Certificate + https_key: /etc/daos/certs/telemetry.key +``` + +```yaml +# /etc/daos/daos_agent.yml (clients) +telemetry_config: + # To use telemetry in secure mode + allow_insecure: false + # Enable client telemetry for all DAOS clients. + enabled: true + # Set the client telemetry endpoint port number + port: 9192 + # Retain client telemetry for a period of time after the client process exits. 
+ retain: 30s + # Server certificate for use in TLS handshakes + https_cert: /etc/daos/certs/telemetry.crt + # Key portion of Server Certificate + https_key: /etc/daos/certs/telemetry.key +``` + +```yaml +# /etc/daos/daos_control.yml (dmg/admin) +telemetry_config: + # To use telemetry in secure mode + allow_insecure: false + # Skip the server certificate verification. Recommended for testing purposes only. + https_exception: true +``` + ### Server Startup The DAOS Server is started as a systemd service. The DAOS Server diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go index 3263850df51..61b4e00a1e9 100644 --- a/src/control/cmd/daos_agent/config.go +++ b/src/control/cmd/daos_agent/config.go @@ -58,9 +58,11 @@ type Config struct { IncludeFabricIfaces common.StringSet `yaml:"include_fabric_ifaces,omitempty"` FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"` ProviderIdx uint // TODO SRS-31: Enable with multiprovider functionality - TelemetryPort int `yaml:"telemetry_port,omitempty"` - TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"` - TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"` + TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"` + // Support Old config options. + TelemetryPort int `yaml:"telemetry_port,omitempty"` + TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"` + TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"` } // Validate performs basic validation of the configuration. @@ -73,11 +75,24 @@ func (c *Config) Validate() error { return fmt.Errorf("invalid system name: %s", c.SystemName) } - if c.TelemetryRetain > 0 && c.TelemetryPort == 0 { + // Support old config options by copying them into the underlying new structure. 
+ if c.TelemetryRetain > 0 { + c.TelemetryConfig.Retain = c.TelemetryRetain + } + + if c.TelemetryPort != 0 { + c.TelemetryConfig.Port = c.TelemetryPort + } + + if c.TelemetryEnabled { + c.TelemetryConfig.Enabled = c.TelemetryEnabled + } + + if c.TelemetryConfig.Retain > 0 && c.TelemetryConfig.Port == 0 { return errors.New("telemetry_retain requires telemetry_port") } - if c.TelemetryEnabled && c.TelemetryPort == 0 { + if c.TelemetryConfig.Enabled && c.TelemetryConfig.Port == 0 { return errors.New("telemetry_enabled requires telemetry_port") } @@ -90,7 +105,7 @@ func (c *Config) Validate() error { // TelemetryExportEnabled returns true if client telemetry export is enabled. func (c *Config) TelemetryExportEnabled() bool { - return c.TelemetryPort > 0 + return c.TelemetryConfig.Port > 0 } // NUMAFabricConfig defines a list of fabric interfaces that belong to a NUMA @@ -125,6 +140,12 @@ func LoadConfig(cfgPath string) (*Config, error) { return nil, errors.Wrap(err, "agent config validation failed") } + if !cfg.TelemetryConfig.AllowInsecure { + if cfg.TelemetryConfig.HttpsCert == "" || cfg.TelemetryConfig.HttpsKey == "" { + return nil, errors.New("For secure mode, https_cert and https_key required under telemetry_config") + } + } + return cfg, nil } @@ -139,5 +160,6 @@ func DefaultConfig() *Config { LogLevel: common.DefaultControlLogLevel, TransportConfig: security.DefaultAgentTransportConfig(), CredentialConfig: &security.CredentialConfig{}, + TelemetryConfig: security.DefaultClientTelemetryConfig(), } } diff --git a/src/control/cmd/daos_agent/config_test.go b/src/control/cmd/daos_agent/config_test.go index 59a51c5709d..34a83ba2dbc 100644 --- a/src/control/cmd/daos_agent/config_test.go +++ b/src/control/cmd/daos_agent/config_test.go @@ -100,6 +100,58 @@ include_fabric_ifaces: ["ib0"] exclude_fabric_ifaces: ["ib3"] `) + telemetryRetainWithBadPort := test.CreateTestFile(t, dir, ` + +control_log_mask: debug +transport_config: + allow_insecure: true +telemetry_config: + 
telemetry_retain: 1m + telemetry_port: 0 +`) + + telemetryEnabledWithBadPort := test.CreateTestFile(t, dir, ` +name: shire +access_points: ["one:10001", "two:10001"] +port: 4242 +runtime_dir: /tmp/runtime +log_file: /home/frodo/logfile +control_log_mask: debug +transport_config: + allow_insecure: true +telemetry_config: + telemetry_enabled: true + telemetry_port: 0 +`) + + telemetryWithoutHttpsCert := test.CreateTestFile(t, dir, ` +name: shire +access_points: ["one:10001", "two:10001"] +port: 4242 +runtime_dir: /tmp/runtime +log_file: /home/frodo/logfile +control_log_mask: debug +transport_config: + allow_insecure: true +telemetry_config: + allow_insecure: false + https_cert: "" +`) + + telemetryWithoutHttpsKey := test.CreateTestFile(t, dir, ` +name: shire +access_points: ["one:10001", "two:10001"] +port: 4242 +runtime_dir: /tmp/runtime +log_file: /home/frodo/logfile +control_log_mask: debug +transport_config: + allow_insecure: true +telemetry_config: + allow_insecure: false + https_key: "" +`) + for name, tc := range map[string]struct { path string expResult *Config @@ -120,6 +172,22 @@ exclude_fabric_ifaces: ["ib3"] path: emptyFile, expResult: DefaultConfig(), }, + "telemetry retain with no port": { + path: telemetryRetainWithBadPort, + expErr: errors.New("telemetry_retain requires telemetry_port"), + }, + "telemetry enabled with no port": { + path: telemetryEnabledWithBadPort, + expErr: errors.New("telemetry_enabled requires telemetry_port"), + }, + "telemetry with secure mode with no server certificate": { + path: telemetryWithoutHttpsCert, + expErr: errors.New("For secure mode, https_cert and https_key required under telemetry_config"), + }, + "telemetry with secure mode with no server key": { + path: telemetryWithoutHttpsKey, + expErr: errors.New("For secure mode, https_cert and https_key required under telemetry_config"), + }, "without optional items": { path: withoutOptCfg, expResult: &Config{ @@ -134,6 +202,7 @@ exclude_fabric_ifaces: ["ib3"] 
AllowInsecure: true, CertificateConfig: DefaultConfig().TransportConfig.CertificateConfig, }, + TelemetryConfig: security.DefaultClientTelemetryConfig(), }, }, "bad log mask": { @@ -170,6 +239,7 @@ exclude_fabric_ifaces: ["ib3"] AllowInsecure: true, CertificateConfig: DefaultConfig().TransportConfig.CertificateConfig, }, + TelemetryConfig: security.DefaultClientTelemetryConfig(), ExcludeFabricIfaces: common.NewStringSet("ib3"), FabricInterfaces: []*NUMAFabricConfig{ { diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index 9f1e8139f40..4fce2b47200 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -49,8 +49,8 @@ func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryI devStateGetter: network.DefaultNetDevStateProvider(log), } - ic.clientTelemetryEnabled.Store(cfg.TelemetryEnabled) - ic.clientTelemetryRetain.Store(cfg.TelemetryRetain > 0) + ic.clientTelemetryEnabled.Store(cfg.TelemetryConfig.Enabled) + ic.clientTelemetryRetain.Store(cfg.TelemetryConfig.Retain > 0) if cfg.DisableCache { ic.DisableAttachInfoCache() diff --git a/src/control/cmd/daos_agent/infocache_test.go b/src/control/cmd/daos_agent/infocache_test.go index 1f658055115..300dd8232db 100644 --- a/src/control/cmd/daos_agent/infocache_test.go +++ b/src/control/cmd/daos_agent/infocache_test.go @@ -25,6 +25,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/hardware" "github.com/daos-stack/daos/src/control/lib/telemetry" "github.com/daos-stack/daos/src/control/logging" + "github.com/daos-stack/daos/src/control/security" ) type testInfoCacheParams struct { @@ -539,7 +540,7 @@ func TestAgent_NewInfoCache(t *testing.T) { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - + tc.cfg.TelemetryConfig = security.DefaultClientTelemetryConfig() ic := NewInfoCache(test.Context(t), log, nil, tc.cfg) test.AssertEqual(t, 
tc.expEnabled, ic.IsAttachInfoCacheEnabled(), "") diff --git a/src/control/cmd/daos_agent/telemetry.go b/src/control/cmd/daos_agent/telemetry.go index 4c0e2d35b4c..60bd83d0b33 100644 --- a/src/control/cmd/daos_agent/telemetry.go +++ b/src/control/cmd/daos_agent/telemetry.go @@ -17,11 +17,14 @@ import ( func startPrometheusExporter(ctx context.Context, log logging.Logger, cs *promexp.ClientSource, cfg *Config) (func(), error) { expCfg := &promexp.ExporterConfig{ - Port: cfg.TelemetryPort, - Title: "DAOS Client Telemetry", + Port: cfg.TelemetryConfig.Port, + Title: "DAOS Client Telemetry", + AllowInsecure: cfg.TelemetryConfig.AllowInsecure, + HttpsCert: cfg.TelemetryConfig.HttpsCert, + HttpsKey: cfg.TelemetryConfig.HttpsKey, Register: func(ctx context.Context, log logging.Logger) error { c, err := promexp.NewClientCollector(ctx, log, cs, &promexp.CollectorOpts{ - RetainDuration: cfg.TelemetryRetain, + RetainDuration: cfg.TelemetryConfig.Retain, }) if err != nil { return err diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index 800fbed2a65..2c9b5701e2b 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -595,6 +595,10 @@ system_ram_reserved: 26 disable_hugepages: false control_log_mask: INFO control_log_file: /tmp/daos_server.log +telemetry_config: + allow_insecure: true + https_cert: /etc/daos/certs/telemetry.crt + https_key: /etc/daos/certs/telemetry.key core_dump_filter: 19 name: daos_server socket_dir: /var/run/daos_server diff --git a/src/control/cmd/dmg/main.go b/src/control/cmd/dmg/main.go index c88845a304c..e3bd8425663 100644 --- a/src/control/cmd/dmg/main.go +++ b/src/control/cmd/dmg/main.go @@ -262,6 +262,7 @@ and access control settings, along with system wide operations.` if opts.Insecure { ctlCfg.TransportConfig.AllowInsecure = true + ctlCfg.TelemetryConfig.AllowInsecure = true } if err := ctlCfg.TransportConfig.PreLoadCertData(); err != nil { return errors.Wrap(err, "Unable to load 
Certificate Data") diff --git a/src/control/cmd/dmg/telemetry.go b/src/control/cmd/dmg/telemetry.go index aadd24930ae..5806e44cef4 100644 --- a/src/control/cmd/dmg/telemetry.go +++ b/src/control/cmd/dmg/telemetry.go @@ -196,11 +196,17 @@ type ( Targets []string `yaml:"targets,omitempty"` } + tlsConfig struct { + InsecureSkipVerify bool `yaml:"insecure_skip_verify,omitempty"` + } + scrapeConfig struct { JobName string `yaml:"job_name"` ScrapeInterval time.Duration `yaml:"scrape_interval,omitempty"` ScrapeTimeout time.Duration `yaml:"scrape_timeout,omitempty"` StaticConfigs []*staticConfig `yaml:"static_configs,omitempty"` + Scheme string `yaml:"scheme,omitempty"` + TlsConfig tlsConfig `yaml:"tls_config,omitempty"` } promCfg struct { @@ -258,11 +264,23 @@ func (cmd *telemConfigCmd) configurePrometheus() (*installInfo, error) { return nil, err } + tc := tlsConfig{} + scheme := "" + if !cmd.cfgCmd.config.TelemetryConfig.AllowInsecure { + cmd.Infof("Prometheus configuration is setup as Secure (https) mode") + tc.InsecureSkipVerify = cmd.cfgCmd.config.TelemetryConfig.HttpsException + scheme = "https" + } else { + cmd.Infof("Prometheus configuration is setup as insecure (http) mode") + } + cfg.ScrapeConfigs = []*scrapeConfig{ { JobName: "daos", ScrapeInterval: 5 * time.Second, StaticConfigs: []*staticConfig{sc}, + Scheme: scheme, + TlsConfig: tc, }, } @@ -300,6 +318,7 @@ type metricsCmd struct { // metricsListCmd provides a list of metrics available from the requested DAOS servers. 
type metricsListCmd struct { baseCmd + cfgCmd cmdutil.JSONOutputCmd singleHostCmd Port uint32 `short:"p" long:"port" default:"9191" description:"Telemetry port on the host"` @@ -315,14 +334,30 @@ func (cmd *metricsListCmd) Execute(args []string) error { req := new(control.MetricsListReq) req.Port = cmd.Port req.Host = host + req.HttpsException = cmd.cfgCmd.config.TelemetryConfig.HttpsException if !cmd.JSONOutputEnabled() { cmd.Info(getConnectingMsg(req.Host, req.Port)) } + // Trying Secure Mode First, It will ignore the certificate if it's not provided + // or request with the certificate. + if req.AllowInsecure { + cmd.Debug("Trying Secure Mode (HTTPS) with Exception") + } else { + cmd.Debug("Trying Secure Mode (HTTPS) with system certificate") + } + resp, err := control.MetricsList(cmd.MustLogCtx(), req) if err != nil { - return err + cmd.Errorf("Secure Mode (HTTPS) failed: %s", err.Error()) + //Trying Insecure Mode + req.AllowInsecure = !req.AllowInsecure + cmd.Debug("Trying Insecure Mode (HTTP)") + resp, err = control.MetricsList(cmd.MustLogCtx(), req) + if err != nil { + return err + } } if cmd.JSONOutputEnabled() { @@ -354,6 +389,7 @@ func getConnectingMsg(host string, port uint32) string { // metricsQueryCmd collects the requested metrics from the requested DAOS servers. type metricsQueryCmd struct { baseCmd + cfgCmd cmdutil.JSONOutputCmd singleHostCmd Port uint32 `short:"p" long:"port" default:"9191" description:"Telemetry port on the host"` @@ -370,15 +406,28 @@ func (cmd *metricsQueryCmd) Execute(args []string) error { req := new(control.MetricsQueryReq) req.Port = cmd.Port req.Host = host + req.HttpsException = cmd.cfgCmd.config.TelemetryConfig.HttpsException req.MetricNames = common.TokenizeCommaSeparatedString(cmd.Metrics) if !cmd.JSONOutputEnabled() { cmd.Info(getConnectingMsg(req.Host, req.Port)) } + // Trying Secure Mode First, It will ignore the certificate if it's not provided + // or request with the certificate. 
+ req.AllowInsecure = false + cmd.Debug("Trying Secure Mode (HTTPS) first, with system certificate") + resp, err := control.MetricsQuery(cmd.MustLogCtx(), req) if err != nil { - return err + cmd.Errorf("Secure Mode (HTTPS) failed: %s", err.Error()) + //Trying Insecure Mode + req.AllowInsecure = !req.AllowInsecure + cmd.Debug("Trying Insecure Mode (HTTP)") + resp, err = control.MetricsQuery(cmd.MustLogCtx(), req) + if err != nil { + return err + } } if cmd.JSONOutputEnabled() { diff --git a/src/control/lib/control/config.go b/src/control/lib/control/config.go index b321e5df234..e4a5452b841 100644 --- a/src/control/lib/control/config.go +++ b/src/control/lib/control/config.go @@ -28,6 +28,7 @@ type Config struct { ControlPort int `yaml:"port"` HostList []string `yaml:"hostlist"` TransportConfig *security.TransportConfig `yaml:"transport_config"` + TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"` Path string `yaml:"-"` } @@ -40,6 +41,7 @@ func DefaultConfig() *Config { ControlPort: build.DefaultControlPort, HostList: []string{localServer}, TransportConfig: security.DefaultClientTransportConfig(), + TelemetryConfig: security.DefaultClientTelemetryConfig(), } } diff --git a/src/control/lib/control/http.go b/src/control/lib/control/http.go index 69a2f287c1f..929719c90f5 100644 --- a/src/control/lib/control/http.go +++ b/src/control/lib/control/http.go @@ -8,6 +8,8 @@ package control import ( "context" + "crypto/tls" + "crypto/x509" "fmt" "io" "net/http" @@ -37,12 +39,16 @@ type httpGetter interface { retryer getURL() *url.URL getBody(context.Context) ([]byte, error) + getAllowInsecure() bool + getHttpsException() bool } type httpReq struct { - url *url.URL - getFn httpGetFn - getBodyFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) + url *url.URL + getFn httpGetFn + allowInsecure bool + httpsException bool + getBodyFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) } func (r *httpReq) 
canRetry(err error, cur uint) bool { @@ -77,6 +83,14 @@ func (r *httpReq) getURL() *url.URL { return r.url } +func (r *httpReq) getAllowInsecure() bool { + return r.allowInsecure +} + +func (r *httpReq) getHttpsException() bool { + return r.httpsException +} + func (r *httpReq) httpGetFunc() httpGetFn { if r.getFn == nil { r.getFn = http.Get @@ -88,7 +102,8 @@ func (r *httpReq) getBody(ctx context.Context) ([]byte, error) { if r.getBodyFn == nil { r.getBodyFn = httpGetBody } - return r.getBodyFn(ctx, r.getURL(), r.httpGetFunc(), r.getRetryTimeout()) + + return r.getBodyFn(ctx, r.getURL(), r.httpGetFunc(), r.getRetryTimeout(), r.getAllowInsecure(), r.getHttpsException()) } func httpGetBodyRetry(ctx context.Context, req httpGetter) ([]byte, error) { @@ -113,9 +128,34 @@ func httpGetBodyRetry(ctx context.Context, req httpGetter) ([]byte, error) { return result, err } +// httpsSecureGetFunc will prepare the GET requested using the certificate for secure mode +// and return the http.Get +func httpsSecureGetFunc(httpsException bool) (httpGetFn, error) { + rootCAs, _ := x509.SystemCertPool() + if rootCAs == nil { + return nil, errors.New("Failed to load system root certificates") + } + + tlsConfig := &tls.Config{ + RootCAs: rootCAs, + } + + if httpsException { + tlsConfig.InsecureSkipVerify = true + } + + tr := &http.Transport{ + TLSClientConfig: tlsConfig, + } + + client := &http.Client{Transport: tr} + + return client.Get, nil +} + // httpGetBody executes a simple HTTP GET request to a given URL and returns the // content of the response body. -func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time.Duration) ([]byte, error) { +func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time.Duration, allowInsecure bool, httpsException bool) ([]byte, error) { if url == nil { return nil, errors.New("nil URL") } @@ -128,19 +168,25 @@ func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time. 
return nil, errors.New("nil get function") } + if allowInsecure == false { + var err error + get, err = httpsSecureGetFunc(httpsException) + if err != nil { + return nil, err + } + } + httpCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() respChan := make(chan *http.Response) errChan := make(chan error) - go func() { httpResp, err := get(url.String()) if err != nil { errChan <- err return } - respChan <- httpResp }() diff --git a/src/control/lib/control/http_test.go b/src/control/lib/control/http_test.go index 6f28a0c4ce4..c9dfb6015d7 100644 --- a/src/control/lib/control/http_test.go +++ b/src/control/lib/control/http_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -123,35 +123,43 @@ func newErrMockReadCloser(err error) *mockReadCloser { func TestControl_httpGetBody(t *testing.T) { defaultURL := &url.URL{Host: "testhost"} + defaultAllowInsecure := true + falseAllowInsecure := false for name, tc := range map[string]struct { - url *url.URL - timeout time.Duration - cancelCtx bool - getFn httpGetFn - expResult []byte - expErr error + url *url.URL + timeout time.Duration + cancelCtx bool + getFn httpGetFn + allowInsecure bool + httpsException bool + expResult []byte + expErr error }{ "nil url": { expErr: errors.New("nil URL"), }, "empty URL": { - url: &url.URL{}, - expErr: errors.New("host address is required"), + url: &url.URL{}, + allowInsecure: defaultAllowInsecure, + expErr: errors.New("host address is required"), }, "nil getFn": { - url: defaultURL, - expErr: errors.New("nil get function"), + url: defaultURL, + allowInsecure: defaultAllowInsecure, + expErr: errors.New("nil get function"), }, "getFn error": { - url: defaultURL, + url: defaultURL, + allowInsecure: defaultAllowInsecure, getFn: func(_ string) (*http.Response, error) { return nil, errors.New("mock getFn") }, expErr: errors.New("mock getFn"), }, "http.Response 
error": { - url: defaultURL, + url: defaultURL, + allowInsecure: defaultAllowInsecure, getFn: func(_ string) (*http.Response, error) { return &http.Response{ StatusCode: http.StatusNotFound, @@ -161,7 +169,8 @@ func TestControl_httpGetBody(t *testing.T) { expErr: errors.New("HTTP response error: 404 Not Found"), }, "empty body": { - url: defaultURL, + url: defaultURL, + allowInsecure: defaultAllowInsecure, getFn: func(_ string) (*http.Response, error) { return &http.Response{ StatusCode: http.StatusOK, @@ -171,7 +180,8 @@ func TestControl_httpGetBody(t *testing.T) { expResult: []byte{}, }, "success with body": { - url: defaultURL, + url: defaultURL, + allowInsecure: defaultAllowInsecure, getFn: func(_ string) (*http.Response, error) { return &http.Response{ StatusCode: http.StatusOK, @@ -180,8 +190,19 @@ func TestControl_httpGetBody(t *testing.T) { }, expResult: []byte("this is the body of an HTTP response"), }, + "failure with body in secure mode without CA certificate path": { + url: defaultURL, + allowInsecure: falseAllowInsecure, + getFn: func(_ string) (*http.Response, error) { + return &http.Response{ + StatusCode: http.StatusOK, + }, nil + }, + expErr: errors.New("Get \"//testhost\": unsupported protocol scheme"), + }, "reading body fails": { - url: defaultURL, + url: defaultURL, + allowInsecure: defaultAllowInsecure, getFn: func(_ string) (*http.Response, error) { return &http.Response{ StatusCode: http.StatusOK, @@ -191,8 +212,9 @@ func TestControl_httpGetBody(t *testing.T) { expErr: errors.New("reading HTTP response body: mock Read"), }, "request times out": { - url: defaultURL, - timeout: 5 * time.Millisecond, + url: defaultURL, + allowInsecure: defaultAllowInsecure, + timeout: 5 * time.Millisecond, getFn: func(_ string) (*http.Response, error) { time.Sleep(1 * time.Second) return &http.Response{ @@ -203,8 +225,9 @@ func TestControl_httpGetBody(t *testing.T) { expErr: HTTPReqTimedOut(defaultURL.String()), }, "request canceled": { - url: defaultURL, - 
cancelCtx: true, + url: defaultURL, + allowInsecure: defaultAllowInsecure, + cancelCtx: true, getFn: func(_ string) (*http.Response, error) { time.Sleep(1 * time.Second) return &http.Response{ @@ -229,7 +252,7 @@ func TestControl_httpGetBody(t *testing.T) { tc.timeout = time.Second } - result, err := httpGetBody(ctx, tc.url, tc.getFn, tc.timeout) + result, err := httpGetBody(ctx, tc.url, tc.getFn, tc.timeout, tc.allowInsecure, tc.httpsException) test.CmpErr(t, tc.expErr, err) if diff := cmp.Diff(tc.expResult, result); diff != "" { @@ -247,6 +270,7 @@ type mockHTTPGetter struct { getBodyErr error getBodyCalled uint getBodyFailures uint + httpsException bool } func (r *mockHTTPGetter) canRetry(err error, cur uint) bool { @@ -273,6 +297,14 @@ func (r *mockHTTPGetter) getURL() *url.URL { } } +func (r *mockHTTPGetter) getAllowInsecure() bool { + return true +} + +func (r *mockHTTPGetter) getHttpsException() bool { + return true +} + func (r *mockHTTPGetter) getBody(ctx context.Context) ([]byte, error) { r.getBodyCalled++ if r.getBodyCalled <= r.getBodyFailures { diff --git a/src/control/lib/control/telemetry.go b/src/control/lib/control/telemetry.go index 919e54ff284..108453584d6 100644 --- a/src/control/lib/control/telemetry.go +++ b/src/control/lib/control/telemetry.go @@ -14,6 +14,7 @@ import ( "strings" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/logging" "github.com/pkg/errors" pclient "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" @@ -32,9 +33,14 @@ func (m pbMetricMap) Keys() []string { return keys } -func getMetricsURL(host string, port uint32) *url.URL { +func getMetricsURL(host string, port uint32, allowinsecure bool) *url.URL { + scheme := "https" + if allowinsecure { + scheme = "http" + } + return &url.URL{ - Scheme: "http", + Scheme: scheme, Host: fmt.Sprintf("%s:%d", host, port), Path: "metrics", } @@ -78,8 +84,11 @@ type ( // MetricsListReq is used to request the list of 
metrics. MetricsListReq struct { httpReq - Host string // Host to query for telemetry data - Port uint32 // Port to use for collecting telemetry data + Host string // Host to query for telemetry data + Port uint32 // Port to use for collecting telemetry data + AllowInsecure bool // Set the https end point secure + HttpsException bool // Use the Https with Exception (Insecure) + Log logging.Logger // Logging the info } // MetricsListResp contains the list of available metrics. @@ -102,8 +111,9 @@ func MetricsList(ctx context.Context, req *MetricsListReq) (*MetricsListResp, er return nil, errors.New("port must be specified") } - req.url = getMetricsURL(req.Host, req.Port) - + req.allowInsecure = req.AllowInsecure + req.httpsException = req.HttpsException + req.url = getMetricsURL(req.Host, req.Port, req.allowInsecure) scraped, err := scrapeMetrics(ctx, req) if err != nil { return nil, errors.Wrap(err, "unable to list metrics") @@ -130,9 +140,11 @@ type ( // MetricsQueryReq is used to query telemetry values. MetricsQueryReq struct { httpReq - Host string // host to query for telemetry data - Port uint32 // port to use for collecting telemetry data - MetricNames []string // if empty, collects all metrics + Host string // host to query for telemetry data + Port uint32 // port to use for collecting telemetry data + AllowInsecure bool // Set the https end point secure + HttpsException bool // Use the Https with Exception (Insecure) + MetricNames []string // if empty, collects all metrics } // MetricsQueryResp contains the list of telemetry values per host. 
@@ -155,8 +167,9 @@ func MetricsQuery(ctx context.Context, req *MetricsQueryReq) (*MetricsQueryResp, return nil, errors.New("port must be specified") } - req.url = getMetricsURL(req.Host, req.Port) - + req.allowInsecure = req.AllowInsecure + req.httpsException = req.HttpsException + req.url = getMetricsURL(req.Host, req.Port, req.allowInsecure) scraped, err := scrapeMetrics(ctx, req) if err != nil { return nil, errors.Wrap(err, "unable to query metrics") diff --git a/src/control/lib/control/telemetry_test.go b/src/control/lib/control/telemetry_test.go index 12076bf8e01..af0528be5f5 100644 --- a/src/control/lib/control/telemetry_test.go +++ b/src/control/lib/control/telemetry_test.go @@ -118,10 +118,10 @@ func newTestPBHistogram(numBuckets int) *pclient.Metric { return metric } -func mockScrapeFnSuccess(t *testing.T, metricFam ...*pclient.MetricFamily) func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { +func mockScrapeFnSuccess(t *testing.T, metricFam ...*pclient.MetricFamily) func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { t.Helper() - return func(_ context.Context, _ *url.URL, _ httpGetFn, _ time.Duration) ([]byte, error) { + return func(_ context.Context, _ *url.URL, _ httpGetFn, _ time.Duration, _ bool, _ bool) ([]byte, error) { var b strings.Builder for _, mf := range metricFam { _, err := expfmt.MetricFamilyToText(&b, mf) @@ -147,12 +147,12 @@ func TestControl_scrapeMetrics(t *testing.T) { for name, tc := range map[string]struct { req httpGetter - scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) + scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) expResult pbMetricMap expErr error }{ "check scrape params": { - scrapeFn: func(_ context.Context, url *url.URL, getter httpGetFn, timeout time.Duration) ([]byte, error) { + scrapeFn: func(_ context.Context, url *url.URL, getter httpGetFn, timeout time.Duration, allowInsecure 
bool, httpsException bool) ([]byte, error) { test.AssertEqual(t, testURL.Scheme, url.Scheme, "") test.AssertEqual(t, testURL.Host, url.Host, "") test.AssertEqual(t, testURL.Path, url.Path, "") @@ -166,19 +166,19 @@ func TestControl_scrapeMetrics(t *testing.T) { expResult: pbMetricMap{}, }, "HTTP scrape error": { - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return nil, errors.New("mock scrape") }, expErr: errors.New("mock scrape"), }, "scrape returns no content": { - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return []byte{}, nil }, expResult: pbMetricMap{}, }, "scrape returns bad content": { - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return []byte("

Hello world

"), nil }, expErr: errors.New("parsing error"), @@ -217,7 +217,7 @@ func TestControl_MetricsList(t *testing.T) { } for name, tc := range map[string]struct { - scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) + scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) req *MetricsListReq expResp *MetricsListResp expErr error @@ -237,20 +237,22 @@ func TestControl_MetricsList(t *testing.T) { }, "scrape failed": { req: &MetricsListReq{ - Host: "host1", - Port: 1066, + Host: "host1", + Port: 1066, + AllowInsecure: true, }, - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return nil, errors.New("mock scrape") }, expErr: errors.New("mock scrape"), }, "no metrics": { req: &MetricsListReq{ - Host: "host1", - Port: 8888, + Host: "host1", + Port: 8888, + AllowInsecure: true, }, - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return []byte{}, nil }, expResp: &MetricsListResp{ @@ -259,8 +261,9 @@ func TestControl_MetricsList(t *testing.T) { }, "success": { req: &MetricsListReq{ - Host: "host1", - Port: 7777, + Host: "host1", + Port: 7777, + AllowInsecure: true, }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsListResp{ @@ -281,7 +284,7 @@ func TestControl_MetricsList(t *testing.T) { } { t.Run(name, func(t *testing.T) { if tc.scrapeFn == nil { - tc.scrapeFn = func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + tc.scrapeFn = func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return nil, nil } } @@ -429,7 +432,7 @@ func TestControl_MetricsQuery(t *testing.T) { } for name, tc := range map[string]struct { - scrapeFn func(context.Context, *url.URL, 
httpGetFn, time.Duration) ([]byte, error) + scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) req *MetricsQueryReq expResp *MetricsQueryResp expErr error @@ -449,20 +452,22 @@ func TestControl_MetricsQuery(t *testing.T) { }, "scrape failed": { req: &MetricsQueryReq{ - Host: "host1", - Port: 1066, + Host: "host1", + Port: 1066, + AllowInsecure: true, }, - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return nil, errors.New("mock scrape") }, expErr: errors.New("mock scrape"), }, "no metrics": { req: &MetricsQueryReq{ - Host: "host1", - Port: 8888, + Host: "host1", + Port: 8888, + AllowInsecure: true, }, - scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) { + scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) { return []byte{}, nil }, expResp: &MetricsQueryResp{ @@ -471,8 +476,9 @@ func TestControl_MetricsQuery(t *testing.T) { }, "all metrics": { req: &MetricsQueryReq{ - Host: "host1", - Port: 7777, + Host: "host1", + Port: 7777, + AllowInsecure: true, }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsQueryResp{ @@ -532,9 +538,10 @@ func TestControl_MetricsQuery(t *testing.T) { }, "selected metrics": { req: &MetricsQueryReq{ - Host: "host1", - Port: 7777, - MetricNames: []string{"my_generic", "my_counter"}, + Host: "host1", + Port: 7777, + AllowInsecure: true, + MetricNames: []string{"my_generic", "my_counter"}, }, scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expResp: &MetricsQueryResp{ @@ -560,9 +567,10 @@ func TestControl_MetricsQuery(t *testing.T) { }, "invalid metric name": { req: &MetricsQueryReq{ - Host: "host1", - Port: 7777, - MetricNames: []string{"my_generic", "fake"}, + Host: "host1", + Port: 7777, + AllowInsecure: true, + MetricNames: []string{"my_generic", "fake"}, }, 
scrapeFn: mockScrapeFnSuccess(t, testMetricFam...), expErr: errors.New("metric \"fake\" not found"), diff --git a/src/control/lib/telemetry/promexp/httpd.go b/src/control/lib/telemetry/promexp/httpd.go index 2f4c86d485d..0f6a0920dd8 100644 --- a/src/control/lib/telemetry/promexp/httpd.go +++ b/src/control/lib/telemetry/promexp/httpd.go @@ -29,9 +29,12 @@ type ( // ExporterConfig defines the configuration for the Prometheus exporter. ExporterConfig struct { - Port int - Title string - Register RegMonFn + Port int + Title string + Register RegMonFn + AllowInsecure bool + HttpsCert string + HttpsKey string } ) @@ -82,8 +85,15 @@ func StartExporter(ctx context.Context, log logging.Logger, cfg *ExporterConfig) // http listener is a blocking call go func() { log.Infof("Listening on %s", listenAddress) - err := srv.ListenAndServe() - log.Infof("Prometheus web exporter stopped: %s", err.Error()) + if cfg.AllowInsecure { + log.Infof("Prometheus web exporter started with insecure (http) mode") + err := srv.ListenAndServe() + log.Infof("Prometheus web exporter stopped: %s", err.Error()) + } else { + log.Infof("Prometheus web exporter started with secure (https) mode") + err := srv.ListenAndServeTLS(cfg.HttpsCert, cfg.HttpsKey) + log.Infof("Prometheus web exporter stopped: %s", err.Error()) + } }() return func() { diff --git a/src/control/lib/telemetry/promexp/httpd_test.go b/src/control/lib/telemetry/promexp/httpd_test.go index db69e122b71..8504b4e47c9 100644 --- a/src/control/lib/telemetry/promexp/httpd_test.go +++ b/src/control/lib/telemetry/promexp/httpd_test.go @@ -70,7 +70,9 @@ func TestPromExp_StartExporter(t *testing.T) { if tc.cfg != nil { tc.cfg.Title = t.Name() + tc.cfg.AllowInsecure = true } + cleanup, err := promexp.StartExporter(test.Context(t), log, tc.cfg) test.CmpErr(t, tc.expErr, err) if tc.expErr != nil { diff --git a/src/control/security/config.go b/src/control/security/config.go index 7358e2e73ee..485e7ffdf87 100644 --- a/src/control/security/config.go +++ 
b/src/control/security/config.go @@ -28,6 +28,8 @@ const ( defaultAdminKey = certDir + "admin.key" defaultAgentCert = certDir + "agent.crt" defaultAgentKey = certDir + "agent.key" + defaultTelemetryCert = certDir + "telemetry.crt" + defaultTelemetryKey = certDir + "telemetry.key" defaultClientCertDir = certDir + "clients" defaultServer = "server" defaultInsecure = false @@ -105,6 +107,29 @@ type TransportConfig struct { CertificateConfig `yaml:",inline"` } +// TelemetryConfig contains all the information on whether or not to use +// secure endpoint for telemetry and their location if their use is specified. +type TelemetryConfig struct { + Port int `yaml:"telemetry_port,omitempty"` + AllowInsecure bool `yaml:"allow_insecure,omitempty"` + Enabled bool `yaml:"telemetry_enabled,omitempty"` + Retain time.Duration `yaml:"telemetry_retain,omitempty"` + HttpsCert string `yaml:"https_cert,omitempty"` + HttpsKey string `yaml:"https_key,omitempty"` + HttpsException bool `yaml:"https_exception,omitempty"` +} + +// DefaultClientTelemetryConfig provides a default telemetry config disabling +// certificate usage and specifying certificates located under /etc/daos/certs. 
+func DefaultClientTelemetryConfig() *TelemetryConfig { + return &TelemetryConfig{ + Enabled: false, + AllowInsecure: true, + HttpsCert: defaultTelemetryCert, + HttpsKey: defaultTelemetryKey, + } +} + func (tc *TransportConfig) String() string { return fmt.Sprintf("allow insecure: %v", tc.AllowInsecure) } diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index 510e2d26d1b..63eb9814426 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -65,6 +65,7 @@ type Server struct { FWHelperLogFile string `yaml:"firmware_helper_log_file,omitempty"` FaultPath string `yaml:"fault_path,omitempty"` TelemetryPort int `yaml:"telemetry_port,omitempty"` + TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"` CoreDumpFilter uint8 `yaml:"core_dump_filter,omitempty"` ClientEnvVars []string `yaml:"client_env_vars,omitempty"` SupportConfig SupportConfig `yaml:"support_config,omitempty"` @@ -319,7 +320,13 @@ func (cfg *Server) WithFirmwareHelperLogFile(filePath string) *Server { // WithTelemetryPort sets the port for the telemetry exporter. func (cfg *Server) WithTelemetryPort(port int) *Server { - cfg.TelemetryPort = port + cfg.TelemetryConfig.Port = port + return cfg +} + +// WithTelemetryConfig sets the telemetry configuration. 
+func (cfg *Server) WithTelemetryConfig(cfgTelemetry *security.TelemetryConfig) *Server { + cfg.TelemetryConfig = cfgTelemetry return cfg } @@ -332,6 +339,7 @@ func DefaultServer() *Server { MgmtSvcReplicas: []string{fmt.Sprintf("localhost:%d", build.DefaultControlPort)}, ControlPort: build.DefaultControlPort, TransportConfig: security.DefaultServerTransportConfig(), + TelemetryConfig: security.DefaultClientTelemetryConfig(), Hyperthreads: false, SystemRamReserved: storage.DefaultSysMemRsvd / humanize.GiByte, Path: defaultConfigPath, @@ -711,10 +719,17 @@ func (cfg *Server) Validate(log logging.Logger) (err error) { return FaultConfigNoProvider case cfg.ControlPort <= 0: return FaultConfigBadControlPort + //Support old configuration option case cfg.TelemetryPort < 0: return FaultConfigBadTelemetryPort } + if cfg.TelemetryConfig != nil { + if cfg.TelemetryConfig.Port < 0 { + return FaultConfigBadTelemetryPort + } + } + for idx, ec := range cfg.Engines { ec.Storage.ControlMetadata = cfg.Metadata ec.Storage.EngineIdx = uint(idx) diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index 475d99354cb..b3b03dffcb1 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -104,11 +104,11 @@ func uncommentServerConfig(t *testing.T, outFile string) { } key := fields[0] - // If we're in a server or a storage tier config, reset the + // If we're in a server, a storage tier config, or telemetry config reset the // seen map to allow the same params in different // server configs. lineTmp := strings.TrimLeft(line, " ") - if lineTmp == "-" { + if lineTmp == "-" || lineTmp == "telemetry_config:" { seenKeys = make(map[string]struct{}) } if _, seen := seenKeys[key]; seen && strings.HasSuffix(key, ":") { @@ -249,7 +249,11 @@ func TestServerConfig_Constructed(t *testing.T) { WithControlLogFile("/tmp/daos_server.log"). WithHelperLogFile("/tmp/daos_server_helper.log"). 
WithFirmwareHelperLogFile("/tmp/daos_firmware_helper.log"). - WithTelemetryPort(9191). + WithTelemetryConfig(&security.TelemetryConfig{ + AllowInsecure: true, + Port: 9191, + HttpsCert: "/etc/daos/certs/telemetry.crt", + HttpsKey: "/etc/daos/certs/telemetry.key"}). WithSystemName("daos_server"). WithSocketDir("./.daos/daos_server"). WithFabricProvider("ofi+verbs;ofi_rxm"). @@ -420,7 +424,11 @@ func TestServerConfig_MDonSSD_Constructed(t *testing.T) { Path: "/var/daos/config", }). WithControlLogFile("/tmp/daos_server.log"). - WithTelemetryPort(9191). + WithTelemetryConfig(&security.TelemetryConfig{ + AllowInsecure: true, + Port: 9191, + HttpsCert: "/etc/daos/certs/telemetry.crt", + HttpsKey: "/etc/daos/certs/telemetry.key"}). WithFabricProvider("ofi+tcp"). WithMgmtSvcReplicas("example1", "example2", "example3") diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go index 1e8c9e0c14d..288e74ec139 100644 --- a/src/control/server/server_utils.go +++ b/src/control/server/server_utils.go @@ -655,14 +655,15 @@ func configureFirstEngine(ctx context.Context, engine *EngineInstance, sysdb *ra // registerTelemetryCallbacks sets telemetry related callbacks to // be triggered when all engines have been started. 
func registerTelemetryCallbacks(ctx context.Context, srv *server) { - telemPort := srv.cfg.TelemetryPort + telemPort := srv.cfg.TelemetryConfig.Port + if telemPort == 0 { return } srv.OnEnginesStarted(func(ctxIn context.Context) error { srv.log.Debug("starting Prometheus exporter") - cleanup, err := startPrometheusExporter(ctxIn, srv.log, telemPort, srv.harness.Instances()) + cleanup, err := startPrometheusExporter(ctxIn, srv) if err != nil { return err } diff --git a/src/control/server/telemetry.go b/src/control/server/telemetry.go index 4b2f624aff2..28848f6879e 100644 --- a/src/control/server/telemetry.go +++ b/src/control/server/telemetry.go @@ -68,14 +68,17 @@ func regPromEngineSources(ctx context.Context, log logging.Logger, engines []Eng return nil } -func startPrometheusExporter(ctx context.Context, log logging.Logger, port int, engines []Engine) (func(), error) { +func startPrometheusExporter(ctx context.Context, srv *server) (func(), error) { expCfg := &promexp.ExporterConfig{ - Port: port, - Title: "DAOS Engine Telemetry", + Port: srv.cfg.TelemetryConfig.Port, + Title: "DAOS Engine Telemetry", + AllowInsecure: srv.cfg.TelemetryConfig.AllowInsecure, + HttpsCert: srv.cfg.TelemetryConfig.HttpsCert, + HttpsKey: srv.cfg.TelemetryConfig.HttpsKey, Register: func(ctx context.Context, log logging.Logger) error { - return regPromEngineSources(ctx, log, engines) + return regPromEngineSources(ctx, srv.log, srv.harness.Instances()) }, } - return promexp.StartExporter(ctx, log, expCfg) + return promexp.StartExporter(ctx, srv.log, expCfg) } diff --git a/src/tests/ftest/config_file_gen.py b/src/tests/ftest/config_file_gen.py index 66172cd2201..d5b4f72ee2e 100755 --- a/src/tests/ftest/config_file_gen.py +++ b/src/tests/ftest/config_file_gen.py @@ -12,11 +12,13 @@ import sys from argparse import ArgumentParser, RawDescriptionHelpFormatter -from util.agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters +from util.agent_utils_params import 
(DaosAgentTelemetryConfig, DaosAgentTransportCredentials, + DaosAgentYamlParameters) from util.command_utils_base import CommonConfig -from util.dmg_utils_params import DmgTransportCredentials, DmgYamlParameters +from util.dmg_utils_params import DmgTelemetryConfig, DmgTransportCredentials, DmgYamlParameters from util.exception_utils import CommandFailure -from util.server_utils_params import DaosServerTransportCredentials, DaosServerYamlParameters +from util.server_utils_params import (DaosServerTelemetryConfig, DaosServerTransportCredentials, + DaosServerYamlParameters) def generate_agent_config(args): @@ -31,6 +33,7 @@ def generate_agent_config(args): """ common_cfg = CommonConfig(args.group_name, DaosAgentTransportCredentials()) config = DaosAgentYamlParameters(args.agent_file, common_cfg) + config.telemetry_config = DaosAgentTelemetryConfig() # Update the configuration file access points config.access_points.value = args.node_list.split(",") return create_config(args, config) @@ -48,6 +51,7 @@ def generate_server_config(args): """ common_cfg = CommonConfig(args.group_name, DaosServerTransportCredentials()) config = DaosServerYamlParameters(args.server_file, common_cfg) + config.telemetry_config = DaosServerTelemetryConfig() config.engine_params[0].storage.storage_tiers[0].storage_class.value = "ram" config.engine_params[0].storage.storage_tiers[0].scm_mount.value = "/mnt/daos" config.engine_params[0].storage.storage_tiers[0].scm_size.value = 0 @@ -68,6 +72,7 @@ def generate_dmg_config(args): """ config = DmgYamlParameters( args.dmg_file, args.group_name, DmgTransportCredentials()) + config.telemetry_config = DmgTelemetryConfig() # Update the configuration file hostlist config.hostlist.value = args.node_list.split(",") return create_config(args, config) diff --git a/src/tests/ftest/control/dmg_telemetry_basic.py b/src/tests/ftest/control/dmg_telemetry_basic.py index 39eb520aef2..45cb8472392 100644 --- a/src/tests/ftest/control/dmg_telemetry_basic.py +++ 
b/src/tests/ftest/control/dmg_telemetry_basic.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2021-2023 Intel Corporation. +(C) Copyright 2021-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -91,7 +91,7 @@ def test_container_telemetry(self): :avocado: tags=all,pr,daily_regression :avocado: tags=vm - :avocado: tags=control,telemetry,container + :avocado: tags=control,telemetry,container,secure_telemetry :avocado: tags=TestWithTelemetryBasic,test_container_telemetry """ container_qty = self.params.get("container_qty", "/run/test/*", 1) diff --git a/src/tests/ftest/control/dmg_telemetry_basic.yaml b/src/tests/ftest/control/dmg_telemetry_basic.yaml index 69af131fc6f..f9c41e9eafb 100644 --- a/src/tests/ftest/control/dmg_telemetry_basic.yaml +++ b/src/tests/ftest/control/dmg_telemetry_basic.yaml @@ -16,6 +16,8 @@ server_config: class: ram scm_mount: /mnt/daos system_ram_reserved: 1 + telemetry_config: + allow_insecure: false pool: scm_size: 2G container: @@ -23,3 +25,6 @@ container: test: container_qty: 5 open_close_qty: 3 +dmg: + telemetry_config: + allow_insecure: false diff --git a/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh b/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh new file mode 100755 index 00000000000..6d359fbecaa --- /dev/null +++ b/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# /* +# * (C) Copyright 2024 Intel Corporation. +# * +# * SPDX-License-Identifier: BSD-2-Clause-Patent +# */ + +__usage=" + +This is just an example script for testing purpose. +Please modify to use in Production environment. + +Usage: gen_telemetry_server_certificate.sh [USER] [DIR] + USER: DAOS has server and client and the certificate need the specific file permission + based on system usage. + Use daos_server if running script on server + Use daos_agent if running script on client + + DIR: Generate telemetry certificates for DAOS metrics in the [DIR]. 
+ By default [DIR] is the current directory. +" +DAYS=1095 + +USER=$1 +CA_HOME="${2:-.}/" +HOSTNAME=$(hostname -s) + +openssl req -x509 -newkey rsa:4096 -keyout "${CA_HOME}/telemetry.key" -out "${CA_HOME}/telemetry.crt" -sha256 -days ${DAYS} -nodes -subj "/CN=${HOSTNAME}" +chmod 0400 "${CA_HOME}/telemetry.key" +chmod 0644 "${CA_HOME}/telemetry.crt" +chown "${USER}:${USER}" "${CA_HOME}/telemetry.key" +chown "${USER}:${USER}" "${CA_HOME}/telemetry.crt" diff --git a/src/tests/ftest/server/storage_tiers.py b/src/tests/ftest/server/storage_tiers.py index e627b8d62c2..536c1c52baf 100644 --- a/src/tests/ftest/server/storage_tiers.py +++ b/src/tests/ftest/server/storage_tiers.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -8,7 +8,8 @@ import yaml from apricot import TestWithServers from command_utils_base import CommonConfig -from server_utils import DaosServerTransportCredentials, DaosServerYamlParameters +from server_utils import (DaosServerTelemetryConfig, DaosServerTransportCredentials, + DaosServerYamlParameters) class StorageTiers(TestWithServers): @@ -67,6 +68,7 @@ def test_tiers(self): common_config = CommonConfig("daos_server", DaosServerTransportCredentials()) config = DaosServerYamlParameters(None, common_config) + config.telemetry_config = DaosServerTelemetryConfig() config.namespace = self.server_config_namespace config.get_params(self) data = config.get_yaml_data() diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.py b/src/tests/ftest/telemetry/basic_client_telemetry.py index 71b976abe30..5d0236dbb7a 100644 --- a/src/tests/ftest/telemetry/basic_client_telemetry.py +++ b/src/tests/ftest/telemetry/basic_client_telemetry.py @@ -26,7 +26,7 @@ def test_client_metrics_exist(self): :avocado: tags=all,daily_regression :avocado: tags=vm - :avocado: tags=telemetry + :avocado: tags=telemetry,secure_telemetry :avocado:
tags=BasicClientTelemetry,test_client_metrics_exist """ # create pool and container diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.yaml b/src/tests/ftest/telemetry/basic_client_telemetry.yaml index d585dc81fda..71c6c361cd9 100644 --- a/src/tests/ftest/telemetry/basic_client_telemetry.yaml +++ b/src/tests/ftest/telemetry/basic_client_telemetry.yaml @@ -18,9 +18,11 @@ server_config: system_ram_reserved: 1 agent_config: - telemetry_port: 9191 - telemetry_retain: 30s - telemetry_enabled: true + telemetry_config: + allow_insecure: false + telemetry_port: 9191 + telemetry_retain: 30s + telemetry_enabled: true pool: scm_size: 2G @@ -44,3 +46,6 @@ ior_write: ior_read: <<: *ior_base flags: "-v -r -R -G 1" +dmg: + telemetry_config: + allow_insecure: false diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index 74b79fb9796..f437bc36e04 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -7,7 +7,8 @@ import re import socket -from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters +from agent_utils_params import (DaosAgentTelemetryConfig, DaosAgentTransportCredentials, + DaosAgentYamlParameters) from ClusterShell.NodeSet import NodeSet from command_utils import CommandWithSubCommand, SubprocessManager, YamlCommand from command_utils_base import (CommandWithParameters, CommonConfig, EnvironmentVariables, @@ -53,6 +54,7 @@ def get_agent_command(group, cert_dir, bin_dir, config_file, run_user, config_te transport_config = DaosAgentTransportCredentials(cert_dir) common_config = CommonConfig(group, transport_config) config = DaosAgentYamlParameters(config_file, common_config) + config.telemetry_config = DaosAgentTelemetryConfig(cert_dir) command = DaosAgentCommand(bin_dir, config, run_user=run_user) if config_temp: # Setup the DaosAgentCommand to write the config file data to the @@ -285,6 +287,7 @@ def start(self): # Copy certificates 
self.manager.job.copy_certificates( get_log_file("daosCA/certs"), self._hosts) + self.manager.job.generate_telemetry_server_certificates(self._hosts, "daos_agent") # Verify the socket directory exists when using a non-systemctl manager if self.verify_socket_dir: diff --git a/src/tests/ftest/util/agent_utils_params.py b/src/tests/ftest/util/agent_utils_params.py index bd091a162e6..889c8debf48 100644 --- a/src/tests/ftest/util/agent_utils_params.py +++ b/src/tests/ftest/util/agent_utils_params.py @@ -5,7 +5,8 @@ """ import os -from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters +from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig, + TransportCredentials, YamlParameters) class DaosAgentTransportCredentials(TransportCredentials): @@ -32,6 +33,29 @@ def _get_new(self): return DaosAgentTransportCredentials(self._log_dir) +class DaosAgentTelemetryConfig(TelemetryConfig): + # pylint: disable=too-few-public-methods + """Telemetry credentials listing certificates for secure communication.""" + + def __init__(self, log_dir=os.path.join(os.sep, "tmp")): + """Initialize a TelemetryConfig object.""" + super().__init__("/run/agent_config/telemetry_config/*", None, log_dir) + + self.telemetry_port = BasicParameter(None, 9192) + self.telemetry_enabled = BasicParameter(None) + self.telemetry_retain = BasicParameter(None) + self.https_cert = LogParameter(self._log_dir, None, "telemetry.crt") + self.https_key = LogParameter(self._log_dir, None, "telemetry.key") + + def _get_new(self): + """Get a new object based upon this one. 
+ + Returns: + DaosAgentTelemetryConfig: a new DaosAgentTelemetryConfig object + """ + return DaosAgentTelemetryConfig(self._log_dir) + + class DaosAgentYamlParameters(YamlParameters): """Defines the daos_agent configuration yaml parameters.""" diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index dbed7ac3c44..ac9f94449d4 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -1044,6 +1044,29 @@ def copy_certificates(self, source, hosts): self._command, ", ".join(names)) get_file_listing(hosts, names, self.run_user).log_output(self.log) + def generate_telemetry_server_certificates(self, hosts, user): + """Generate the telemetry certificates for the test on server/client. + + Args: + hosts (NodeSet): list of the destination hosts. + user (str): owner to set on the telemetry certificate files. + For server, it's daos_server and for client it's daos_agent. + """ + data = self.yaml.telemetry_config.get_certificate_data( + self.yaml.telemetry_config.get_attribute_names(LogParameter)) + + if not self.yaml.telemetry_config.allow_insecure.value: + # get_certificate_data() returns {} in insecure mode, so only index it in secure mode + destination = list(data.keys())[0] + certgen_dir = os.path.abspath( + os.path.join(os.getcwd(), "scripts")) + command = os.path.join(certgen_dir, "gen_telemetry_server_certificate.sh ") + command = "sudo " + command + user + " " + destination + self.log.debug("Generating the telemetry certificate command %s:", command) + result = run_remote(self.log, hosts, command, 30) + if not result.passed: + self.log.info(" WARNING: command %s failed", command) + def copy_configuration(self, hosts): """Copy the yaml configuration file to the hosts.
diff --git a/src/tests/ftest/util/command_utils_base.py b/src/tests/ftest/util/command_utils_base.py index d867fbba4c8..ff2c1499b7e 100644 --- a/src/tests/ftest/util/command_utils_base.py +++ b/src/tests/ftest/util/command_utils_base.py @@ -574,6 +574,7 @@ def __init__(self, namespace, filename=None, title=None, other_params=None): self.filename = filename self.title = title self.other_params = other_params + self.telemetry_config = None def get_params(self, test): """Get values for the yaml parameters from the test yaml file. @@ -588,6 +589,9 @@ def get_params(self, test): if self.other_params is not None: self.other_params.get_params(test) + if self.telemetry_config is not None: + self.telemetry_config.get_params(test) + def get_yaml_data(self): """Convert the parameters into a dictionary to use to write a yaml file. @@ -599,6 +603,12 @@ def get_yaml_data(self): yaml_data = self.other_params.get_yaml_data() else: yaml_data = {} + + if self.telemetry_config is not None: + telemetry_yaml = self.telemetry_config.get_yaml_data() + if telemetry_yaml: + yaml_data["telemetry_config"] = telemetry_yaml + for name in self.get_param_names(): value = getattr(self, name).value if value is not None: @@ -789,6 +799,76 @@ def _get_new(self): return TransportCredentials(self.namespace, self.title, self._log_dir) +class TelemetryConfig(YamlParameters): + """Telemetry credentials listing certificates for secure communication.""" + + def __init__(self, namespace, title, log_dir): + """Initialize a TelemetryConfig object. + + Args: + namespace (str): yaml namespace (path to parameters) + title (str): namespace under which to place the + parameters when creating the yaml file. 
+ log_dir (str): location of the certificate files + """ + super().__init__(namespace, None, title) + self._log_dir = log_dir + default_insecure = str(os.environ.get("DAOS_TEST_INSECURE_MODE", True)) + default_insecure = default_insecure.lower() == "true" + self.allow_insecure = BasicParameter(None, default_insecure) + self.telemetry_port = BasicParameter(None, 9191) + self.telemetry_retain = None + self.telemetry_enabled = None + + def get_yaml_data(self): + """Convert the parameters into a dictionary to use to write a yaml file. + + Returns: + dict: a dictionary of parameter name keys and values + + """ + yaml_data = super().get_yaml_data() + + # Convert the boolean value into a string + if self.title is not None: + yaml_data[self.title]["allow_insecure"] = self.allow_insecure.value + else: + yaml_data["allow_insecure"] = self.allow_insecure.value + + return yaml_data + + def get_certificate_data(self, name_list): + """Get certificate data by name_list. + + Args: + name_list (list): list of certificate attribute names. + + Returns: + data (dict): a dictionary of parameter directory name keys and + value. + + """ + data = {} + if not self.allow_insecure.value: + for name in name_list: + value = getattr(self, name).value + if isinstance(value, str): + dir_name, file_name = os.path.split(value) + if dir_name not in data: + data[dir_name] = [file_name] + else: + data[dir_name].append(file_name) + return data + + def _get_new(self): + """Get a new object based upon this one. + + Returns: + TelemetryConfig: a new TelemetryConfig object + """ + return TelemetryConfig(self.namespace, self.title, self._log_dir) + + class CommonConfig(YamlParameters): """Defines common daos_agent and daos_server configuration file parameters. 
diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index cbca403895a..aad42e662a8 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -10,7 +10,7 @@ from pwd import getpwuid from dmg_utils_base import DmgCommandBase -from dmg_utils_params import DmgTransportCredentials, DmgYamlParameters +from dmg_utils_params import DmgTelemetryConfig, DmgTransportCredentials, DmgYamlParameters from exception_utils import CommandFailure from general_utils import dict_to_str, get_numeric_list @@ -39,7 +39,8 @@ def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hos """ transport_config = DmgTransportCredentials(cert_dir) - config = DmgYamlParameters(config_file, group, transport_config) + telemetry_config = DmgTelemetryConfig(cert_dir) + config = DmgYamlParameters(config_file, group, transport_config, telemetry_config) command = DmgCommand(bin_dir, config, hostlist_suffix) if config_temp: # Setup the DaosServerCommand to write the config file data to the diff --git a/src/tests/ftest/util/dmg_utils_params.py b/src/tests/ftest/util/dmg_utils_params.py index ff9048a3b62..7faa37a0f40 100644 --- a/src/tests/ftest/util/dmg_utils_params.py +++ b/src/tests/ftest/util/dmg_utils_params.py @@ -1,10 +1,11 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ -from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters +from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig, + TransportCredentials, YamlParameters) class DmgTransportCredentials(TransportCredentials): @@ -25,10 +26,27 @@ def _get_new(self): return DmgTransportCredentials(self._log_dir) +class DmgTelemetryConfig(TelemetryConfig): + """Telemetry credentials listing certificates for secure communication.""" + + def __init__(self, log_dir="/tmp"): + """Initialize a TelemetryConfig object.""" + super().__init__("/run/dmg/telemetry_config/*", None, log_dir) + self.https_exception = BasicParameter(None, True) + + def _get_new(self): + """Get a new object based upon this one. + + Returns: + DmgTelemetryConfig: a new DmgTelemetryConfig object + """ + return DmgTelemetryConfig(self._log_dir) + + class DmgYamlParameters(YamlParameters): """Defines the dmg configuration yaml parameters.""" - def __init__(self, filename, name, transport): + def __init__(self, filename, name, transport, telemetry=None): """Initialize a DmgYamlParameters object. Args: @@ -36,6 +54,8 @@ def __init__(self, filename, name, transport): name (str): The DAOS system name. transport (DmgTransportCredentials): dmg security configuration settings. + telemetry (DmgTelemetryConfig): dmg telemetry + configuration settings. """ super().__init__("/run/dmg/*", filename, None, transport) @@ -57,6 +77,9 @@ def __init__(self, filename, name, transport): self.hostlist = BasicParameter(None, "localhost") self.port = BasicParameter(None, 10001) + if telemetry is not None: + self.telemetry_config = telemetry + def _get_new(self): """Get a new object based upon this one. 
diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index 0f7284c50ef..2d33cceac05 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -867,6 +867,7 @@ def _generate_certs(self, logger): message = "Error generating certificates" self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) return 128 + return 0 def _collect_crash_files(self, logger): diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index ec79f029c6e..cd284a39a2a 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -21,7 +21,8 @@ from host_utils import get_local_host from run_utils import run_remote, stop_processes from server_utils_base import DaosServerCommand, DaosServerInformation, ServerFailed -from server_utils_params import DaosServerTransportCredentials, DaosServerYamlParameters +from server_utils_params import (DaosServerTelemetryConfig, DaosServerTransportCredentials, + DaosServerYamlParameters) from user_utils import get_chown_command @@ -45,7 +46,10 @@ def get_server_command(group, cert_dir, bin_dir, config_file, config_temp=None): transport_config = DaosServerTransportCredentials(cert_dir) common_config = CommonConfig(group, transport_config) config = DaosServerYamlParameters(config_file, common_config) + config.telemetry_config = DaosServerTelemetryConfig(cert_dir) + command = DaosServerCommand(bin_dir, config, None) + if config_temp: # Setup the DaosServerCommand to write the config file data to the # temporary file and then copy the file to all the hosts using the @@ -241,6 +245,7 @@ def prepare(self, storage=True): # Copy certificates self.manager.job.copy_certificates(get_log_file("daosCA/certs"), self._hosts) + self.manager.job.generate_telemetry_server_certificates(self._hosts, "daos_server") self._prepare_dmg_certificates() # Prepare dmg for running storage format on all server hosts diff --git 
a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 19dd8ea4df3..050a6cdc17a 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -5,7 +5,8 @@ """ import os -from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters +from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig, + TransportCredentials, YamlParameters) MAX_STORAGE_TIERS = 5 @@ -56,6 +57,32 @@ def _get_new(self): return DaosServerTransportCredentials(self._log_dir) +class DaosServerTelemetryConfig(TelemetryConfig): + # pylint: disable=too-few-public-methods + """Telemetry credentials listing certificates for secure communication.""" + + def __init__(self, log_dir=os.path.join(os.sep, "tmp")): + """Initialize a DaosServerTelemetryConfig object.""" + super().__init__("/run/server_config/telemetry_config/*", None, log_dir) + + # Additional daos_server telemetry credential parameters: + # - port: : Telemetry endpoint port number + # - https_cert: : Server certificate + # - https_key: : Server Key portion + # + self.telemetry_port = BasicParameter(None, 9191) + self.https_cert = LogParameter(self._log_dir, None, "telemetry.crt") + self.https_key = LogParameter(self._log_dir, None, "telemetry.key") + + def _get_new(self): + """Get a new object based upon this one. 
+ + Returns: + DaosServerTelemetryConfig: a new DaosServerTelemetryConfig object + """ + return DaosServerTelemetryConfig(self._log_dir) + + class DaosServerYamlParameters(YamlParameters): """Defines the daos_server configuration yaml parameters.""" @@ -135,7 +162,6 @@ def __init__(self, filename, common_yaml): self.control_log_mask = BasicParameter(None, "DEBUG") self.control_log_file = LogParameter(log_dir, None, "daos_control.log") self.helper_log_file = LogParameter(log_dir, None, "daos_server_helper.log") - self.telemetry_port = BasicParameter(None, 9191) self.client_env_vars = BasicParameter(None) self.mgmt_svc_replicas = BasicParameter(None, ["localhost"]) @@ -147,6 +173,9 @@ def __init__(self, filename, common_yaml): # Control plane metadata parameters. self.metadata_params = ControlMetadataParameters(self.namespace) + # Telemetry Parameters + self.telemetry_config = BasicParameter(None) + # Defines the number of single engine config parameters to define in # the yaml file self.engines_per_host = BasicParameter(None, 0) diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index cc0610af872..d13a359ee8b 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -369,7 +369,9 @@ class TelemetryUtils(): *_gen_stats_metrics("engine_net_swim_delay"), "engine_net_uri_lookup_timeout", "engine_net_uri_lookup_other", - "engine_net_uri_lookup_self"] + "engine_net_uri_lookup_self", + 'engine_net_quota_exceeded', + 'engine_net_waitq_depth'] ENGINE_RANK_METRICS = [ "engine_rank"] ENGINE_NVME_HEALTH_METRICS = [ diff --git a/utils/config/daos_agent.yml b/utils/config/daos_agent.yml index bced9a0447b..f260a910f7d 100644 --- a/utils/config/daos_agent.yml +++ b/utils/config/daos_agent.yml @@ -26,26 +26,35 @@ # default: 10001 #port: 10001 -## Enable HTTP endpoint for remote telemetry collection. -# Note that enabling the endpoint automatically enables -# client telemetry collection. 
+## Enable Telemetry HTTP/HTTPS endpoint for remote client telemetry collection. # -## default endpoint state: disabled -## default endpoint port: 9192 -#telemetry_port: 9192 - -## Enable client telemetry for all DAOS clients. -# If false, clients will need to optionally enable telemetry by setting -# the D_CLIENT_METRICS_ENABLE environment variable to true. +#telemetry_config: +# # Set the client telemetry endpoint port number +# # default: 9192 +# telemetry_port: 9192 # -## default: false -#telemetry_enabled: true - -## Retain client telemetry for a period of time after the client -# process exits. +# # Enable client telemetry for all DAOS clients. +# # If false, clients will need to optionally enable telemetry by setting +# # the D_CLIENT_METRICS_ENABLE environment variable to true. +# # default: false +# telemetry_enabled: true +# +# # Retain client telemetry for a period of time after the client +# # process exits. +# # default 0 (do not retain telemetry after client exit) +# telemetry_retain: 1m +# +# # In order to disable transport security, uncomment and set allow_insecure +# # to true. Not recommended for production configurations. +# allow_insecure: false +# +# # Server certificate for use in TLS handshakes +# # DAOS client is the HTTPS server to open secure telemetry endpoint. +# https_cert: /etc/daos/certs/telemetry.crt # -## default 0 (do not retain telemetry after client exit) -#telemetry_retain: 1m +# # Key portion of Server Certificate +# # DAOS client is the HTTPS server to open secure telemetry endpoint. +# https_key: /etc/daos/certs/telemetry.key ## Configuration for user credential management. 
#credential_config: diff --git a/utils/config/daos_control.yml b/utils/config/daos_control.yml index 5a236052fd7..cd0566d41c2 100644 --- a/utils/config/daos_control.yml +++ b/utils/config/daos_control.yml @@ -38,3 +38,13 @@ # cert: /etc/daos/certs/admin.crt # # Key portion of Admin Certificate # key: /etc/daos/certs/admin.key + +## Configuration for telemetry collection commands. +# +#telemetry_config: +# # In order to enable transport security, uncomment and set allow_insecure +# # to false. +# allow_insecure: true +# +# # Skip the Server certificate verification. Recommended for testing purposes only. +# https_exception: true diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index a9642631f2d..24b2ff3ccd3 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -260,11 +260,21 @@ #firmware_helper_log_file: /tmp/daos_firmware_helper.log # # -## Enable HTTP endpoint for remote telemetry collection. +## Enable Telemetry HTTP/HTTPS endpoint for remote telemetry collection. 
# -## default endpoint state: disabled -## default endpoint port: 9191 -#telemetry_port: 9191 +#telemetry_config: +# # In order to enable telemetry security, uncomment and set allow_insecure to false +# allow_insecure: true +# +# # Set the server telemetry endpoint port number +# # default: 9191 +# telemetry_port: 9191 +# +# # Server certificate for use in TLS handshakes +# https_cert: /etc/daos/certs/telemetry.crt +# +# # Key portion of Server Certificate +# https_key: /etc/daos/certs/telemetry.key # # ## If desired, a set of client-side environment variables may be diff --git a/utils/config/examples/daos_server_local.yml b/utils/config/examples/daos_server_local.yml index ac5bb6ee808..237f9f27799 100644 --- a/utils/config/examples/daos_server_local.yml +++ b/utils/config/examples/daos_server_local.yml @@ -7,7 +7,9 @@ control_log_file: /tmp/daos_server.log transport_config: allow_insecure: true -telemetry_port: 9191 +telemetry_config: + allow_insecure: true + telemetry_port: 9191 engines: - diff --git a/utils/config/examples/daos_server_mdonssd.yml b/utils/config/examples/daos_server_mdonssd.yml index 8b73e53e431..1b46daf7351 100644 --- a/utils/config/examples/daos_server_mdonssd.yml +++ b/utils/config/examples/daos_server_mdonssd.yml @@ -29,7 +29,9 @@ control_metadata: # # Key portion of Server Certificate # key: /etc/daos/certs/server.key -telemetry_port: 9191 +telemetry_config: + allow_insecure: true + telemetry_port: 9191 engines: - diff --git a/utils/config/examples/daos_server_tcp.yml b/utils/config/examples/daos_server_tcp.yml index 38f40d7ec67..475baed1483 100644 --- a/utils/config/examples/daos_server_tcp.yml +++ b/utils/config/examples/daos_server_tcp.yml @@ -10,7 +10,9 @@ provider: ofi+tcp control_log_mask: DEBUG control_log_file: /tmp/daos_server.log -telemetry_port: 9191 +telemetry_config: + allow_insecure: true + telemetry_port: 9191 ## Transport Credentials Specifying certificates to secure communications ## diff --git 
a/utils/config/examples/daos_server_ucx.yml b/utils/config/examples/daos_server_ucx.yml index bd413af495e..9f29dbb5664 100644 --- a/utils/config/examples/daos_server_ucx.yml +++ b/utils/config/examples/daos_server_ucx.yml @@ -19,7 +19,9 @@ provider: ucx+dc_x control_log_mask: INFO control_log_file: /tmp/daos_server.log -telemetry_port: 9191 +telemetry_config: + allow_insecure: true + telemetry_port: 9191 ## Transport Credentials Specifying certificates to secure communications ## diff --git a/utils/config/examples/daos_server_verbs.yml b/utils/config/examples/daos_server_verbs.yml index 32146674739..fc199e52234 100644 --- a/utils/config/examples/daos_server_verbs.yml +++ b/utils/config/examples/daos_server_verbs.yml @@ -10,7 +10,9 @@ provider: ofi+verbs control_log_mask: INFO control_log_file: /tmp/daos_server.log -telemetry_port: 9191 +telemetry_config: + allow_insecure: true + telemetry_port: 9191 ## Transport Credentials Specifying certificates to secure communications ##