diff --git a/docs/admin/administration.md b/docs/admin/administration.md
index 049e98eb81c..d4efdfca0f5 100644
--- a/docs/admin/administration.md
+++ b/docs/admin/administration.md
@@ -286,7 +286,7 @@ written to `$HOME/.prometheus.yml`.
To start the Prometheus server with the configuration file generated by `dmg`:
```
-prometheus --config-file=$HOME/.prometheus.yml
+prometheus --config.file=$HOME/.prometheus.yml
```
## Storage Operations
diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md
index e318397a28d..bddd2edd1d8 100644
--- a/docs/admin/deployment.md
+++ b/docs/admin/deployment.md
@@ -759,6 +759,56 @@ transport_config:
key: /etc/daos/certs/admin.key
```
+#### Telemetry Certificate Configuration
+
+The DAOS telemetry framework has the option to use certificates to authenticate
+between the server/client and the admin node.
+Creating the certificate is outside the scope of DAOS; it is up to the administrator
+to generate the certificate and add it to the DAOS server and client systems.
+
+#### Telemetry Yaml Example
+
+The telemetry configuration parameters for each respective YAML file are shown below.
+
+```yaml
+# /etc/daos/daos_server.yml (servers)
+telemetry_config:
+ # To use telemetry in secure mode
+ allow_insecure: false
+ # Set the server telemetry endpoint port number
+ port: 9191
+ # Server certificate for use in TLS handshakes
+ https_cert: /etc/daos/certs/telemetry.crt
+ # Key portion of Server Certificate
+ https_key: /etc/daos/certs/telemetry.key
+```
+
+```yaml
+# /etc/daos/daos_agent.yml (clients)
+telemetry_config:
+ # To use telemetry in secure mode
+ allow_insecure: false
+ # Enable client telemetry for all DAOS clients.
+ enabled: true
+ # Set the client telemetry endpoint port number
+ port: 9192
+ # Retain client telemetry for a period of time after the client process exits.
+ retain: 30s
+ # Server certificate for use in TLS handshakes
+ https_cert: /etc/daos/certs/telemetry.crt
+ # Key portion of Server Certificate
+ https_key: /etc/daos/certs/telemetry.key
+```
+
+```yaml
+# /etc/daos/daos_control.yml (dmg/admin)
+telemetry_config:
+ # To use telemetry in secure mode
+ allow_insecure: false
+ # Skip the server certificate verification. Recommended for testing purposes only.
+ https_exception: true
+```
+
### Server Startup
The DAOS Server is started as a systemd service. The DAOS Server
diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go
index 3263850df51..61b4e00a1e9 100644
--- a/src/control/cmd/daos_agent/config.go
+++ b/src/control/cmd/daos_agent/config.go
@@ -58,9 +58,11 @@ type Config struct {
IncludeFabricIfaces common.StringSet `yaml:"include_fabric_ifaces,omitempty"`
FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"`
ProviderIdx uint // TODO SRS-31: Enable with multiprovider functionality
- TelemetryPort int `yaml:"telemetry_port,omitempty"`
- TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"`
- TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"`
+ TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"`
+ // Support Old config options.
+ TelemetryPort int `yaml:"telemetry_port,omitempty"`
+ TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"`
+ TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"`
}
// Validate performs basic validation of the configuration.
@@ -73,11 +75,24 @@ func (c *Config) Validate() error {
return fmt.Errorf("invalid system name: %s", c.SystemName)
}
- if c.TelemetryRetain > 0 && c.TelemetryPort == 0 {
+ // Support the old config options by copying them into the underlying new structure values.
+ if c.TelemetryRetain > 0 {
+ c.TelemetryConfig.Retain = c.TelemetryRetain
+ }
+
+ if c.TelemetryPort != 0 {
+ c.TelemetryConfig.Port = c.TelemetryPort
+ }
+
+ if c.TelemetryEnabled {
+ c.TelemetryConfig.Enabled = c.TelemetryEnabled
+ }
+
+ if c.TelemetryConfig.Retain > 0 && c.TelemetryConfig.Port == 0 {
return errors.New("telemetry_retain requires telemetry_port")
}
- if c.TelemetryEnabled && c.TelemetryPort == 0 {
+ if c.TelemetryConfig.Enabled && c.TelemetryConfig.Port == 0 {
return errors.New("telemetry_enabled requires telemetry_port")
}
@@ -90,7 +105,7 @@ func (c *Config) Validate() error {
// TelemetryExportEnabled returns true if client telemetry export is enabled.
func (c *Config) TelemetryExportEnabled() bool {
- return c.TelemetryPort > 0
+ return c.TelemetryConfig.Port > 0
}
// NUMAFabricConfig defines a list of fabric interfaces that belong to a NUMA
@@ -125,6 +140,12 @@ func LoadConfig(cfgPath string) (*Config, error) {
return nil, errors.Wrap(err, "agent config validation failed")
}
+ if !cfg.TelemetryConfig.AllowInsecure {
+ if cfg.TelemetryConfig.HttpsCert == "" || cfg.TelemetryConfig.HttpsKey == "" {
+ return nil, errors.New("For secure mode, https_cert and https_key required under telemetry_config")
+ }
+ }
+
return cfg, nil
}
@@ -139,5 +160,6 @@ func DefaultConfig() *Config {
LogLevel: common.DefaultControlLogLevel,
TransportConfig: security.DefaultAgentTransportConfig(),
CredentialConfig: &security.CredentialConfig{},
+ TelemetryConfig: security.DefaultClientTelemetryConfig(),
}
}
diff --git a/src/control/cmd/daos_agent/config_test.go b/src/control/cmd/daos_agent/config_test.go
index 59a51c5709d..34a83ba2dbc 100644
--- a/src/control/cmd/daos_agent/config_test.go
+++ b/src/control/cmd/daos_agent/config_test.go
@@ -100,6 +100,58 @@ include_fabric_ifaces: ["ib0"]
exclude_fabric_ifaces: ["ib3"]
`)
+ telemetryRetainWithBadPort := test.CreateTestFile(t, dir, `
+
+control_log_mask: debug
+transport_config:
+ allow_insecure: true
+telemetry_config:
+ telemetry_retain: 1m
+ telemetry_port: 0
+`)
+
+ telemetryEnabledWithBadPort := test.CreateTestFile(t, dir, `
+name: shire
+access_points: ["one:10001", "two:10001"]
+port: 4242
+runtime_dir: /tmp/runtime
+log_file: /home/frodo/logfile
+control_log_mask: debug
+transport_config:
+ allow_insecure: true
+telemetry_config:
+ telemetry_enabled: true
+ telemetry_port: 0
+`)
+
+ telemetryWithoutHttpsCert := test.CreateTestFile(t, dir, `
+name: shire
+access_points: ["one:10001", "two:10001"]
+port: 4242
+runtime_dir: /tmp/runtime
+log_file: /home/frodo/logfile
+control_log_mask: debug
+transport_config:
+ allow_insecure: true
+telemetry_config:
+ allow_insecure: false
+ https_cert: ""
+`)
+
+ telemetryWithoutHttpsKey := test.CreateTestFile(t, dir, `
+name: shire
+access_points: ["one:10001", "two:10001"]
+port: 4242
+runtime_dir: /tmp/runtime
+log_file: /home/frodo/logfile
+control_log_mask: debug
+transport_config:
+ allow_insecure: true
+telemetry_config:
+ allow_insecure: false
+ https_key: ""
+`)
+
for name, tc := range map[string]struct {
path string
expResult *Config
@@ -120,6 +172,22 @@ exclude_fabric_ifaces: ["ib3"]
path: emptyFile,
expResult: DefaultConfig(),
},
+ "telemetry retain with no port": {
+ path: telemetryRetainWithBadPort,
+ expErr: errors.New("telemetry_retain requires telemetry_port"),
+ },
+ "telemetry enabled with no port": {
+ path: telemetryEnabledWithBadPort,
+ expErr: errors.New("telemetry_enabled requires telemetry_port"),
+ },
+ "telemetry with secure mode with no server certificate": {
+ path: telemetryWithoutHttpsCert,
+ expErr: errors.New("For secure mode, https_cert and https_key required under telemetry_config"),
+ },
+ "telemetry with secure mode with no server key": {
+ path: telemetryWithoutHttpsKey,
+ expErr: errors.New("For secure mode, https_cert and https_key required under telemetry_config"),
+ },
"without optional items": {
path: withoutOptCfg,
expResult: &Config{
@@ -134,6 +202,7 @@ exclude_fabric_ifaces: ["ib3"]
AllowInsecure: true,
CertificateConfig: DefaultConfig().TransportConfig.CertificateConfig,
},
+ TelemetryConfig: security.DefaultClientTelemetryConfig(),
},
},
"bad log mask": {
@@ -170,6 +239,7 @@ exclude_fabric_ifaces: ["ib3"]
AllowInsecure: true,
CertificateConfig: DefaultConfig().TransportConfig.CertificateConfig,
},
+ TelemetryConfig: security.DefaultClientTelemetryConfig(),
ExcludeFabricIfaces: common.NewStringSet("ib3"),
FabricInterfaces: []*NUMAFabricConfig{
{
diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go
index 9f1e8139f40..4fce2b47200 100644
--- a/src/control/cmd/daos_agent/infocache.go
+++ b/src/control/cmd/daos_agent/infocache.go
@@ -49,8 +49,8 @@ func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryI
devStateGetter: network.DefaultNetDevStateProvider(log),
}
- ic.clientTelemetryEnabled.Store(cfg.TelemetryEnabled)
- ic.clientTelemetryRetain.Store(cfg.TelemetryRetain > 0)
+ ic.clientTelemetryEnabled.Store(cfg.TelemetryConfig.Enabled)
+ ic.clientTelemetryRetain.Store(cfg.TelemetryConfig.Retain > 0)
if cfg.DisableCache {
ic.DisableAttachInfoCache()
diff --git a/src/control/cmd/daos_agent/infocache_test.go b/src/control/cmd/daos_agent/infocache_test.go
index 1f658055115..300dd8232db 100644
--- a/src/control/cmd/daos_agent/infocache_test.go
+++ b/src/control/cmd/daos_agent/infocache_test.go
@@ -25,6 +25,7 @@ import (
"github.com/daos-stack/daos/src/control/lib/hardware"
"github.com/daos-stack/daos/src/control/lib/telemetry"
"github.com/daos-stack/daos/src/control/logging"
+ "github.com/daos-stack/daos/src/control/security"
)
type testInfoCacheParams struct {
@@ -539,7 +540,7 @@ func TestAgent_NewInfoCache(t *testing.T) {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
defer test.ShowBufferOnFailure(t, buf)
-
+ tc.cfg.TelemetryConfig = security.DefaultClientTelemetryConfig()
ic := NewInfoCache(test.Context(t), log, nil, tc.cfg)
test.AssertEqual(t, tc.expEnabled, ic.IsAttachInfoCacheEnabled(), "")
diff --git a/src/control/cmd/daos_agent/telemetry.go b/src/control/cmd/daos_agent/telemetry.go
index 4c0e2d35b4c..60bd83d0b33 100644
--- a/src/control/cmd/daos_agent/telemetry.go
+++ b/src/control/cmd/daos_agent/telemetry.go
@@ -17,11 +17,14 @@ import (
func startPrometheusExporter(ctx context.Context, log logging.Logger, cs *promexp.ClientSource, cfg *Config) (func(), error) {
expCfg := &promexp.ExporterConfig{
- Port: cfg.TelemetryPort,
- Title: "DAOS Client Telemetry",
+ Port: cfg.TelemetryConfig.Port,
+ Title: "DAOS Client Telemetry",
+ AllowInsecure: cfg.TelemetryConfig.AllowInsecure,
+ HttpsCert: cfg.TelemetryConfig.HttpsCert,
+ HttpsKey: cfg.TelemetryConfig.HttpsKey,
Register: func(ctx context.Context, log logging.Logger) error {
c, err := promexp.NewClientCollector(ctx, log, cs, &promexp.CollectorOpts{
- RetainDuration: cfg.TelemetryRetain,
+ RetainDuration: cfg.TelemetryConfig.Retain,
})
if err != nil {
return err
diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go
index 800fbed2a65..2c9b5701e2b 100644
--- a/src/control/cmd/dmg/auto_test.go
+++ b/src/control/cmd/dmg/auto_test.go
@@ -595,6 +595,10 @@ system_ram_reserved: 26
disable_hugepages: false
control_log_mask: INFO
control_log_file: /tmp/daos_server.log
+telemetry_config:
+ allow_insecure: true
+ https_cert: /etc/daos/certs/telemetry.crt
+ https_key: /etc/daos/certs/telemetry.key
core_dump_filter: 19
name: daos_server
socket_dir: /var/run/daos_server
diff --git a/src/control/cmd/dmg/main.go b/src/control/cmd/dmg/main.go
index c88845a304c..e3bd8425663 100644
--- a/src/control/cmd/dmg/main.go
+++ b/src/control/cmd/dmg/main.go
@@ -262,6 +262,7 @@ and access control settings, along with system wide operations.`
if opts.Insecure {
ctlCfg.TransportConfig.AllowInsecure = true
+ ctlCfg.TelemetryConfig.AllowInsecure = true
}
if err := ctlCfg.TransportConfig.PreLoadCertData(); err != nil {
return errors.Wrap(err, "Unable to load Certificate Data")
diff --git a/src/control/cmd/dmg/telemetry.go b/src/control/cmd/dmg/telemetry.go
index aadd24930ae..5806e44cef4 100644
--- a/src/control/cmd/dmg/telemetry.go
+++ b/src/control/cmd/dmg/telemetry.go
@@ -196,11 +196,17 @@ type (
Targets []string `yaml:"targets,omitempty"`
}
+ tlsConfig struct {
+ InsecureSkipVerify bool `yaml:"insecure_skip_verify,omitempty"`
+ }
+
scrapeConfig struct {
JobName string `yaml:"job_name"`
ScrapeInterval time.Duration `yaml:"scrape_interval,omitempty"`
ScrapeTimeout time.Duration `yaml:"scrape_timeout,omitempty"`
StaticConfigs []*staticConfig `yaml:"static_configs,omitempty"`
+ Scheme string `yaml:"scheme,omitempty"`
+ TlsConfig tlsConfig `yaml:"tls_config,omitempty"`
}
promCfg struct {
@@ -258,11 +264,23 @@ func (cmd *telemConfigCmd) configurePrometheus() (*installInfo, error) {
return nil, err
}
+ tc := tlsConfig{}
+ scheme := ""
+ if !cmd.cfgCmd.config.TelemetryConfig.AllowInsecure {
+ cmd.Infof("Prometheus configuration is setup as Secure (https) mode")
+ tc.InsecureSkipVerify = cmd.cfgCmd.config.TelemetryConfig.HttpsException
+ scheme = "https"
+ } else {
+ cmd.Infof("Prometheus configuration is setup as insecure (http) mode")
+ }
+
cfg.ScrapeConfigs = []*scrapeConfig{
{
JobName: "daos",
ScrapeInterval: 5 * time.Second,
StaticConfigs: []*staticConfig{sc},
+ Scheme: scheme,
+ TlsConfig: tc,
},
}
@@ -300,6 +318,7 @@ type metricsCmd struct {
// metricsListCmd provides a list of metrics available from the requested DAOS servers.
type metricsListCmd struct {
baseCmd
+ cfgCmd
cmdutil.JSONOutputCmd
singleHostCmd
Port uint32 `short:"p" long:"port" default:"9191" description:"Telemetry port on the host"`
@@ -315,14 +334,30 @@ func (cmd *metricsListCmd) Execute(args []string) error {
req := new(control.MetricsListReq)
req.Port = cmd.Port
req.Host = host
+ req.HttpsException = cmd.cfgCmd.config.TelemetryConfig.HttpsException
if !cmd.JSONOutputEnabled() {
cmd.Info(getConnectingMsg(req.Host, req.Port))
}
+ // Try secure mode first. It will either ignore the certificate if it is not provided
+ // or make the request with the certificate.
+ if req.AllowInsecure {
+ cmd.Debug("Trying Secure Mode (HTTPS) with Exception")
+ } else {
+ cmd.Debug("Trying Secure Mode (HTTPS) with system certificate")
+ }
+
resp, err := control.MetricsList(cmd.MustLogCtx(), req)
if err != nil {
- return err
+ cmd.Errorf("Secure Mode (HTTPS) failed: %s", err.Error())
+ //Trying Insecure Mode
+ req.AllowInsecure = !req.AllowInsecure
+ cmd.Debug("Trying Insecure Mode (HTTP)")
+ resp, err = control.MetricsList(cmd.MustLogCtx(), req)
+ if err != nil {
+ return err
+ }
}
if cmd.JSONOutputEnabled() {
@@ -354,6 +389,7 @@ func getConnectingMsg(host string, port uint32) string {
// metricsQueryCmd collects the requested metrics from the requested DAOS servers.
type metricsQueryCmd struct {
baseCmd
+ cfgCmd
cmdutil.JSONOutputCmd
singleHostCmd
Port uint32 `short:"p" long:"port" default:"9191" description:"Telemetry port on the host"`
@@ -370,15 +406,28 @@ func (cmd *metricsQueryCmd) Execute(args []string) error {
req := new(control.MetricsQueryReq)
req.Port = cmd.Port
req.Host = host
+ req.HttpsException = cmd.cfgCmd.config.TelemetryConfig.HttpsException
req.MetricNames = common.TokenizeCommaSeparatedString(cmd.Metrics)
if !cmd.JSONOutputEnabled() {
cmd.Info(getConnectingMsg(req.Host, req.Port))
}
+ // Try secure mode first. It will either ignore the certificate if it is not provided
+ // or make the request with the certificate.
+ req.AllowInsecure = false
+ cmd.Debug("Trying Secure Mode (HTTPS) first, with system certificate")
+
resp, err := control.MetricsQuery(cmd.MustLogCtx(), req)
if err != nil {
- return err
+ cmd.Errorf("Secure Mode (HTTPS) failed: %s", err.Error())
+ //Trying Insecure Mode
+ req.AllowInsecure = !req.AllowInsecure
+ cmd.Debug("Trying Insecure Mode (HTTP)")
+ resp, err = control.MetricsQuery(cmd.MustLogCtx(), req)
+ if err != nil {
+ return err
+ }
}
if cmd.JSONOutputEnabled() {
diff --git a/src/control/lib/control/config.go b/src/control/lib/control/config.go
index b321e5df234..e4a5452b841 100644
--- a/src/control/lib/control/config.go
+++ b/src/control/lib/control/config.go
@@ -28,6 +28,7 @@ type Config struct {
ControlPort int `yaml:"port"`
HostList []string `yaml:"hostlist"`
TransportConfig *security.TransportConfig `yaml:"transport_config"`
+ TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"`
Path string `yaml:"-"`
}
@@ -40,6 +41,7 @@ func DefaultConfig() *Config {
ControlPort: build.DefaultControlPort,
HostList: []string{localServer},
TransportConfig: security.DefaultClientTransportConfig(),
+ TelemetryConfig: security.DefaultClientTelemetryConfig(),
}
}
diff --git a/src/control/lib/control/http.go b/src/control/lib/control/http.go
index 69a2f287c1f..929719c90f5 100644
--- a/src/control/lib/control/http.go
+++ b/src/control/lib/control/http.go
@@ -8,6 +8,8 @@ package control
import (
"context"
+ "crypto/tls"
+ "crypto/x509"
"fmt"
"io"
"net/http"
@@ -37,12 +39,16 @@ type httpGetter interface {
retryer
getURL() *url.URL
getBody(context.Context) ([]byte, error)
+ getAllowInsecure() bool
+ getHttpsException() bool
}
type httpReq struct {
- url *url.URL
- getFn httpGetFn
- getBodyFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error)
+ url *url.URL
+ getFn httpGetFn
+ allowInsecure bool
+ httpsException bool
+ getBodyFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error)
}
func (r *httpReq) canRetry(err error, cur uint) bool {
@@ -77,6 +83,14 @@ func (r *httpReq) getURL() *url.URL {
return r.url
}
+func (r *httpReq) getAllowInsecure() bool {
+ return r.allowInsecure
+}
+
+func (r *httpReq) getHttpsException() bool {
+ return r.httpsException
+}
+
func (r *httpReq) httpGetFunc() httpGetFn {
if r.getFn == nil {
r.getFn = http.Get
@@ -88,7 +102,8 @@ func (r *httpReq) getBody(ctx context.Context) ([]byte, error) {
if r.getBodyFn == nil {
r.getBodyFn = httpGetBody
}
- return r.getBodyFn(ctx, r.getURL(), r.httpGetFunc(), r.getRetryTimeout())
+
+ return r.getBodyFn(ctx, r.getURL(), r.httpGetFunc(), r.getRetryTimeout(), r.getAllowInsecure(), r.getHttpsException())
}
func httpGetBodyRetry(ctx context.Context, req httpGetter) ([]byte, error) {
@@ -113,9 +128,34 @@ func httpGetBodyRetry(ctx context.Context, req httpGetter) ([]byte, error) {
return result, err
}
+// httpsSecureGetFunc returns an HTTP GET function backed by a client whose TLS
+// configuration trusts the system root CA pool. When httpsException is true,
+// server certificate verification is skipped (InsecureSkipVerify).
+func httpsSecureGetFunc(httpsException bool) (httpGetFn, error) {
+	rootCAs, err := x509.SystemCertPool()
+	if err != nil {
+		// Surface the underlying cause instead of discarding it.
+		return nil, fmt.Errorf("Failed to load system root certificates: %w", err)
+	}
+	if rootCAs == nil {
+		return nil, errors.New("Failed to load system root certificates")
+	}
+
+	client := &http.Client{
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{
+				RootCAs:            rootCAs,
+				InsecureSkipVerify: httpsException,
+			},
+		},
+	}
+
+	return client.Get, nil
+}
+
// httpGetBody executes a simple HTTP GET request to a given URL and returns the
// content of the response body.
-func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time.Duration) ([]byte, error) {
+func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time.Duration, allowInsecure bool, httpsException bool) ([]byte, error) {
if url == nil {
return nil, errors.New("nil URL")
}
@@ -128,19 +168,25 @@ func httpGetBody(ctx context.Context, url *url.URL, get httpGetFn, timeout time.
return nil, errors.New("nil get function")
}
+ if allowInsecure == false {
+ var err error
+ get, err = httpsSecureGetFunc(httpsException)
+ if err != nil {
+ return nil, err
+ }
+ }
+
httpCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
respChan := make(chan *http.Response)
errChan := make(chan error)
-
go func() {
httpResp, err := get(url.String())
if err != nil {
errChan <- err
return
}
-
respChan <- httpResp
}()
diff --git a/src/control/lib/control/http_test.go b/src/control/lib/control/http_test.go
index 6f28a0c4ce4..c9dfb6015d7 100644
--- a/src/control/lib/control/http_test.go
+++ b/src/control/lib/control/http_test.go
@@ -1,5 +1,5 @@
//
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
@@ -123,35 +123,43 @@ func newErrMockReadCloser(err error) *mockReadCloser {
func TestControl_httpGetBody(t *testing.T) {
defaultURL := &url.URL{Host: "testhost"}
+ defaultAllowInsecure := true
+ falseAllowInsecure := false
for name, tc := range map[string]struct {
- url *url.URL
- timeout time.Duration
- cancelCtx bool
- getFn httpGetFn
- expResult []byte
- expErr error
+ url *url.URL
+ timeout time.Duration
+ cancelCtx bool
+ getFn httpGetFn
+ allowInsecure bool
+ httpsException bool
+ expResult []byte
+ expErr error
}{
"nil url": {
expErr: errors.New("nil URL"),
},
"empty URL": {
- url: &url.URL{},
- expErr: errors.New("host address is required"),
+ url: &url.URL{},
+ allowInsecure: defaultAllowInsecure,
+ expErr: errors.New("host address is required"),
},
"nil getFn": {
- url: defaultURL,
- expErr: errors.New("nil get function"),
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
+ expErr: errors.New("nil get function"),
},
"getFn error": {
- url: defaultURL,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
getFn: func(_ string) (*http.Response, error) {
return nil, errors.New("mock getFn")
},
expErr: errors.New("mock getFn"),
},
"http.Response error": {
- url: defaultURL,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
getFn: func(_ string) (*http.Response, error) {
return &http.Response{
StatusCode: http.StatusNotFound,
@@ -161,7 +169,8 @@ func TestControl_httpGetBody(t *testing.T) {
expErr: errors.New("HTTP response error: 404 Not Found"),
},
"empty body": {
- url: defaultURL,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
getFn: func(_ string) (*http.Response, error) {
return &http.Response{
StatusCode: http.StatusOK,
@@ -171,7 +180,8 @@ func TestControl_httpGetBody(t *testing.T) {
expResult: []byte{},
},
"success with body": {
- url: defaultURL,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
getFn: func(_ string) (*http.Response, error) {
return &http.Response{
StatusCode: http.StatusOK,
@@ -180,8 +190,19 @@ func TestControl_httpGetBody(t *testing.T) {
},
expResult: []byte("this is the body of an HTTP response"),
},
+ "failure with body in secure mode without CA certificate path": {
+ url: defaultURL,
+ allowInsecure: falseAllowInsecure,
+ getFn: func(_ string) (*http.Response, error) {
+ return &http.Response{
+ StatusCode: http.StatusOK,
+ }, nil
+ },
+ expErr: errors.New("Get \"//testhost\": unsupported protocol scheme"),
+ },
"reading body fails": {
- url: defaultURL,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
getFn: func(_ string) (*http.Response, error) {
return &http.Response{
StatusCode: http.StatusOK,
@@ -191,8 +212,9 @@ func TestControl_httpGetBody(t *testing.T) {
expErr: errors.New("reading HTTP response body: mock Read"),
},
"request times out": {
- url: defaultURL,
- timeout: 5 * time.Millisecond,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
+ timeout: 5 * time.Millisecond,
getFn: func(_ string) (*http.Response, error) {
time.Sleep(1 * time.Second)
return &http.Response{
@@ -203,8 +225,9 @@ func TestControl_httpGetBody(t *testing.T) {
expErr: HTTPReqTimedOut(defaultURL.String()),
},
"request canceled": {
- url: defaultURL,
- cancelCtx: true,
+ url: defaultURL,
+ allowInsecure: defaultAllowInsecure,
+ cancelCtx: true,
getFn: func(_ string) (*http.Response, error) {
time.Sleep(1 * time.Second)
return &http.Response{
@@ -229,7 +252,7 @@ func TestControl_httpGetBody(t *testing.T) {
tc.timeout = time.Second
}
- result, err := httpGetBody(ctx, tc.url, tc.getFn, tc.timeout)
+ result, err := httpGetBody(ctx, tc.url, tc.getFn, tc.timeout, tc.allowInsecure, tc.httpsException)
test.CmpErr(t, tc.expErr, err)
if diff := cmp.Diff(tc.expResult, result); diff != "" {
@@ -247,6 +270,7 @@ type mockHTTPGetter struct {
getBodyErr error
getBodyCalled uint
getBodyFailures uint
+ httpsException bool
}
func (r *mockHTTPGetter) canRetry(err error, cur uint) bool {
@@ -273,6 +297,14 @@ func (r *mockHTTPGetter) getURL() *url.URL {
}
}
+func (r *mockHTTPGetter) getAllowInsecure() bool {
+ return true
+}
+
+func (r *mockHTTPGetter) getHttpsException() bool {
+ return true
+}
+
func (r *mockHTTPGetter) getBody(ctx context.Context) ([]byte, error) {
r.getBodyCalled++
if r.getBodyCalled <= r.getBodyFailures {
diff --git a/src/control/lib/control/telemetry.go b/src/control/lib/control/telemetry.go
index 919e54ff284..108453584d6 100644
--- a/src/control/lib/control/telemetry.go
+++ b/src/control/lib/control/telemetry.go
@@ -14,6 +14,7 @@ import (
"strings"
"github.com/daos-stack/daos/src/control/lib/daos"
+ "github.com/daos-stack/daos/src/control/logging"
"github.com/pkg/errors"
pclient "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
@@ -32,9 +33,14 @@ func (m pbMetricMap) Keys() []string {
return keys
}
-func getMetricsURL(host string, port uint32) *url.URL {
+func getMetricsURL(host string, port uint32, allowinsecure bool) *url.URL {
+ scheme := "https"
+ if allowinsecure {
+ scheme = "http"
+ }
+
return &url.URL{
- Scheme: "http",
+ Scheme: scheme,
Host: fmt.Sprintf("%s:%d", host, port),
Path: "metrics",
}
@@ -78,8 +84,11 @@ type (
// MetricsListReq is used to request the list of metrics.
MetricsListReq struct {
httpReq
- Host string // Host to query for telemetry data
- Port uint32 // Port to use for collecting telemetry data
+ Host string // Host to query for telemetry data
+ Port uint32 // Port to use for collecting telemetry data
+ AllowInsecure bool // Set the https end point secure
+ HttpsException bool // Use the Https with Exception (Insecure)
+ Log logging.Logger // Logging the info
}
// MetricsListResp contains the list of available metrics.
@@ -102,8 +111,9 @@ func MetricsList(ctx context.Context, req *MetricsListReq) (*MetricsListResp, er
return nil, errors.New("port must be specified")
}
- req.url = getMetricsURL(req.Host, req.Port)
-
+ req.allowInsecure = req.AllowInsecure
+ req.httpsException = req.HttpsException
+ req.url = getMetricsURL(req.Host, req.Port, req.allowInsecure)
scraped, err := scrapeMetrics(ctx, req)
if err != nil {
return nil, errors.Wrap(err, "unable to list metrics")
@@ -130,9 +140,11 @@ type (
// MetricsQueryReq is used to query telemetry values.
MetricsQueryReq struct {
httpReq
- Host string // host to query for telemetry data
- Port uint32 // port to use for collecting telemetry data
- MetricNames []string // if empty, collects all metrics
+ Host string // host to query for telemetry data
+ Port uint32 // port to use for collecting telemetry data
+ AllowInsecure bool // Set the https end point secure
+ HttpsException bool // Use the Https with Exception (Insecure)
+ MetricNames []string // if empty, collects all metrics
}
// MetricsQueryResp contains the list of telemetry values per host.
@@ -155,8 +167,9 @@ func MetricsQuery(ctx context.Context, req *MetricsQueryReq) (*MetricsQueryResp,
return nil, errors.New("port must be specified")
}
- req.url = getMetricsURL(req.Host, req.Port)
-
+ req.allowInsecure = req.AllowInsecure
+ req.httpsException = req.HttpsException
+ req.url = getMetricsURL(req.Host, req.Port, req.allowInsecure)
scraped, err := scrapeMetrics(ctx, req)
if err != nil {
return nil, errors.Wrap(err, "unable to query metrics")
diff --git a/src/control/lib/control/telemetry_test.go b/src/control/lib/control/telemetry_test.go
index 12076bf8e01..af0528be5f5 100644
--- a/src/control/lib/control/telemetry_test.go
+++ b/src/control/lib/control/telemetry_test.go
@@ -118,10 +118,10 @@ func newTestPBHistogram(numBuckets int) *pclient.Metric {
return metric
}
-func mockScrapeFnSuccess(t *testing.T, metricFam ...*pclient.MetricFamily) func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+func mockScrapeFnSuccess(t *testing.T, metricFam ...*pclient.MetricFamily) func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
t.Helper()
- return func(_ context.Context, _ *url.URL, _ httpGetFn, _ time.Duration) ([]byte, error) {
+ return func(_ context.Context, _ *url.URL, _ httpGetFn, _ time.Duration, _ bool, _ bool) ([]byte, error) {
var b strings.Builder
for _, mf := range metricFam {
_, err := expfmt.MetricFamilyToText(&b, mf)
@@ -147,12 +147,12 @@ func TestControl_scrapeMetrics(t *testing.T) {
for name, tc := range map[string]struct {
req httpGetter
- scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error)
+ scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error)
expResult pbMetricMap
expErr error
}{
"check scrape params": {
- scrapeFn: func(_ context.Context, url *url.URL, getter httpGetFn, timeout time.Duration) ([]byte, error) {
+ scrapeFn: func(_ context.Context, url *url.URL, getter httpGetFn, timeout time.Duration, allowInsecure bool, httpsException bool) ([]byte, error) {
test.AssertEqual(t, testURL.Scheme, url.Scheme, "")
test.AssertEqual(t, testURL.Host, url.Host, "")
test.AssertEqual(t, testURL.Path, url.Path, "")
@@ -166,19 +166,19 @@ func TestControl_scrapeMetrics(t *testing.T) {
expResult: pbMetricMap{},
},
"HTTP scrape error": {
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return nil, errors.New("mock scrape")
},
expErr: errors.New("mock scrape"),
},
"scrape returns no content": {
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return []byte{}, nil
},
expResult: pbMetricMap{},
},
"scrape returns bad content": {
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return []byte("
Hello world
"), nil
},
expErr: errors.New("parsing error"),
@@ -217,7 +217,7 @@ func TestControl_MetricsList(t *testing.T) {
}
for name, tc := range map[string]struct {
- scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error)
+ scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error)
req *MetricsListReq
expResp *MetricsListResp
expErr error
@@ -237,20 +237,22 @@ func TestControl_MetricsList(t *testing.T) {
},
"scrape failed": {
req: &MetricsListReq{
- Host: "host1",
- Port: 1066,
+ Host: "host1",
+ Port: 1066,
+ AllowInsecure: true,
},
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return nil, errors.New("mock scrape")
},
expErr: errors.New("mock scrape"),
},
"no metrics": {
req: &MetricsListReq{
- Host: "host1",
- Port: 8888,
+ Host: "host1",
+ Port: 8888,
+ AllowInsecure: true,
},
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return []byte{}, nil
},
expResp: &MetricsListResp{
@@ -259,8 +261,9 @@ func TestControl_MetricsList(t *testing.T) {
},
"success": {
req: &MetricsListReq{
- Host: "host1",
- Port: 7777,
+ Host: "host1",
+ Port: 7777,
+ AllowInsecure: true,
},
scrapeFn: mockScrapeFnSuccess(t, testMetricFam...),
expResp: &MetricsListResp{
@@ -281,7 +284,7 @@ func TestControl_MetricsList(t *testing.T) {
} {
t.Run(name, func(t *testing.T) {
if tc.scrapeFn == nil {
- tc.scrapeFn = func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ tc.scrapeFn = func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return nil, nil
}
}
@@ -429,7 +432,7 @@ func TestControl_MetricsQuery(t *testing.T) {
}
for name, tc := range map[string]struct {
- scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error)
+ scrapeFn func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error)
req *MetricsQueryReq
expResp *MetricsQueryResp
expErr error
@@ -449,20 +452,22 @@ func TestControl_MetricsQuery(t *testing.T) {
},
"scrape failed": {
req: &MetricsQueryReq{
- Host: "host1",
- Port: 1066,
+ Host: "host1",
+ Port: 1066,
+ AllowInsecure: true,
},
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return nil, errors.New("mock scrape")
},
expErr: errors.New("mock scrape"),
},
"no metrics": {
req: &MetricsQueryReq{
- Host: "host1",
- Port: 8888,
+ Host: "host1",
+ Port: 8888,
+ AllowInsecure: true,
},
- scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration) ([]byte, error) {
+ scrapeFn: func(context.Context, *url.URL, httpGetFn, time.Duration, bool, bool) ([]byte, error) {
return []byte{}, nil
},
expResp: &MetricsQueryResp{
@@ -471,8 +476,9 @@ func TestControl_MetricsQuery(t *testing.T) {
},
"all metrics": {
req: &MetricsQueryReq{
- Host: "host1",
- Port: 7777,
+ Host: "host1",
+ Port: 7777,
+ AllowInsecure: true,
},
scrapeFn: mockScrapeFnSuccess(t, testMetricFam...),
expResp: &MetricsQueryResp{
@@ -532,9 +538,10 @@ func TestControl_MetricsQuery(t *testing.T) {
},
"selected metrics": {
req: &MetricsQueryReq{
- Host: "host1",
- Port: 7777,
- MetricNames: []string{"my_generic", "my_counter"},
+ Host: "host1",
+ Port: 7777,
+ AllowInsecure: true,
+ MetricNames: []string{"my_generic", "my_counter"},
},
scrapeFn: mockScrapeFnSuccess(t, testMetricFam...),
expResp: &MetricsQueryResp{
@@ -560,9 +567,10 @@ func TestControl_MetricsQuery(t *testing.T) {
},
"invalid metric name": {
req: &MetricsQueryReq{
- Host: "host1",
- Port: 7777,
- MetricNames: []string{"my_generic", "fake"},
+ Host: "host1",
+ Port: 7777,
+ AllowInsecure: true,
+ MetricNames: []string{"my_generic", "fake"},
},
scrapeFn: mockScrapeFnSuccess(t, testMetricFam...),
expErr: errors.New("metric \"fake\" not found"),
diff --git a/src/control/lib/telemetry/promexp/httpd.go b/src/control/lib/telemetry/promexp/httpd.go
index 2f4c86d485d..0f6a0920dd8 100644
--- a/src/control/lib/telemetry/promexp/httpd.go
+++ b/src/control/lib/telemetry/promexp/httpd.go
@@ -29,9 +29,12 @@ type (
// ExporterConfig defines the configuration for the Prometheus exporter.
ExporterConfig struct {
-		Port     int
-		Title    string
-		Register RegMonFn
+		Port          int
+		Title         string
+		Register      RegMonFn
+		AllowInsecure bool   // true: serve via ListenAndServe (http); false: via ListenAndServeTLS (https)
+		HttpsCert     string // certificate file passed to ListenAndServeTLS
+		HttpsKey      string // key file passed to ListenAndServeTLS
}
)
@@ -82,8 +85,15 @@ func StartExporter(ctx context.Context, log logging.Logger, cfg *ExporterConfig)
// http listener is a blocking call
go func() {
log.Infof("Listening on %s", listenAddress)
- err := srv.ListenAndServe()
- log.Infof("Prometheus web exporter stopped: %s", err.Error())
+ if cfg.AllowInsecure {
+ log.Infof("Prometheus web exporter started with insecure (http) mode")
+ err := srv.ListenAndServe()
+ log.Infof("Prometheus web exporter stopped: %s", err.Error())
+ } else {
+ log.Infof("Prometheus web exporter started with secure (https) mode")
+ err := srv.ListenAndServeTLS(cfg.HttpsCert, cfg.HttpsKey)
+ log.Infof("Prometheus web exporter stopped: %s", err.Error())
+ }
}()
return func() {
diff --git a/src/control/lib/telemetry/promexp/httpd_test.go b/src/control/lib/telemetry/promexp/httpd_test.go
index db69e122b71..8504b4e47c9 100644
--- a/src/control/lib/telemetry/promexp/httpd_test.go
+++ b/src/control/lib/telemetry/promexp/httpd_test.go
@@ -70,7 +70,9 @@ func TestPromExp_StartExporter(t *testing.T) {
if tc.cfg != nil {
tc.cfg.Title = t.Name()
+ tc.cfg.AllowInsecure = true
}
+
cleanup, err := promexp.StartExporter(test.Context(t), log, tc.cfg)
test.CmpErr(t, tc.expErr, err)
if tc.expErr != nil {
diff --git a/src/control/security/config.go b/src/control/security/config.go
index 7358e2e73ee..485e7ffdf87 100644
--- a/src/control/security/config.go
+++ b/src/control/security/config.go
@@ -28,6 +28,8 @@ const (
defaultAdminKey = certDir + "admin.key"
defaultAgentCert = certDir + "agent.crt"
defaultAgentKey = certDir + "agent.key"
+ defaultTelemetryCert = certDir + "telemetry.crt"
+ defaultTelemetryKey = certDir + "telemetry.key"
defaultClientCertDir = certDir + "clients"
defaultServer = "server"
defaultInsecure = false
@@ -105,6 +107,29 @@ type TransportConfig struct {
CertificateConfig `yaml:",inline"`
}
+// TelemetryConfig holds the telemetry endpoint settings: the port, whether the
+// endpoint may be served insecurely, and the certificate/key to use otherwise.
+type TelemetryConfig struct {
+	Port           int           `yaml:"telemetry_port,omitempty"`    // telemetry endpoint port
+	AllowInsecure  bool          `yaml:"allow_insecure,omitempty"`    // true selects http, false selects https
+	Enabled        bool          `yaml:"telemetry_enabled,omitempty"` // enable client telemetry (agent config)
+	Retain         time.Duration `yaml:"telemetry_retain,omitempty"`  // how long client telemetry is kept after the client exits
+	HttpsCert      string        `yaml:"https_cert,omitempty"`        // certificate used for the TLS handshake
+	HttpsKey       string        `yaml:"https_key,omitempty"`         // key portion of the certificate
+	HttpsException bool          `yaml:"https_exception,omitempty"`   // NOTE(review): consumer not visible here; confirm semantics
+}
+
+// DefaultClientTelemetryConfig returns a telemetry config with telemetry
+// disabled, insecure mode allowed, and the default certificate and key paths.
+func DefaultClientTelemetryConfig() *TelemetryConfig {
+	cfg := TelemetryConfig{
+		HttpsCert: defaultTelemetryCert,
+		HttpsKey:  defaultTelemetryKey,
+	}
+	cfg.AllowInsecure = true
+	cfg.Enabled = false
+	return &cfg
+}
+
func (tc *TransportConfig) String() string {
return fmt.Sprintf("allow insecure: %v", tc.AllowInsecure)
}
diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go
index 510e2d26d1b..63eb9814426 100644
--- a/src/control/server/config/server.go
+++ b/src/control/server/config/server.go
@@ -65,6 +65,7 @@ type Server struct {
FWHelperLogFile string `yaml:"firmware_helper_log_file,omitempty"`
FaultPath string `yaml:"fault_path,omitempty"`
TelemetryPort int `yaml:"telemetry_port,omitempty"`
+ TelemetryConfig *security.TelemetryConfig `yaml:"telemetry_config"`
CoreDumpFilter uint8 `yaml:"core_dump_filter,omitempty"`
ClientEnvVars []string `yaml:"client_env_vars,omitempty"`
SupportConfig SupportConfig `yaml:"support_config,omitempty"`
@@ -319,7 +320,13 @@ func (cfg *Server) WithFirmwareHelperLogFile(filePath string) *Server {
// WithTelemetryPort sets the port for the telemetry exporter.
func (cfg *Server) WithTelemetryPort(port int) *Server {
-	cfg.TelemetryPort = port
+	// TelemetryConfig is a pointer and is nil for configs that were not built
+	// by DefaultServer(); fall back to the default instead of panicking.
+	if cfg.TelemetryConfig == nil {
+		cfg.TelemetryConfig = security.DefaultClientTelemetryConfig()
+	}
+	cfg.TelemetryConfig.Port = port
+	return cfg
+}
+
+// WithTelemetryConfig sets the telemetry configuration, replacing any
+// previously configured one (including a port set via WithTelemetryPort).
+func (cfg *Server) WithTelemetryConfig(cfgTelemetry *security.TelemetryConfig) *Server {
+	cfg.TelemetryConfig = cfgTelemetry
return cfg
}
@@ -332,6 +339,7 @@ func DefaultServer() *Server {
MgmtSvcReplicas: []string{fmt.Sprintf("localhost:%d", build.DefaultControlPort)},
ControlPort: build.DefaultControlPort,
TransportConfig: security.DefaultServerTransportConfig(),
+ TelemetryConfig: security.DefaultClientTelemetryConfig(),
Hyperthreads: false,
SystemRamReserved: storage.DefaultSysMemRsvd / humanize.GiByte,
Path: defaultConfigPath,
@@ -711,10 +719,17 @@ func (cfg *Server) Validate(log logging.Logger) (err error) {
return FaultConfigNoProvider
case cfg.ControlPort <= 0:
return FaultConfigBadControlPort
+ // Still validate the legacy top-level telemetry_port option.
case cfg.TelemetryPort < 0:
return FaultConfigBadTelemetryPort
}
+ if cfg.TelemetryConfig != nil {
+ if cfg.TelemetryConfig.Port < 0 {
+ return FaultConfigBadTelemetryPort
+ }
+ }
+
for idx, ec := range cfg.Engines {
ec.Storage.ControlMetadata = cfg.Metadata
ec.Storage.EngineIdx = uint(idx)
diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go
index 475d99354cb..b3b03dffcb1 100644
--- a/src/control/server/config/server_test.go
+++ b/src/control/server/config/server_test.go
@@ -104,11 +104,11 @@ func uncommentServerConfig(t *testing.T, outFile string) {
}
key := fields[0]
- // If we're in a server or a storage tier config, reset the
+ // If we're in a server, a storage tier, or a telemetry config, reset the
// seen map to allow the same params in different
// server configs.
lineTmp := strings.TrimLeft(line, " ")
- if lineTmp == "-" {
+ if lineTmp == "-" || lineTmp == "telemetry_config:" {
seenKeys = make(map[string]struct{})
}
if _, seen := seenKeys[key]; seen && strings.HasSuffix(key, ":") {
@@ -249,7 +249,11 @@ func TestServerConfig_Constructed(t *testing.T) {
WithControlLogFile("/tmp/daos_server.log").
WithHelperLogFile("/tmp/daos_server_helper.log").
WithFirmwareHelperLogFile("/tmp/daos_firmware_helper.log").
- WithTelemetryPort(9191).
+ WithTelemetryConfig(&security.TelemetryConfig{
+ AllowInsecure: true,
+ Port: 9191,
+ HttpsCert: "/etc/daos/certs/telemetry.crt",
+ HttpsKey: "/etc/daos/certs/telemetry.key"}).
WithSystemName("daos_server").
WithSocketDir("./.daos/daos_server").
WithFabricProvider("ofi+verbs;ofi_rxm").
@@ -420,7 +424,11 @@ func TestServerConfig_MDonSSD_Constructed(t *testing.T) {
Path: "/var/daos/config",
}).
WithControlLogFile("/tmp/daos_server.log").
- WithTelemetryPort(9191).
+ WithTelemetryConfig(&security.TelemetryConfig{
+ AllowInsecure: true,
+ Port: 9191,
+ HttpsCert: "/etc/daos/certs/telemetry.crt",
+ HttpsKey: "/etc/daos/certs/telemetry.key"}).
WithFabricProvider("ofi+tcp").
WithMgmtSvcReplicas("example1", "example2", "example3")
diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go
index 1e8c9e0c14d..288e74ec139 100644
--- a/src/control/server/server_utils.go
+++ b/src/control/server/server_utils.go
@@ -655,14 +655,15 @@ func configureFirstEngine(ctx context.Context, engine *EngineInstance, sysdb *ra
// registerTelemetryCallbacks sets telemetry related callbacks to
// be triggered when all engines have been started.
func registerTelemetryCallbacks(ctx context.Context, srv *server) {
- telemPort := srv.cfg.TelemetryPort
+ telemPort := srv.cfg.TelemetryConfig.Port
+
if telemPort == 0 {
return
}
srv.OnEnginesStarted(func(ctxIn context.Context) error {
srv.log.Debug("starting Prometheus exporter")
- cleanup, err := startPrometheusExporter(ctxIn, srv.log, telemPort, srv.harness.Instances())
+ cleanup, err := startPrometheusExporter(ctxIn, srv)
if err != nil {
return err
}
diff --git a/src/control/server/telemetry.go b/src/control/server/telemetry.go
index 4b2f624aff2..28848f6879e 100644
--- a/src/control/server/telemetry.go
+++ b/src/control/server/telemetry.go
@@ -68,14 +68,17 @@ func regPromEngineSources(ctx context.Context, log logging.Logger, engines []Eng
return nil
}
+// startPrometheusExporter starts the Prometheus exporter for the server's
+// engine instances using the server's telemetry configuration, and returns
+// whatever promexp.StartExporter returns.
-func startPrometheusExporter(ctx context.Context, log logging.Logger, port int, engines []Engine) (func(), error) {
+func startPrometheusExporter(ctx context.Context, srv *server) (func(), error) {
+	telemCfg := srv.cfg.TelemetryConfig
 	expCfg := &promexp.ExporterConfig{
-		Port:  port,
-		Title: "DAOS Engine Telemetry",
+		Port:          telemCfg.Port,
+		Title:         "DAOS Engine Telemetry",
+		AllowInsecure: telemCfg.AllowInsecure,
+		HttpsCert:     telemCfg.HttpsCert,
+		HttpsKey:      telemCfg.HttpsKey,
 		Register: func(ctx context.Context, log logging.Logger) error {
-			return regPromEngineSources(ctx, log, engines)
+			return regPromEngineSources(ctx, srv.log, srv.harness.Instances())
 		},
 	}
-	return promexp.StartExporter(ctx, log, expCfg)
+	return promexp.StartExporter(ctx, srv.log, expCfg)
 }
diff --git a/src/tests/ftest/config_file_gen.py b/src/tests/ftest/config_file_gen.py
index 66172cd2201..d5b4f72ee2e 100755
--- a/src/tests/ftest/config_file_gen.py
+++ b/src/tests/ftest/config_file_gen.py
@@ -12,11 +12,13 @@
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
-from util.agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters
+from util.agent_utils_params import (DaosAgentTelemetryConfig, DaosAgentTransportCredentials,
+ DaosAgentYamlParameters)
from util.command_utils_base import CommonConfig
-from util.dmg_utils_params import DmgTransportCredentials, DmgYamlParameters
+from util.dmg_utils_params import DmgTelemetryConfig, DmgTransportCredentials, DmgYamlParameters
from util.exception_utils import CommandFailure
-from util.server_utils_params import DaosServerTransportCredentials, DaosServerYamlParameters
+from util.server_utils_params import (DaosServerTelemetryConfig, DaosServerTransportCredentials,
+ DaosServerYamlParameters)
def generate_agent_config(args):
@@ -31,6 +33,7 @@ def generate_agent_config(args):
"""
common_cfg = CommonConfig(args.group_name, DaosAgentTransportCredentials())
config = DaosAgentYamlParameters(args.agent_file, common_cfg)
+ config.telemetry_config = DaosAgentTelemetryConfig()
# Update the configuration file access points
config.access_points.value = args.node_list.split(",")
return create_config(args, config)
@@ -48,6 +51,7 @@ def generate_server_config(args):
"""
common_cfg = CommonConfig(args.group_name, DaosServerTransportCredentials())
config = DaosServerYamlParameters(args.server_file, common_cfg)
+ config.telemetry_config = DaosServerTelemetryConfig()
config.engine_params[0].storage.storage_tiers[0].storage_class.value = "ram"
config.engine_params[0].storage.storage_tiers[0].scm_mount.value = "/mnt/daos"
config.engine_params[0].storage.storage_tiers[0].scm_size.value = 0
@@ -68,6 +72,7 @@ def generate_dmg_config(args):
"""
config = DmgYamlParameters(
args.dmg_file, args.group_name, DmgTransportCredentials())
+ config.telemetry_config = DmgTelemetryConfig()
# Update the configuration file hostlist
config.hostlist.value = args.node_list.split(",")
return create_config(args, config)
diff --git a/src/tests/ftest/control/dmg_telemetry_basic.py b/src/tests/ftest/control/dmg_telemetry_basic.py
index 39eb520aef2..45cb8472392 100644
--- a/src/tests/ftest/control/dmg_telemetry_basic.py
+++ b/src/tests/ftest/control/dmg_telemetry_basic.py
@@ -1,5 +1,5 @@
"""
-(C) Copyright 2021-2023 Intel Corporation.
+(C) Copyright 2021-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -91,7 +91,7 @@ def test_container_telemetry(self):
:avocado: tags=all,pr,daily_regression
:avocado: tags=vm
- :avocado: tags=control,telemetry,container
+ :avocado: tags=control,telemetry,container,secure_telemetry
:avocado: tags=TestWithTelemetryBasic,test_container_telemetry
"""
container_qty = self.params.get("container_qty", "/run/test/*", 1)
diff --git a/src/tests/ftest/control/dmg_telemetry_basic.yaml b/src/tests/ftest/control/dmg_telemetry_basic.yaml
index 69af131fc6f..f9c41e9eafb 100644
--- a/src/tests/ftest/control/dmg_telemetry_basic.yaml
+++ b/src/tests/ftest/control/dmg_telemetry_basic.yaml
@@ -16,6 +16,8 @@ server_config:
class: ram
scm_mount: /mnt/daos
system_ram_reserved: 1
+ telemetry_config:
+ allow_insecure: false
pool:
scm_size: 2G
container:
@@ -23,3 +25,6 @@ container:
test:
container_qty: 5
open_close_qty: 3
+dmg:
+ telemetry_config:
+ allow_insecure: false
diff --git a/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh b/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh
new file mode 100755
index 00000000000..6d359fbecaa
--- /dev/null
+++ b/src/tests/ftest/scripts/gen_telemetry_server_certificate.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# /*
+# * (C) Copyright 2024 Intel Corporation.
+# *
+# * SPDX-License-Identifier: BSD-2-Clause-Patent
+# */
+
+# Generate a self-signed certificate/key pair (telemetry.crt / telemetry.key)
+# in DIR and hand ownership of both files to USER.
+
+__usage="
+
+This is just an example script for testing purpose.
+Please modify to use in Production environment.
+
+Usage: gen_telemetry_server_certificate.sh USER [DIR]
+    USER: DAOS has server and client and the certificate need the specific file permission
+          based on system usage.
+          Use daos_server if running script on server
+          Use daos_agent if running script on client
+
+    DIR: Generate telemetry certificates for DAOS metrics in the [DIR].
+         By default [DIR] is the current directory.
+"
+
+# USER is required: without it the chown calls below cannot succeed.
+if [ -z "${1:-}" ]; then
+    echo "${__usage}" >&2
+    exit 1
+fi
+
+# Stop at the first failing step instead of continuing with a partial result.
+set -e
+
+DAYS=1095
+
+# Script-specific names keep bash's own USER/HOSTNAME variables intact.
+CERT_USER="$1"
+# Drop one trailing slash so the joined paths below do not contain "//".
+CERT_DIR="${2:-.}"
+CERT_DIR="${CERT_DIR%/}"
+CERT_HOST=$(hostname -s)
+
+openssl req -x509 -newkey rsa:4096 -keyout "${CERT_DIR}/telemetry.key" \
+    -out "${CERT_DIR}/telemetry.crt" -sha256 -days "${DAYS}" -nodes \
+    -subj "/CN=${CERT_HOST}"
+chmod 0400 "${CERT_DIR}/telemetry.key"
+chmod 0644 "${CERT_DIR}/telemetry.crt"
+# Use the "owner:group" separator; the "owner.group" form is deprecated.
+chown "${CERT_USER}:${CERT_USER}" "${CERT_DIR}/telemetry.key"
+chown "${CERT_USER}:${CERT_USER}" "${CERT_DIR}/telemetry.crt"
diff --git a/src/tests/ftest/server/storage_tiers.py b/src/tests/ftest/server/storage_tiers.py
index e627b8d62c2..536c1c52baf 100644
--- a/src/tests/ftest/server/storage_tiers.py
+++ b/src/tests/ftest/server/storage_tiers.py
@@ -1,5 +1,5 @@
"""
- (C) Copyright 2020-2023 Intel Corporation.
+ (C) Copyright 2020-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
@@ -8,7 +8,8 @@
import yaml
from apricot import TestWithServers
from command_utils_base import CommonConfig
-from server_utils import DaosServerTransportCredentials, DaosServerYamlParameters
+from server_utils import (DaosServerTelemetryConfig, DaosServerTransportCredentials,
+ DaosServerYamlParameters)
class StorageTiers(TestWithServers):
@@ -67,6 +68,7 @@ def test_tiers(self):
common_config = CommonConfig("daos_server", DaosServerTransportCredentials())
config = DaosServerYamlParameters(None, common_config)
+ config.telemetry_config = DaosServerTelemetryConfig()
config.namespace = self.server_config_namespace
config.get_params(self)
data = config.get_yaml_data()
diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.py b/src/tests/ftest/telemetry/basic_client_telemetry.py
index 71b976abe30..5d0236dbb7a 100644
--- a/src/tests/ftest/telemetry/basic_client_telemetry.py
+++ b/src/tests/ftest/telemetry/basic_client_telemetry.py
@@ -26,7 +26,7 @@ def test_client_metrics_exist(self):
:avocado: tags=all,daily_regression
:avocado: tags=vm
- :avocado: tags=telemetry
+ :avocado: tags=telemetry,secure_telemetry
:avocado: tags=BasicClientTelemetry,test_client_metrics_exist
"""
# create pool and container
diff --git a/src/tests/ftest/telemetry/basic_client_telemetry.yaml b/src/tests/ftest/telemetry/basic_client_telemetry.yaml
index d585dc81fda..71c6c361cd9 100644
--- a/src/tests/ftest/telemetry/basic_client_telemetry.yaml
+++ b/src/tests/ftest/telemetry/basic_client_telemetry.yaml
@@ -18,9 +18,11 @@ server_config:
system_ram_reserved: 1
agent_config:
- telemetry_port: 9191
- telemetry_retain: 30s
- telemetry_enabled: true
+ telemetry_config:
+ allow_insecure: false
+ telemetry_port: 9191
+ telemetry_retain: 30s
+ telemetry_enabled: true
pool:
scm_size: 2G
@@ -44,3 +46,6 @@ ior_write:
ior_read:
<<: *ior_base
flags: "-v -r -R -G 1"
+dmg:
+ telemetry_config:
+ allow_insecure: false
diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py
index 74b79fb9796..f437bc36e04 100644
--- a/src/tests/ftest/util/agent_utils.py
+++ b/src/tests/ftest/util/agent_utils.py
@@ -7,7 +7,8 @@
import re
import socket
-from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters
+from agent_utils_params import (DaosAgentTelemetryConfig, DaosAgentTransportCredentials,
+ DaosAgentYamlParameters)
from ClusterShell.NodeSet import NodeSet
from command_utils import CommandWithSubCommand, SubprocessManager, YamlCommand
from command_utils_base import (CommandWithParameters, CommonConfig, EnvironmentVariables,
@@ -53,6 +54,7 @@ def get_agent_command(group, cert_dir, bin_dir, config_file, run_user, config_te
transport_config = DaosAgentTransportCredentials(cert_dir)
common_config = CommonConfig(group, transport_config)
config = DaosAgentYamlParameters(config_file, common_config)
+ config.telemetry_config = DaosAgentTelemetryConfig(cert_dir)
command = DaosAgentCommand(bin_dir, config, run_user=run_user)
if config_temp:
# Setup the DaosAgentCommand to write the config file data to the
@@ -285,6 +287,7 @@ def start(self):
# Copy certificates
self.manager.job.copy_certificates(
get_log_file("daosCA/certs"), self._hosts)
+ self.manager.job.generate_telemetry_server_certificates(self._hosts, "daos_agent")
# Verify the socket directory exists when using a non-systemctl manager
if self.verify_socket_dir:
diff --git a/src/tests/ftest/util/agent_utils_params.py b/src/tests/ftest/util/agent_utils_params.py
index bd091a162e6..889c8debf48 100644
--- a/src/tests/ftest/util/agent_utils_params.py
+++ b/src/tests/ftest/util/agent_utils_params.py
@@ -5,7 +5,8 @@
"""
import os
-from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters
+from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig,
+ TransportCredentials, YamlParameters)
class DaosAgentTransportCredentials(TransportCredentials):
@@ -32,6 +33,29 @@ def _get_new(self):
return DaosAgentTransportCredentials(self._log_dir)
+class DaosAgentTelemetryConfig(TelemetryConfig):
+    # pylint: disable=too-few-public-methods
+    """daos_agent telemetry settings, including the telemetry certificate and key."""
+
+    def __init__(self, log_dir=os.path.join(os.sep, "tmp")):
+        """Initialize a DaosAgentTelemetryConfig object.
+
+        Args:
+            log_dir (str, optional): directory handed to the https_cert and https_key
+                LogParameters. Defaults to os.path.join(os.sep, "tmp").
+        """
+        super().__init__("/run/agent_config/telemetry_config/*", None, log_dir)
+
+        # The agent default port (9192) differs from the base class default (9191)
+        self.telemetry_port = BasicParameter(None, 9192)
+        self.telemetry_enabled = BasicParameter(None)
+        self.telemetry_retain = BasicParameter(None)
+        self.https_cert = LogParameter(self._log_dir, None, "telemetry.crt")
+        self.https_key = LogParameter(self._log_dir, None, "telemetry.key")
+
+    def _get_new(self):
+        """Get a new object based upon this one.
+
+        Returns:
+            DaosAgentTelemetryConfig: a new DaosAgentTelemetryConfig object
+        """
+        return DaosAgentTelemetryConfig(self._log_dir)
+
+
class DaosAgentYamlParameters(YamlParameters):
"""Defines the daos_agent configuration yaml parameters."""
diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py
index dbed7ac3c44..ac9f94449d4 100644
--- a/src/tests/ftest/util/command_utils.py
+++ b/src/tests/ftest/util/command_utils.py
@@ -1044,6 +1044,28 @@ def copy_certificates(self, source, hosts):
self._command, ", ".join(names))
get_file_listing(hosts, names, self.run_user).log_output(self.log)
+    def generate_telemetry_server_certificates(self, hosts, user):
+        """Generate the telemetry certificates for the test on server/client.
+
+        Nothing is generated when no telemetry config is set, when telemetry runs in
+        insecure mode, or when the telemetry config defines no certificate paths.
+
+        Args:
+            hosts (NodeSet): list of the destination hosts.
+            user (str): owner to set on the telemetry certificate files.
+                For server, it's daos_server and for client it's daos_agent.
+        """
+        telemetry_config = self.yaml.telemetry_config
+        if telemetry_config is None or telemetry_config.allow_insecure.value:
+            return
+
+        data = telemetry_config.get_certificate_data(
+            telemetry_config.get_attribute_names(LogParameter))
+        if not data:
+            # No certificate directory to generate into; indexing the empty key list
+            # would raise an IndexError.
+            return
+        destination = next(iter(data))
+
+        certgen_dir = os.path.abspath(os.path.join(os.getcwd(), "scripts"))
+        command = os.path.join(certgen_dir, "gen_telemetry_server_certificate.sh ")
+        command = "sudo " + command + user + " " + destination
+        self.log.debug("Generating the telemetry certificate command %s:", command)
+        result = run_remote(self.log, hosts, command, 30)
+        if not result.passed:
+            # Best effort: a failure is only reported, it does not abort the caller
+            self.log.info(" WARNING: command %s failed", command)
+
def copy_configuration(self, hosts):
"""Copy the yaml configuration file to the hosts.
diff --git a/src/tests/ftest/util/command_utils_base.py b/src/tests/ftest/util/command_utils_base.py
index d867fbba4c8..ff2c1499b7e 100644
--- a/src/tests/ftest/util/command_utils_base.py
+++ b/src/tests/ftest/util/command_utils_base.py
@@ -574,6 +574,7 @@ def __init__(self, namespace, filename=None, title=None, other_params=None):
self.filename = filename
self.title = title
self.other_params = other_params
+ self.telemetry_config = None
def get_params(self, test):
"""Get values for the yaml parameters from the test yaml file.
@@ -588,6 +589,9 @@ def get_params(self, test):
if self.other_params is not None:
self.other_params.get_params(test)
+ if self.telemetry_config is not None:
+ self.telemetry_config.get_params(test)
+
def get_yaml_data(self):
"""Convert the parameters into a dictionary to use to write a yaml file.
@@ -599,6 +603,12 @@ def get_yaml_data(self):
yaml_data = self.other_params.get_yaml_data()
else:
yaml_data = {}
+
+ if self.telemetry_config is not None:
+ telemetry_yaml = self.telemetry_config.get_yaml_data()
+ if telemetry_yaml:
+ yaml_data["telemetry_config"] = telemetry_yaml
+
for name in self.get_param_names():
value = getattr(self, name).value
if value is not None:
@@ -789,6 +799,76 @@ def _get_new(self):
return TransportCredentials(self.namespace, self.title, self._log_dir)
+class TelemetryConfig(YamlParameters):
+    """Telemetry yaml settings: insecure-mode flag, port, and certificate lookup."""
+
+    def __init__(self, namespace, title, log_dir):
+        """Initialize a TelemetryConfig object.
+
+        Args:
+            namespace (str): yaml namespace (path to parameters)
+            title (str): namespace under which to place the
+                parameters when creating the yaml file.
+            log_dir (str): location of the certificate files
+        """
+        super().__init__(namespace, None, title)
+        self._log_dir = log_dir
+        # Insecure mode is the default unless DAOS_TEST_INSECURE_MODE is set to
+        # something other than "true" (compared case-insensitively).
+        default_insecure = str(os.environ.get("DAOS_TEST_INSECURE_MODE", True))
+        default_insecure = default_insecure.lower() == "true"
+        self.allow_insecure = BasicParameter(None, default_insecure)
+        self.telemetry_port = BasicParameter(None, 9191)
+        # Plain placeholders here; DaosAgentTelemetryConfig replaces them with
+        # BasicParameter objects.
+        self.telemetry_retain = None
+        self.telemetry_enabled = None
+
+    def get_yaml_data(self):
+        """Convert the parameters into a dictionary to use to write a yaml file.
+
+        Returns:
+            dict: a dictionary of parameter name keys and values
+
+        """
+        yaml_data = super().get_yaml_data()
+
+        # Explicitly store the allow_insecure value, under the title key when a
+        # title is set and at the top level otherwise
+        if self.title is not None:
+            yaml_data[self.title]["allow_insecure"] = self.allow_insecure.value
+        else:
+            yaml_data["allow_insecure"] = self.allow_insecure.value
+
+        return yaml_data
+
+    def get_certificate_data(self, name_list):
+        """Get certificate data by name_list.
+
+        Args:
+            name_list (list): list of certificate attribute names.
+
+        Returns:
+            data (dict): a dictionary mapping each certificate directory to the list
+                of file names in it. Empty when allow_insecure is set.
+
+        """
+        data = {}
+        if not self.allow_insecure.value:
+            for name in name_list:
+                value = getattr(self, name).value
+                # Only string values are treated as certificate paths
+                if isinstance(value, str):
+                    dir_name, file_name = os.path.split(value)
+                    if dir_name not in data:
+                        data[dir_name] = [file_name]
+                    else:
+                        data[dir_name].append(file_name)
+        return data
+
+    def _get_new(self):
+        """Get a new object based upon this one.
+
+        Returns:
+            TelemetryConfig: a new TelemetryConfig object
+        """
+        return TelemetryConfig(self.namespace, self.title, self._log_dir)
+
+
class CommonConfig(YamlParameters):
"""Defines common daos_agent and daos_server configuration file parameters.
diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py
index cbca403895a..aad42e662a8 100644
--- a/src/tests/ftest/util/dmg_utils.py
+++ b/src/tests/ftest/util/dmg_utils.py
@@ -10,7 +10,7 @@
from pwd import getpwuid
from dmg_utils_base import DmgCommandBase
-from dmg_utils_params import DmgTransportCredentials, DmgYamlParameters
+from dmg_utils_params import DmgTelemetryConfig, DmgTransportCredentials, DmgYamlParameters
from exception_utils import CommandFailure
from general_utils import dict_to_str, get_numeric_list
@@ -39,7 +39,8 @@ def get_dmg_command(group, cert_dir, bin_dir, config_file, config_temp=None, hos
"""
transport_config = DmgTransportCredentials(cert_dir)
- config = DmgYamlParameters(config_file, group, transport_config)
+ telemetry_config = DmgTelemetryConfig(cert_dir)
+ config = DmgYamlParameters(config_file, group, transport_config, telemetry_config)
command = DmgCommand(bin_dir, config, hostlist_suffix)
if config_temp:
# Setup the DaosServerCommand to write the config file data to the
diff --git a/src/tests/ftest/util/dmg_utils_params.py b/src/tests/ftest/util/dmg_utils_params.py
index ff9048a3b62..7faa37a0f40 100644
--- a/src/tests/ftest/util/dmg_utils_params.py
+++ b/src/tests/ftest/util/dmg_utils_params.py
@@ -1,10 +1,11 @@
"""
- (C) Copyright 2020-2023 Intel Corporation.
+ (C) Copyright 2020-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
-from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters
+from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig,
+ TransportCredentials, YamlParameters)
class DmgTransportCredentials(TransportCredentials):
@@ -25,10 +26,27 @@ def _get_new(self):
return DmgTransportCredentials(self._log_dir)
+class DmgTelemetryConfig(TelemetryConfig):
+    """dmg telemetry settings read from the /run/dmg/telemetry_config/* namespace."""
+
+    def __init__(self, log_dir="/tmp"):
+        """Initialize a DmgTelemetryConfig object.
+
+        Args:
+            log_dir (str, optional): directory passed to the TelemetryConfig base class
+                as the certificate location. Defaults to "/tmp".
+        """
+        super().__init__("/run/dmg/telemetry_config/*", None, log_dir)
+        # NOTE(review): enabled by default; presumably maps to the https_exception
+        # key of the dmg telemetry config - confirm against the dmg implementation.
+        self.https_exception = BasicParameter(None, True)
+
+    def _get_new(self):
+        """Get a new object based upon this one.
+
+        Returns:
+            DmgTelemetryConfig: a new DmgTelemetryConfig object
+        """
+        return DmgTelemetryConfig(self._log_dir)
+
+
class DmgYamlParameters(YamlParameters):
"""Defines the dmg configuration yaml parameters."""
- def __init__(self, filename, name, transport):
+ def __init__(self, filename, name, transport, telemetry=None):
"""Initialize a DmgYamlParameters object.
Args:
@@ -36,6 +54,8 @@ def __init__(self, filename, name, transport):
name (str): The DAOS system name.
transport (DmgTransportCredentials): dmg security
configuration settings.
+ telemetry (DmgTelemetryConfig): dmg telemetry
+ configuration settings.
"""
super().__init__("/run/dmg/*", filename, None, transport)
@@ -57,6 +77,9 @@ def __init__(self, filename, name, transport):
self.hostlist = BasicParameter(None, "localhost")
self.port = BasicParameter(None, 10001)
+ if telemetry is not None:
+ self.telemetry_config = telemetry
+
def _get_new(self):
"""Get a new object based upon this one.
diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py
index 0f7284c50ef..2d33cceac05 100644
--- a/src/tests/ftest/util/launch_utils.py
+++ b/src/tests/ftest/util/launch_utils.py
@@ -867,6 +867,7 @@ def _generate_certs(self, logger):
message = "Error generating certificates"
self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
return 128
+
return 0
def _collect_crash_files(self, logger):
diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py
index ec79f029c6e..cd284a39a2a 100644
--- a/src/tests/ftest/util/server_utils.py
+++ b/src/tests/ftest/util/server_utils.py
@@ -21,7 +21,8 @@
from host_utils import get_local_host
from run_utils import run_remote, stop_processes
from server_utils_base import DaosServerCommand, DaosServerInformation, ServerFailed
-from server_utils_params import DaosServerTransportCredentials, DaosServerYamlParameters
+from server_utils_params import (DaosServerTelemetryConfig, DaosServerTransportCredentials,
+ DaosServerYamlParameters)
from user_utils import get_chown_command
@@ -45,7 +46,10 @@ def get_server_command(group, cert_dir, bin_dir, config_file, config_temp=None):
transport_config = DaosServerTransportCredentials(cert_dir)
common_config = CommonConfig(group, transport_config)
config = DaosServerYamlParameters(config_file, common_config)
+ config.telemetry_config = DaosServerTelemetryConfig(cert_dir)
+
command = DaosServerCommand(bin_dir, config, None)
+
if config_temp:
# Setup the DaosServerCommand to write the config file data to the
# temporary file and then copy the file to all the hosts using the
@@ -241,6 +245,7 @@ def prepare(self, storage=True):
# Copy certificates
self.manager.job.copy_certificates(get_log_file("daosCA/certs"), self._hosts)
+ self.manager.job.generate_telemetry_server_certificates(self._hosts, "daos_server")
self._prepare_dmg_certificates()
# Prepare dmg for running storage format on all server hosts
diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index 19dd8ea4df3..050a6cdc17a 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -5,7 +5,8 @@
"""
import os
-from command_utils_base import BasicParameter, LogParameter, TransportCredentials, YamlParameters
+from command_utils_base import (BasicParameter, LogParameter, TelemetryConfig,
+ TransportCredentials, YamlParameters)
MAX_STORAGE_TIERS = 5
@@ -56,6 +57,32 @@ def _get_new(self):
return DaosServerTransportCredentials(self._log_dir)
+class DaosServerTelemetryConfig(TelemetryConfig):
+ # pylint: disable=too-few-public-methods
+ """Telemetry configuration for daos_server: endpoint port plus HTTPS certificate and key."""
+
+ def __init__(self, log_dir=os.path.join(os.sep, "tmp")):
+ """Initialize a DaosServerTelemetryConfig object; log_dir is forwarded to TelemetryConfig."""
+ super().__init__("/run/server_config/telemetry_config/*", None, log_dir)
+
+ # Additional daos_server telemetry credential parameters:
+ # - telemetry_port: Telemetry endpoint port number
+ # - https_cert: Server certificate (telemetry.crt)
+ # - https_key: Key portion of the server certificate (telemetry.key)
+ #
+ self.telemetry_port = BasicParameter(None, 9191)
+ self.https_cert = LogParameter(self._log_dir, None, "telemetry.crt")
+ self.https_key = LogParameter(self._log_dir, None, "telemetry.key")
+
+ def _get_new(self):
+ """Create another object of this class built from this object's _log_dir.
+
+ Returns:
+ DaosServerTelemetryConfig: a new DaosServerTelemetryConfig object
+ """
+ return DaosServerTelemetryConfig(self._log_dir)
+
+
class DaosServerYamlParameters(YamlParameters):
"""Defines the daos_server configuration yaml parameters."""
@@ -135,7 +162,6 @@ def __init__(self, filename, common_yaml):
self.control_log_mask = BasicParameter(None, "DEBUG")
self.control_log_file = LogParameter(log_dir, None, "daos_control.log")
self.helper_log_file = LogParameter(log_dir, None, "daos_server_helper.log")
- self.telemetry_port = BasicParameter(None, 9191)
self.client_env_vars = BasicParameter(None)
self.mgmt_svc_replicas = BasicParameter(None, ["localhost"])
@@ -147,6 +173,9 @@ def __init__(self, filename, common_yaml):
# Control plane metadata parameters.
self.metadata_params = ControlMetadataParameters(self.namespace)
+ # Telemetry Parameters
+ self.telemetry_config = BasicParameter(None)
+
# Defines the number of single engine config parameters to define in
# the yaml file
self.engines_per_host = BasicParameter(None, 0)
diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py
index cc0610af872..d13a359ee8b 100644
--- a/src/tests/ftest/util/telemetry_utils.py
+++ b/src/tests/ftest/util/telemetry_utils.py
@@ -369,7 +369,9 @@ class TelemetryUtils():
*_gen_stats_metrics("engine_net_swim_delay"),
"engine_net_uri_lookup_timeout",
"engine_net_uri_lookup_other",
- "engine_net_uri_lookup_self"]
+ "engine_net_uri_lookup_self",
+ 'engine_net_quota_exceeded',
+ 'engine_net_waitq_depth']
ENGINE_RANK_METRICS = [
"engine_rank"]
ENGINE_NVME_HEALTH_METRICS = [
diff --git a/utils/config/daos_agent.yml b/utils/config/daos_agent.yml
index bced9a0447b..f260a910f7d 100644
--- a/utils/config/daos_agent.yml
+++ b/utils/config/daos_agent.yml
@@ -26,26 +26,35 @@
# default: 10001
#port: 10001
-## Enable HTTP endpoint for remote telemetry collection.
-# Note that enabling the endpoint automatically enables
-# client telemetry collection.
+## Enable Telemetry HTTP/HTTPS endpoint for remote client telemetry collection.
#
-## default endpoint state: disabled
-## default endpoint port: 9192
-#telemetry_port: 9192
-
-## Enable client telemetry for all DAOS clients.
-# If false, clients will need to optionally enable telemetry by setting
-# the D_CLIENT_METRICS_ENABLE environment variable to true.
+#telemetry_config:
+# # Set the client telemetry endpoint port number
+# # default: 9192
+# telemetry_port: 9192
#
-## default: false
-#telemetry_enabled: true
-
-## Retain client telemetry for a period of time after the client
-# process exits.
+# # Enable client telemetry for all DAOS clients.
+# # If false, clients will need to optionally enable telemetry by setting
+# # the D_CLIENT_METRICS_ENABLE environment variable to true.
+# # default: false
+# telemetry_enabled: true
+#
+# # Retain client telemetry for a period of time after the client
+# # process exits.
+# # default: 0 (do not retain telemetry after client exit)
+# telemetry_retain: 1m
+#
+# # In order to disable transport security, uncomment and set allow_insecure
+# # to true. Not recommended for production configurations.
+# allow_insecure: false
+#
+# # Server certificate for use in TLS handshakes
+# # The DAOS client acts as the HTTPS server that opens the secure telemetry endpoint.
+# https_cert: /etc/daos/certs/telemetry.crt
#
-## default 0 (do not retain telemetry after client exit)
-#telemetry_retain: 1m
+# # Key portion of Server Certificate
+# # The DAOS client acts as the HTTPS server that opens the secure telemetry endpoint.
+# https_key: /etc/daos/certs/telemetry.key
## Configuration for user credential management.
#credential_config:
diff --git a/utils/config/daos_control.yml b/utils/config/daos_control.yml
index 5a236052fd7..cd0566d41c2 100644
--- a/utils/config/daos_control.yml
+++ b/utils/config/daos_control.yml
@@ -38,3 +38,13 @@
# cert: /etc/daos/certs/admin.crt
# # Key portion of Admin Certificate
# key: /etc/daos/certs/admin.key
+
+## Configuration for telemetry collection commands.
+#
+#telemetry_config:
+# # In order to enable transport security, uncomment and set allow_insecure
+# # to false.
+# allow_insecure: true
+#
+# # Skip server certificate verification. Recommended for testing purposes only.
+# https_exception: true
diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml
index a9642631f2d..24b2ff3ccd3 100644
--- a/utils/config/daos_server.yml
+++ b/utils/config/daos_server.yml
@@ -260,11 +260,21 @@
#firmware_helper_log_file: /tmp/daos_firmware_helper.log
#
#
-## Enable HTTP endpoint for remote telemetry collection.
+## Enable Telemetry HTTP/HTTPS endpoint for remote telemetry collection.
#
-## default endpoint state: disabled
-## default endpoint port: 9191
-#telemetry_port: 9191
+#telemetry_config:
+# # In order to enable telemetry security, uncomment and set allow_insecure to false
+# allow_insecure: true
+#
+# # Set the server telemetry endpoint port number
+# # default: 9191
+# telemetry_port: 9191
+#
+# # Server certificate for use in TLS handshakes
+# https_cert: /etc/daos/certs/telemetry.crt
+#
+# # Key portion of Server Certificate
+# https_key: /etc/daos/certs/telemetry.key
#
#
## If desired, a set of client-side environment variables may be
diff --git a/utils/config/examples/daos_server_local.yml b/utils/config/examples/daos_server_local.yml
index ac5bb6ee808..237f9f27799 100644
--- a/utils/config/examples/daos_server_local.yml
+++ b/utils/config/examples/daos_server_local.yml
@@ -7,7 +7,9 @@ control_log_file: /tmp/daos_server.log
transport_config:
allow_insecure: true
-telemetry_port: 9191
+telemetry_config:
+ allow_insecure: true
+ telemetry_port: 9191
engines:
-
diff --git a/utils/config/examples/daos_server_mdonssd.yml b/utils/config/examples/daos_server_mdonssd.yml
index 8b73e53e431..1b46daf7351 100644
--- a/utils/config/examples/daos_server_mdonssd.yml
+++ b/utils/config/examples/daos_server_mdonssd.yml
@@ -29,7 +29,9 @@ control_metadata:
# # Key portion of Server Certificate
# key: /etc/daos/certs/server.key
-telemetry_port: 9191
+telemetry_config:
+ allow_insecure: true
+ telemetry_port: 9191
engines:
-
diff --git a/utils/config/examples/daos_server_tcp.yml b/utils/config/examples/daos_server_tcp.yml
index 38f40d7ec67..475baed1483 100644
--- a/utils/config/examples/daos_server_tcp.yml
+++ b/utils/config/examples/daos_server_tcp.yml
@@ -10,7 +10,9 @@ provider: ofi+tcp
control_log_mask: DEBUG
control_log_file: /tmp/daos_server.log
-telemetry_port: 9191
+telemetry_config:
+ allow_insecure: true
+ telemetry_port: 9191
## Transport Credentials Specifying certificates to secure communications
##
diff --git a/utils/config/examples/daos_server_ucx.yml b/utils/config/examples/daos_server_ucx.yml
index bd413af495e..9f29dbb5664 100644
--- a/utils/config/examples/daos_server_ucx.yml
+++ b/utils/config/examples/daos_server_ucx.yml
@@ -19,7 +19,9 @@ provider: ucx+dc_x
control_log_mask: INFO
control_log_file: /tmp/daos_server.log
-telemetry_port: 9191
+telemetry_config:
+ allow_insecure: true
+ telemetry_port: 9191
## Transport Credentials Specifying certificates to secure communications
##
diff --git a/utils/config/examples/daos_server_verbs.yml b/utils/config/examples/daos_server_verbs.yml
index 32146674739..fc199e52234 100644
--- a/utils/config/examples/daos_server_verbs.yml
+++ b/utils/config/examples/daos_server_verbs.yml
@@ -10,7 +10,9 @@ provider: ofi+verbs
control_log_mask: INFO
control_log_file: /tmp/daos_server.log
-telemetry_port: 9191
+telemetry_config:
+ allow_insecure: true
+ telemetry_port: 9191
## Transport Credentials Specifying certificates to secure communications
##