Skip to content

Commit

Permalink
feat: opentelemetry metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Alan Clucas <[email protected]>
  • Loading branch information
Joibel committed Jun 21, 2024
1 parent 6201d75 commit 51843cc
Show file tree
Hide file tree
Showing 65 changed files with 3,199 additions and 1,502 deletions.
5 changes: 5 additions & 0 deletions .spelling
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ CRDs
CloudSQL
ClusterRoleBinding
ClusterRoles
ClusterWorkflowTemplate
Codespaces
ConfigMap
ConfigMaps
Expand Down Expand Up @@ -92,6 +93,7 @@ OAuth
OAuth2
Okta
OpenAPI
OpenTelemetry
PDBs
PProf
PVCs
Expand Down Expand Up @@ -120,6 +122,7 @@ Welch
WorkflowTemplate
WorkflowTemplates
a.m.
alpha-numerics
anded
apis
architecting
Expand Down Expand Up @@ -193,6 +196,7 @@ parameterizing
params
pprof
pre-commit
prometheus
rc2
repo
roadmap
Expand Down Expand Up @@ -247,4 +251,5 @@ webHDFS
webhook
webhooks
workflow-controller-configmap
workqueue
yaml
18 changes: 9 additions & 9 deletions cmd/workflow-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,17 @@ func NewRootCommand() *cobra.Command {
if err != nil {
return err
}
// start a controller on instances of our custom resource
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

version := argo.GetVersion()
config = restclient.AddUserAgent(config, fmt.Sprintf("argo-workflows/%s argo-controller", version.Version))
config.Burst = burst
config.QPS = qps

logs.AddK8SLogTransportWrapper(config)
metrics.AddMetricsTransportWrapper(config)
metrics.AddMetricsTransportWrapper(ctx, config)

namespace, _, err := clientConfig.Namespace()
if err != nil {
Expand All @@ -106,10 +110,6 @@ func NewRootCommand() *cobra.Command {
managedNamespace = namespace
}

// start a controller on instances of our custom resource
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

wfController, err := controller.NewWorkflowController(ctx, config, kubeclientset, wfclientset, namespace, managedNamespace, executorImage, executorImagePullPolicy, logFormat, configMap, executorPlugins)
errors.CheckError(err)

Expand All @@ -118,7 +118,7 @@ func NewRootCommand() *cobra.Command {
log.Info("Leader election is turned off. Running in single-instance mode")
log.WithField("id", "single-instance").Info("starting leading")
go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podCleanupWorkers, cronWorkflowWorkers)
go wfController.RunMetricsServer(ctx, false)
go wfController.RunPrometheusServer(ctx, false)
} else {
nodeID, ok := os.LookupEnv("LEADER_ELECTION_IDENTITY")
if !ok {
Expand All @@ -133,7 +133,7 @@ func NewRootCommand() *cobra.Command {
// for controlling the dummy metrics server
dummyCtx, dummyCancel := context.WithCancel(context.Background())
defer dummyCancel()
go wfController.RunMetricsServer(dummyCtx, true)
go wfController.RunPrometheusServer(dummyCtx, true)

go leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
Lock: &resourcelock.LeaseLock{
Expand All @@ -148,12 +148,12 @@ func NewRootCommand() *cobra.Command {
OnStartedLeading: func(ctx context.Context) {
dummyCancel()
go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podCleanupWorkers, cronWorkflowWorkers)
go wfController.RunMetricsServer(ctx, false)
go wfController.RunPrometheusServer(ctx, false)
},
OnStoppedLeading: func() {
log.WithField("id", nodeID).Info("stopped leading")
cancel()
go wfController.RunMetricsServer(dummyCtx, true)
go wfController.RunPrometheusServer(dummyCtx, true)
},
OnNewLeader: func(identity string) {
log.WithField("leader", identity).Info("new leader")
Expand Down
26 changes: 25 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,24 @@ type MySQLConfig struct {
Options map[string]string `json:"options,omitempty"`
}

// MetricOption holds the settings for an individual named metric (the key of
// MetricsConfig.Options) that change how that metric behaves.
type MetricOption struct {
	// Disable disables the emission of this metric completely.
	Disable bool `json:"disable,omitempty"`
	// DisabledAttributes lists the attributes (labels) to remove from this
	// metric in order to save on cardinality.
	// Tagged omitempty, like the other optional fields, so an unset list is
	// left out of the serialized config instead of appearing as null.
	DisabledAttributes []string `json:"disabledAttributes,omitempty"`
	// HistogramBuckets configures the buckets used in a histogram.
	// It has no effect on non-histogram metrics.
	HistogramBuckets []float64 `json:"histogramBuckets,omitempty"`
}

// MetricsTemporality names the temporality used for the opentelemetry
// metrics, as set through MetricsConfig.Temporality.
type MetricsTemporality string

// Valid values for MetricsTemporality.
const (
	// MetricsTemporalityCumulative selects cumulative temporality.
	MetricsTemporalityCumulative = MetricsTemporality("Cumulative")
	// MetricsTemporalityDelta selects delta temporality.
	MetricsTemporalityDelta = MetricsTemporality("Delta")
)

// MetricsConfig defines a config for a metrics server
type MetricsConfig struct {
// Enabled controls metric emission. Default is true, set "enabled: false" to turn off
Expand All @@ -260,8 +278,14 @@ type MetricsConfig struct {
Port int `json:"port,omitempty"`
// IgnoreErrors is a flag that instructs prometheus to ignore metric emission errors
IgnoreErrors bool `json:"ignoreErrors,omitempty"`
// Secure is a flag that starts the metrics servers using TLS
// Secure is a flag that starts the metrics servers using TLS, defaults to true
Secure *bool `json:"secure,omitempty"`
// Configure metrics by name
Options map[string]MetricOption `json:"options,omitempty"`
// Temporality configures the temporality of the opentelemetry metrics.
// Valid values are Cumulative and Delta, defaulting to cumulative.
// This has no effect on prometheus metrics, which are always cumulative
Temporality MetricsTemporality `json:"temporality,omitempty"`
}

func (mc MetricsConfig) GetSecure(defaultValue bool) bool {
Expand Down
81 changes: 81 additions & 0 deletions docs/metrics-3.6.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Metrics upgrade notes

Metrics have changed between 3.5 and 3.6.

You can now retrieve metrics over the OpenTelemetry Protocol using the [OpenTelemetry collector](https://opentelemetry.io/docs/collector/), and this is the recommended mechanism.

These notes explain the differences when using the prometheus `/metrics` endpoint to scrape metrics, for a minimal-effort upgrade. It is not recommended that you follow this guide blindly: the new metrics have been introduced because they add value, and so they should be worth collecting and using.

## TLS

The prometheus `/metrics` endpoint defaults to TLS on.

To disable this set `.metricsConfig.secure` to `false`.

## New metrics

The following are new metrics:

* `build_info`
* `total_count`
* `pods_total_count`
* `controller_build_info`
* `cronworkflows_triggered_total`
* `workflowtemplate_triggered_total`
* `workflowtemplate_runtime`
* `k8s_request_duration`
* `queue_duration`
* `queue_longest_running`
* `queue_retries`
* `queue_unfinished_work`
* `pod_pending`

and can be disabled with

```yaml
metricsConfig:
options:
build_info:
disable: true
total_count:
disable: true
pods_total_count:
disable: true
controller_build_info:
disable: true
cronworkflows_triggered_total:
disable: true
workflowtemplate_triggered_total:
disable: true
workflowtemplate_runtime:
disable: true
k8s_request_duration:
disable: true
queue_duration:
disable: true
queue_longest_running:
disable: true
queue_retries:
disable: true
queue_unfinished_work:
disable: true
pod_pending:
disable: true
```

## Renamed metrics

If you are using these metrics in your recording rules, dashboards, or alerts, you will need to use their new names after the upgrade:

| Old name | New name |
|------------------------------------|------------------------------------|
| `argo_workflows_count` | `argo_workflows_gauge` |
| `argo_workflows_pods_count` | `argo_workflows_pods_gauge` |
| `argo_workflows_queue_depth_count` | `argo_workflows_queue_depth_gauge` |
| `log_messages` | `argo_workflows_log_messages` |

## Custom metrics

Custom metric names and labels must be valid prometheus and OpenTelemetry names now. This prevents the use of `:`, which was usable in earlier versions of workflows.

Custom metrics, as defined by a workflow, could be defined as one type (say counter) in one workflow, and then as a histogram of the same name in a different workflow. This would work in 3.5 if the first usage of the metric had reached TTL and been deleted. This will no longer work in 3.6, and custom metrics may not be redefined. It doesn't really make sense to change a metric in this way, and the OpenTelemetry SDK prevents you from doing so.
Loading

0 comments on commit 51843cc

Please sign in to comment.