From ae9c924ffe129fe00ec80a0e2e83f4d665fad06b Mon Sep 17 00:00:00 2001
From: Alan Clucas
Date: Mon, 24 Jun 2024 10:35:15 +0100
Subject: [PATCH] some renames

Signed-off-by: Alan Clucas
---
 .spelling                                     |   1 -
 config/config.go                              |  12 +-
 docs/metrics-3.6.md                           |  31 +--
 docs/metrics.md                               | 176 +++++++++++-------
 workflow/controller/controller.go             |  14 +-
 workflow/metrics/instrument.go                |   2 +-
 workflow/metrics/metrics.go                   |   4 +-
 workflow/metrics/{options.go => modifiers.go} |  15 +-
 .../{options_test.go => modifiers_test.go}    |   8 +-
 9 files changed, 154 insertions(+), 109 deletions(-)
 rename workflow/metrics/{options.go => modifiers.go} (66%)
 rename workflow/metrics/{options_test.go => modifiers_test.go} (93%)

diff --git a/.spelling b/.spelling
index 1e4aabbe866c..df9fc644c217 100644
--- a/.spelling
+++ b/.spelling
@@ -195,7 +195,6 @@ parameterizing
 params
 pprof
 pre-commit
-prometheus
 rc2
 repo
 roadmap
diff --git a/config/config.go b/config/config.go
index 46e62c7a1163..3875d64e7433 100644
--- a/config/config.go
+++ b/config/config.go
@@ -245,10 +245,10 @@ type MySQLConfig struct {
 	Options map[string]string `json:"options,omitempty"`
 }
 
-// MetricOptions are options for an individual named metric to change their behaviour
-type MetricOption struct {
-	// Disable disables the emission of this metric completely
-	Disable bool `json:"disable,omitempty"`
+// MetricModifier modifies the behaviour of an individual named metric
+type MetricModifier struct {
+	// Disabled disables the emission of this metric completely
+	Disabled bool `json:"disabled,omitempty"`
 	// DisabledAttributes lists labels to remove from this metric, to reduce cardinality
 	DisabledAttributes []string `json:"disabledAttributes"`
 	// HistogramBuckets allow configuring of the buckets used in a histogram
@@ -280,8 +280,8 @@ type MetricsConfig struct {
 	IgnoreErrors bool `json:"ignoreErrors,omitempty"`
 	// Secure is a flag that starts the metrics servers using TLS, defaults to true
 	Secure *bool `json:"secure,omitempty"`
-	// Configure metrics by name
-	Options map[string]MetricOption `json:"options,omitempty"`
+	// Modifiers configure metrics by name
+	Modifiers map[string]MetricModifier `json:"modifiers,omitempty"`
 	// Temporality configures the temporality of the opentelemetry metrics.
 	// Valid values are Cumulative and Delta, defaulting to cumulative.
 	// This has no effect on prometheus metrics, which are always cumulative
diff --git a/docs/metrics-3.6.md b/docs/metrics-3.6.md
index 8a4a563c63d6..65850364ec4e 100644
--- a/docs/metrics-3.6.md
+++ b/docs/metrics-3.6.md
@@ -4,39 +4,34 @@
 
 Metrics have changed in 3.6.
 
 You can now retrieve metrics using the OpenTelemetry Protocol using the [OpenTelemetry collector](https://opentelemetry.io/docs/collector/), and this is the recommended mechanism.
 
-These notes explain the differences in using the prometheus `/metrics` endpoint to scrape metrics for a minimal effort upgrade. It is not recommended you follow this guide blindly, the new metrics have been introduced because they add value, and so they should be worth collecting and using.
-
-## TLS
-
-The Prometheus `/metrics` endpoint now has TLS enabled by default.
-
-To disable this set `metricsConfig.secure` to `false`.
+These notes explain the differences in using the Prometheus `/metrics` endpoint to scrape metrics for a minimal-effort upgrade. It is not recommended that you follow this guide blindly; the new metrics have been introduced because they add value, and so they should be worth collecting and using.
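
For scrape-based collection, the main change to account for when upgrading is the TLS default covered later in this page. A hypothetical Prometheus scrape job, assuming the default self-signed certificate; all names and targets are placeholders, not part of this patch:

```yaml
# Illustrative scrape job for the 3.6 metrics endpoint, which now serves
# TLS by default. Job name and target are examples only.
scrape_configs:
  - job_name: argo-workflows-controller            # placeholder job name
    scheme: https                                  # TLS is now on by default
    tls_config:
      insecure_skip_verify: true                   # assumes the default self-signed certificate
    static_configs:
      - targets: ["workflow-controller-metrics:9090"]  # placeholder service:port
```
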
 
 ## New metrics
 
 The following are new metrics:
 
 * `build_info`
-* `total_count`
-* `pods_total_count`
 * `controller_build_info`
 * `cronworkflows_triggered_total`
-* `workflowtemplate_triggered_total`
-* `workflowtemplate_runtime`
 * `k8s_request_duration`
+* `pods_total_count`
+* `pod_pending_count`
 * `queue_duration`
 * `queue_longest_running`
 * `queue_retries`
 * `queue_unfinished_work`
-* `pod_pending`
+* `total_count`
+* `workflowtemplate_runtime`
+* `workflowtemplate_triggered_total`
 
 and can be disabled with
 
 ```yaml
-metricsConfig:
-  options:
+metricsConfig: |
+  modifiers:
     build_info:
-      disable: true
+      disabled: true
+...
 ```
 
 ## Renamed metrics
 
@@ -52,6 +47,12 @@ If you are using these metrics in your recording rules, dashboards, or alerts, y
 
 ## Custom metrics
 
-Custom metric names and labels must be valid prometheus and OpenTelemetry names now. This prevents the use of `:`, which was usable in earlier versions of workflows
+Custom metric names and labels must be valid Prometheus and OpenTelemetry names now. This prevents the use of `:`, which was usable in earlier versions of workflows.
 
 Custom metrics, as defined by a workflow, could be defined as one type (say counter) in one workflow, and then as a histogram of the same name in a different workflow. This would work in 3.5 if the first usage of the metric had reached TTL and been deleted. This will no longer work in 3.6, and custom metrics may not be redefined. It doesn't really make sense to change a metric in this way, and the OpenTelemetry SDK prevents you from doing so.
+
+## TLS
+
+The Prometheus `/metrics` endpoint now has TLS enabled by default.
+
+To disable this, set `metricsConfig.secure` to `false`.
diff --git a/docs/metrics.md b/docs/metrics.md
index fa51d77df8e3..575ec739e7b5 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -7,34 +7,36 @@
 
 ## Introduction
 
-Argo emits a certain number of controller metrics that inform on the state of the controller at any given time. Furthermore,
-users can also define their own custom metrics to inform on the state of their Workflows.
+Argo emits a certain number of controller metrics that inform on the state of the controller at any given time.
+Furthermore, users can also define their own custom metrics to inform on the state of their Workflows.
 
-Custom Prometheus metrics can be defined to be emitted on a `Workflow`- and `Template`-level basis. These can be useful
-for many cases; some examples:
+Custom metrics can be defined to be emitted on a `Workflow`- and `Template`-level basis. These can be useful for many cases; some examples:
 
 - Keeping track of the duration of a `Workflow` or `Template` over time, and setting an alert if it goes beyond a threshold
 - Keeping track of the number of times a `Workflow` or `Template` fails over time
 - Reporting an important internal metric, such as a model training score or an internal error rate
 
-Emitting custom metrics with Argo is easy, but it's important to understand what makes a good Prometheus metric and the
-best way to define metrics in Argo to avoid problems such as [cardinality explosion](https://stackoverflow.com/questions/46373442/how-dangerous-are-high-cardinality-labels-in-prometheus).
+Emitting custom metrics with Argo is easy, but it's important to understand what makes a good metric and the best way to define metrics in Argo to avoid problems such as [cardinality explosion](https://stackoverflow.com/questions/46373442/how-dangerous-are-high-cardinality-labels-in-prometheus).
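
As an illustration of the cardinality point: prefer labels whose value sets are small and bounded. A minimal sketch of a Workflow-level metric, using the spec described later in this page; the metric name, labels, and values are illustrative, not built-ins:

```yaml
# Sits under a Workflow `spec`. Low cardinality: `model_name` and `phase`
# each take only a handful of values.
metrics:
  prometheus:
    - name: model_exec_time              # becomes argo_workflows_model_exec_time when scraped
      help: "Duration of model execution"
      labels:
        - key: model_name                # bounded: one value per model
          value: model_a
        - key: phase                     # bounded: e.g. train/validation/test
          value: validation
      gauge:
        realtime: false
        value: "{{workflow.duration}}"
# Avoid labels with unbounded values, such as `value: "{{workflow.name}}"`,
# which create a new series for every single workflow run.
```
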
-Metrics can be collected using the OpenTelemetry protocol or via prometheus compatible scraping.
+Metrics can be collected using the OpenTelemetry protocol or via Prometheus-compatible scraping.
 
 ## Metrics configuration
 
-It is possible to collect metrics via the OpenTelemetry protocol or via prometheus compatible scraping. Both of these mechanisms can be enabled at the same time, which could be useful if you'd like to migrate from one system to the other. Using multiple protocols at the same time is not intended for long term use however.
+It is possible to collect metrics via the OpenTelemetry protocol or via Prometheus-compatible scraping.
+Both of these mechanisms can be enabled at the same time, which could be useful if you'd like to migrate from one system to the other.
+Using multiple protocols at the same time is not intended for long-term use.
 
-OpenTelemetry is the recommended way of collecting metrics, and the OpenTelemetry collector can export metrics to prometheus via [the prometheus remote write exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/prometheusremotewriteexporter/README.md).
+OpenTelemetry is the recommended way of collecting metrics.
+The OpenTelemetry collector can export metrics to Prometheus via [the Prometheus remote write exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/prometheusremotewriteexporter/README.md).
 
 ### OpenTelemetry protocol
 
-To enable the OpenTelemetry protocol you must set the environment variable `OTEL_EXPORTER_OTLP_ENDPOINT` or `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT`. It will not default to the default value if left blank, instead no OpenTelemetry metrics will be emitted.
+To enable the OpenTelemetry protocol you must set the environment variable `OTEL_EXPORTER_OTLP_ENDPOINT` or `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT`.
+It will not be enabled if left blank, unlike some other implementations.
 
 You can configure the protocol using the environment variables documented in [standard environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/).
 
-The [configuration options](#common) in the controller ConfigMap `metricsTTL`, `options` and `temporality` affect the OpenTelemetry behavior, but the other parameters do not.
+The [configuration options](#common) in the controller ConfigMap `metricsTTL`, `modifiers` and `temporality` affect the OpenTelemetry behavior, but the other parameters do not.
 
 To use the [OpenTelemetry collector](https://opentelemetry.io/docs/collector/) you can configure it
 
@@ -70,7 +72,8 @@ metricsConfig: |
   secure: true
 ```
 
-The metric names emitted by this mechanism are prefixed with `argo_workflows_`. `Attributes` are exposed as prometheus `labels` of the same name.
+The metric names emitted by this mechanism are prefixed with `argo_workflows_`.
+`Attributes` are exposed as Prometheus `labels` of the same name.
 
 ### Common
 
@@ -80,9 +83,9 @@ You can adjust various elements of the metrics configuration by changing values
 metricsConfig: |
   # MetricsTTL sets how often custom metrics are cleared from memory. Default is "0", metrics are never cleared
   metricsTTL: "10m"
-  options:
+  modifiers:
     pod_missing:
-      disable: true
+      disabled: true
     cronworkflows_triggered_total:
       disabledAttributes:
         - name
@@ -90,18 +93,22 @@ metricsConfig: |
       histogramBuckets: [ 1.0, 2.0, 10.0 ]
 ```
 
-### Options
+### Modifiers
 
-Using options you can manipulate the metrics created by the workflow controller. These options apply to the built-in metrics and any custom metrics you create. Each option applies to the named metric only, and applies to all output methods.
+Using modifiers you can manipulate the metrics created by the workflow controller.
+These modifiers apply to the built-in metrics and any custom metrics you create.
+Each modifier applies to the named metric only, and to all output methods.
 
-`disable: true` will disable the emission of the metric from the system.
+`disabled: true` will disable the emission of the metric from the system.
 
 ```yaml
 disabledAttributes:
   - namespace
 ```
 
-Will disable the attribute (label) from being emitted. The metric will be emitted with the attribute missing, the remaining attributes will still be emitted with the values correctly aggregated. This can be used to reduce cardinality of metrics.
+Will disable the attribute (label) from being emitted.
+The metric will be emitted with the attribute missing; the remaining attributes will still be emitted, with the values correctly aggregated.
+This can be used to reduce the cardinality of metrics.
 
 ```yaml
 histogramBuckets:
   - 1.0
@@ -111,7 +118,8 @@ Will disable the attribute (label) from being emitted. The metric will be emitte
   - 10.0
 ```
 
-For histogram metrics only, this will change the boundary values for the histogram buckets. All values must be floating point numbers.
+For histogram metrics only, this will change the boundary values for the histogram buckets.
+All values must be floating point numbers.
 
 ## Metrics and metrics in Argo
 
 There are two kinds of metrics emitted by Argo: **controller metrics** and **custom metrics**.
 
 ### Controller metrics
 
 Metrics that inform on the state of the controller; i.e., they answer the question "What is the state of the controller right now?"
 Default controller metrics can be scraped from service ```workflow-controller-metrics``` at the endpoint ```<host>:9090/metrics```
 
@@ -124,9 +132,11 @@ Default controller metrics can be scraped from service ```workflow-controller-me
 
 ### Custom metrics
 
-Metrics that inform on the state of a Workflow, or a series of Workflows. These custom metrics are defined by the user in the Workflow spec.
+Metrics that inform on the state of a Workflow, or a series of Workflows.
+These custom metrics are defined by the user in the Workflow spec.
 
-Emitting custom metrics is the responsibility of the emitter owner. Since the user defines Workflows in Argo, the user is responsible for emitting metrics correctly.
+Emitting custom metrics is the responsibility of the emitter owner.
+Since the user defines Workflows in Argo, the user is responsible for emitting metrics correctly.
 
 Currently, custom metrics and their labels must be valid Prometheus and OpenTelemetry metric names, which limits them to alphanumeric characters and `_`.
 This applies even if you're only using OpenTelemetry for metrics.
 
@@ -134,7 +144,8 @@ This applies even if you're only using OpenTelemetry for metrics.
 
 ### What is and isn't a Prometheus metric
 
 Prometheus metrics should be thought of as ephemeral data points of running processes; i.e., they are the answer to
-the question "What is the state of my system _right now_?". Metrics should report things such as:
+the question "What is the state of my system _right now_?".
+Metrics should report things such as:
 
 - a counter of the number of times a workflow or step has failed, or
 - a gauge of workflow duration, or
 - an average model training error, or
 - a histogram of queue depth.
 
@@ -147,14 +158,13 @@ Metrics should report things such as:
 
 Aggregating the examples above over time could answer useful questions such as:
 
 - How has the failure rate of this workflow changed over time?
 - How has the duration of this workflow changed over time? Is the current workflow running for too long?
 - Is our model improving over time?
 
-Prometheus metrics should **not** be thought of as a store of data. Since metrics should only report the state of the system
-at the current time, they should not be used to report historical data such as:
+Prometheus metrics should **not** be thought of as a store of data.
+Since metrics should only report the state of the system at the current time, they should not be used to report historical data such as:
 
 - the status of an individual instance of a workflow, or
 - how long a particular instance of a step took to run.
 
-Metrics are also ephemeral, meaning there is no guarantee that they will be persisted for any amount of time. If you need
-a way to view and analyze historical data, consider the [workflow archive](workflow-archive.md) or reporting to logs.
+Metrics are also ephemeral, meaning there is no guarantee that they will be persisted for any amount of time.
+If you need a way to view and analyze historical data, consider the [workflow archive](workflow-archive.md) or reporting to logs.
 
 ### Counter, gauge and histogram
 
@@ -249,6 +259,14 @@
 
 A histogram recording how long each type of request took.
 
 This contains all the information in `k8s_request_total`, along with timings.
 
+#### `leader`
+
+This gauge indicates whether this workflow controller is the leader in a leader-elected controller setup.
+
+* It will be `1` if this is the leader, or if the controller is running in standalone mode with [`LEADER_ELECTION_DISABLE=true`](environment-variables.md#controller).
+* It will be `0` otherwise, in which case this controller is not actively running workflows.
+
 #### `log_messages`
 
 A count of log messages emitted by the controller by log level: `error`, `warn` and `info`.
 
@@ -259,15 +277,18 @@
 
 #### `operation_duration_seconds`
 
-A histogram of durations of operations. An operation is a single workflow reconciliation loop within the workflow-controller. It's the time for the controller to process a single workflow after it has been read from the cluster and is a measure of the performance of the controller affected by the complexity of the workflow.
+A histogram of durations of operations. An operation is a single workflow reconciliation loop within the workflow-controller.
+It's the time for the controller to process a single workflow after it has been read from the cluster, and is a measure of the performance of the controller as affected by the complexity of the workflow.
 
 This metric has no attributes.
 
-The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using an `histogramBuckets` option in the `metricsConfig` block.
+The environment variables `OPERATION_DURATION_METRIC_BUCKET_COUNT` and `MAX_OPERATION_TIME` configure the bucket sizes for this metric, unless they are specified using a `histogramBuckets` modifier in the `metricsConfig` block.
 
 #### `pods_gauge`
 
-A gauge of the number of workflow created pods currently in the cluster in each phase. It is possible for a workflow to start, but no pods be running (e.g. cluster is too busy to run them). This metric sheds light on actual work being done.
+A gauge of the number of workflow-created pods currently in the cluster in each phase.
+It is possible for a workflow to start but have no pods running (for example, if the cluster is too busy to run them).
+This metric sheds light on actual work being done.
 
 | attribute | explanation |
 |-----------|------------------------------|
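
As a usage sketch: once scraped, this gauge appears with the `argo_workflows_` prefix described earlier, so an alert on stuck work could be written as below. The rule name and threshold are illustrative, and the exact series name should be checked against your own `/metrics` output:

```yaml
# Hypothetical Prometheus alerting rule; the series name assumes the
# `argo_workflows_` prefix and the `phase` attribute documented here.
groups:
  - name: argo-workflows-pods                # placeholder rule group name
    rules:
      - alert: WorkflowPodsPending
        expr: argo_workflows_pods_gauge{phase="Pending"} > 10
        for: 10m
        annotations:
          summary: "Many workflow pods have been Pending for over 10 minutes"
```
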
@@ -275,7 +296,8 @@ A gauge of the number of workflow created pods currently in the cluster in each
 
 #### `pod_missing`
 
-A counter of pods that were not seen. E.g. by being deleted by Kubernetes. You should only see this under high load.
+A counter of pods that were not seen, for example because they were deleted by Kubernetes.
+You should only see this under high load.
 
 | attribute | explanation |
 |--------------------|----------------------------------------|
 
 `recently_started` is controlled by the [environment variable](environment-variables.md) `RECENTLY_STARTED_POD_DURATION` and defaults to 10 seconds.
 
+#### `pod_pending_count`
+
+A counter of pods that have been seen in the Pending state.
+
+| attribute | explanation |
+|--------------------|-------------------------------------------|
+| `reason` | Summary of the Kubernetes Reason for the pod being Pending. |
+| `namespace` | The namespace in which the pod is running |
+
+This metric ignores the `PodInitializing` reason and does not count it.
+The `reason` attribute is the part of the Reason message before the `:`.
+This is not directly controlled by the workflow controller, so it is possible for some pod pending states to be missed.
+
 #### `pods_total_count`
 
-A gauge of the number of pods which have entered each phase and then observed by the controller. This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed.
+A gauge of the number of pods which have entered each phase and then been observed by the controller.
+This is not directly controlled by the workflow controller, so it is possible for some pod phases to be missed.
 
 | attribute | explanation |
 |-------------|-------------------------------------------|
 
@@ -295,7 +331,8 @@ A gauge of the number of pods which have entered each phase and then observed by
 
 #### `queue_adds_count`
 
-A counter of additions to the work queues inside the controller. The rate of this shows how busy that area of the controller is.
+A counter of additions to the work queues inside the controller.
+The rate of this shows how busy that area of the controller is.
 
 | attribute | explanation |
 |---------------|-------------------|
 
@@ -312,7 +349,8 @@ This and associated metrics are all directly sourced from the [client-go workque
 
 #### `queue_depth_gauge`
 
-A gauge of the current depth of the queues. If these get large then the workflow controller is not keeping up with the cluster.
+A gauge of the current depth of the queues.
+If these get large then the workflow controller is not keeping up with the cluster.
 
 See [queue adds count](#queue_adds_count) for details.
 
@@ -363,7 +401,8 @@ See [queue adds count](#queue_adds_count) for details.
 
 #### `workflow_condition`
 
-A gauge of the number of workflows with different conditions. This will tell you the number of workflows with running pods.
+A gauge of the number of workflows with different conditions.
+This will tell you the number of workflows with running pods.
 
 | attribute | explanation |
 |-----------|-------------------------------------------------|
 
@@ -372,7 +411,9 @@ A gauge of the number of workflows with different conditions. This will tell you
 
 #### `workflowtemplate_runtime`
 
-A histogram of the duration of workflows using `workflowTemplateRef` only, as they enter each phase. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage. Records time between entering the `Running` phase and completion, so does not include any time in `Pending`.
+A histogram of the duration of workflows using `workflowTemplateRef` only, as they enter each phase.
+Counts both WorkflowTemplate and ClusterWorkflowTemplate usage.
+Records time between entering the `Running` phase and completion, so does not include any time in `Pending`.
 
 | attribute | explanation |
 |-----------------|--------------------------------------------------------------|
 
 #### `workflowtemplate_triggered_total`
 
-A counter of workflows using `workflowTemplateRef` only, as they enter each phase. Counts both WorkflowTemplate and ClusterWorkflowTemplate usage.
+A counter of workflows using `workflowTemplateRef` only, as they enter each phase.
+Counts both WorkflowTemplate and ClusterWorkflowTemplate usage.
 
 | attribute | explanation |
 |-----------------|--------------------------------------------------------------|
 
@@ -397,56 +439,56 @@ Please see the [Prometheus docs on metric types](https://prometheus.io/docs/conc
 
 ### How metrics work in Argo
 
-In order to analyze the behavior of a workflow over time, we need to be able to link different instances
-(i.e. individual executions) of a workflow together into a "series" for the purposes of emitting metrics. We do so by linking them together
-with the same metric descriptor.
+In order to analyze the behavior of a workflow over time, you need to be able to link different instances (individual executions) of a workflow together into a "series" for the purposes of emitting metrics.
+You can do this by linking them together with the same metric descriptor.
 
-In Prometheus, a metric descriptor is defined as a metric's name and its key-value labels. For example, for a metric
-tracking the duration of model execution over time, a metric descriptor could be:
+In Prometheus, a metric descriptor is defined as a metric's name and its key-value labels.
+For example, for a metric tracking the duration of model execution over time, a metric descriptor could be:
 
 `argo_workflows_model_exec_time{model_name="model_a",phase="validation"}`
 
-This metric then represents the amount of time that "Model A" took to train in the phase "Validation". It is important
-to understand that the metric name _and_ its labels form the descriptor: `argo_workflows_model_exec_time{model_name="model_b",phase="validation"}`
-is a different metric (and will track a different "series" altogether).
+This metric then represents the amount of time that "Model A" took to train in the phase "Validation".
+It is important to understand that the metric name _and_ its labels form the descriptor: `argo_workflows_model_exec_time{model_name="model_b",phase="validation"}` is a different metric (and will track a different "series" altogether).
 
-Now, whenever we run our first workflow that validates "Model A" a metric with the amount of time it took it to do so will
-be created and emitted. For each subsequent time that this happens, no new metrics will be emitted and the _same_ metric
-will be updated with the new value. Since, in effect, we are interested on the execution time of "validation" of "Model A"
-over time, we are no longer interested in the previous metric and can assume it has already been scraped.
+Now, whenever you run a workflow that validates "Model A", a metric with the amount of time it took to do so will be created and emitted.
+For each subsequent time that this happens, no new metrics will be emitted and the _same_ metric will be updated with the new value.
+Since you are interested in the execution time of "validation" of "Model A" over time, you are no longer interested in the previous metric and can assume it has already been stored.
 
-In summary, whenever you want to track a particular metric over time, you should use the same metric name _and_ metric
-labels wherever it is emitted. This is how these metrics are "linked" as belonging to the same series.
+In summary, whenever you want to track a particular metric over time, you should use the same metric name _and_ metric labels wherever it is emitted.
+This is how these metrics are "linked" as belonging to the same series.
 
 ### Grafana Dashboard for Argo Controller Metrics
 
 Please see the [Argo Workflows metrics](https://grafana.com/grafana/dashboards/20348-argo-workflows-metrics/) Grafana dashboard.
 
-## Defining metrics
+## Defining custom metrics
 
-Metrics are defined in-place on the Workflow/Step/Task where they are emitted from. Metrics are always processed _after_
-the Workflow/Step/Task completes, with the exception of [real-time metrics](#real-time-metrics).
+Metrics are defined in-place on the Workflow/Step/Task where they are emitted from.
+Metrics are always processed _after_ the Workflow/Step/Task completes, with the exception of [real-time metrics](#real-time-metrics).
+Custom metrics are defined under a `prometheus` tag in the YAML for legacy reasons.
+They are emitted over all active protocols.
 
-Metric definitions **must** include a `name` and a `help` doc string. They can also include any number of `labels` (when
-defining labels avoid cardinality explosion). Metrics with the same `name` **must always** use the same exact `help` string,
-having different metrics with the same name, but with a different `help` string will cause an error (this is a Prometheus requirement).
+Metric definitions **must** include a `name` and a `help` doc string.
+They can also include any number of `labels` (when defining labels, avoid cardinality explosion).
+Metrics with the same `name` **must always** use the same exact `help` string; having different metrics with the same name but a different `help` string will cause an error (this is a Prometheus requirement).
+Metrics with the same `name` may not change what type of metric they are.
 
-All metrics can also be conditionally emitted by defining a `when` clause. This `when` clause works the same as elsewhere
-in a workflow.
+All metrics can also be conditionally emitted by defining a `when` clause.
+This `when` clause works the same as elsewhere in a workflow.
 
-A metric must also have a type, it can be one of `gauge`, `histogram`, and `counter` ([see below](#metric-spec)). Within
-the metric type a `value` must be specified. This value can be either a literal value of be an [Argo variable](variables.md).
+A metric must also have a type; it can be one of `gauge`, `histogram`, and `counter` ([see below](#metric-spec)).
+Within the metric type a `value` must be specified. This value can be either a literal value or an [Argo variable](variables.md).
 
 When defining a `histogram`, `buckets` must also be provided (see below).
 
 [Argo variables](variables.md) can be included anywhere in the metric spec, such as in `labels`, `name`, `help`, `when`, etc.
 
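Putting those pieces together, a conditional counter might look like the following sketch; the metric name and label are illustrative, not built-ins:

```yaml
# Hypothetical Template-level counter that only fires when the step fails,
# combining `when`, a label, and a literal counter value.
metrics:
  prometheus:
    - name: step_failure_count       # valid name: alphanumerics and `_` only
      help: "Count of step failures" # must be identical wherever this name is used
      labels:
        - key: template
          value: my-template         # hypothetical template name
      when: "{{status}} == Failed"   # emitted only when the condition holds
      counter:
        value: "1"                   # a literal value; an Argo variable also works here
```
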
-Metric names can only contain alphanumeric characters, `_`, and `:`.
+Metric names can only contain alphanumeric characters and `_`, for compatibility with both Prometheus and OpenTelemetry, even if only one of these protocols is in use.
 
 ### Metric Spec
 
-In Argo you can define a metric on the `Workflow` level or on the `Template` level. Here is an example of a `Workflow`
-level Gauge metric that will report the Workflow duration time:
+In Argo you can define a metric on the `Workflow` level or on the `Template` level.
+Here is an example of a `Workflow`-level Gauge metric that will report the Workflow duration time:
 
 ```yaml
 apiVersion: argoproj.io/v1alpha1
@@ -551,8 +593,9 @@ Finally, an example of a `Template`-level Histogram metric that tracks an intern
 
 ### Real-Time Metrics
 
-Argo supports a limited number of real-time metrics. These metrics are emitted in real-time, beginning when the step execution starts
-and ending when it completes. Real-time metrics are only available on Gauge type metrics and with a [limited number of variables](variables.md#real-time-metrics).
+Argo supports a limited number of real-time metrics.
+These metrics are emitted in real-time, beginning when the step execution starts and ending when it completes.
+Real-time metrics are only available on Gauge type metrics and with a [limited number of variables](variables.md#real-time-metrics).
 
 To define a real-time metric simply add `realtime: true` to a gauge metric with a valid real-time variable. For example:
 
 ```yaml
   gauge:
     realtime: true
     value: "{{duration}}"
 ```
 
 ## Metrics endpoint
 
-By default, metrics are emitted by the workflow-controller on port 9090 on the `/metrics` path. By port-forwarding to the pod you can view the metrics in your browser at `http://localhost:9090/metrics`:
+By default, metrics are emitted by the workflow-controller on port 9090 on the `/metrics` path.
+By port-forwarding to the pod you can view the metrics in your browser at `http://localhost:9090/metrics`:
 
 `kubectl -n argo port-forward deploy/workflow-controller 9090:9090`
 
diff --git a/workflow/controller/controller.go b/workflow/controller/controller.go
index 06882229dc84..d37f18e41987 100644
--- a/workflow/controller/controller.go
+++ b/workflow/controller/controller.go
@@ -1353,12 +1353,12 @@ func (wfc *WorkflowController) getMaxStackDepth() int {
 
 func (wfc *WorkflowController) getMetricsServerConfig() *metrics.Config {
 	// Metrics config
-	options := make(map[string]metrics.MetricOption)
-	for name, option := range wfc.Config.MetricsConfig.Options {
-		options[name] = metrics.MetricOption{
-			Disable:            option.Disable,
-			DisabledAttributes: option.DisabledAttributes,
-			HistogramBuckets:   option.HistogramBuckets,
+	modifiers := make(map[string]metrics.Modifier)
+	for name, modifier := range wfc.Config.MetricsConfig.Modifiers {
+		modifiers[name] = metrics.Modifier{
+			Disabled:           modifier.Disabled,
+			DisabledAttributes: modifier.DisabledAttributes,
+			HistogramBuckets:   modifier.HistogramBuckets,
 		}
 	}
 
@@ -1370,7 +1370,7 @@ func (wfc *WorkflowController) getMetricsServerConfig() *metrics.Config {
 		IgnoreErrors: wfc.Config.MetricsConfig.IgnoreErrors,
 		// Default to true for 3.6
 		Secure: wfc.Config.MetricsConfig.GetSecure(true),
-		Options:     options,
+		Modifiers:   modifiers,
 		Temporality: wfc.Config.MetricsConfig.Temporality,
 	}
 	return &metricsConfig
diff --git a/workflow/metrics/instrument.go b/workflow/metrics/instrument.go
index 89f8477b8f41..65bc5bf21791 100644
--- a/workflow/metrics/instrument.go
+++ b/workflow/metrics/instrument.go
@@ -142,7 +142,7 @@ func (m *Metrics) createInstrument(instType instrumentType, name, desc, unit str
 }
 
 func (m *Metrics) buckets(name string, defaultBuckets []float64) []float64 {
-	if opts, ok := m.config.Options[name]; ok {
+	if opts, ok := m.config.Modifiers[name]; ok {
 		if len(opts.HistogramBuckets) > 0 {
 			buckets := opts.HistogramBuckets
 			sort.Float64s(buckets)
diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go
index a874bfa8194b..8e10dced87cc 100644
--- a/workflow/metrics/metrics.go
+++ b/workflow/metrics/metrics.go
@@ -27,8 +27,8 @@ type Config struct {
 	TTL            time.Duration
 	IgnoreErrors   bool
 	Secure         bool
-	DefaultOptions MetricOption
-	Options        map[string]MetricOption
+	DefaultOptions Modifier
+	Modifiers      map[string]Modifier
 	Temporality    wfconfig.MetricsTemporality
 }
 
diff --git a/workflow/metrics/options.go b/workflow/metrics/modifiers.go
similarity index 66%
rename from workflow/metrics/options.go
rename to workflow/metrics/modifiers.go
index 1c185c704a86..984c21867361 100644
--- a/workflow/metrics/options.go
+++ b/workflow/metrics/modifiers.go
@@ -5,8 +5,9 @@ import (
 	metricsdk "go.opentelemetry.io/otel/sdk/metric"
 )
 
-type MetricOption struct {
-	Disable            bool
+// Modifier holds options to change the behaviour for a single metric
+type Modifier struct {
+	Disabled           bool
 	DisabledAttributes []string
 	HistogramBuckets   []float64
 }
@@ -14,13 +15,13 @@ type MetricOption struct {
 // Create an opentelemetry 'view' which disables whole metrics or aggregates across labels
 func view(config *Config) metricsdk.Option {
 	views := make([]metricsdk.View, 0)
-	for metric, opt := range config.Options {
-		if opt.Disable {
+	for metric, modifier := range config.Modifiers {
+		if modifier.Disabled {
 			views = append(views, metricsdk.NewView(metricsdk.Instrument{Name: metric},
 				metricsdk.Stream{Aggregation: metricsdk.AggregationDrop{}}))
-		} else if len(opt.DisabledAttributes) > 0 {
- keys := make([]attribute.Key, len(opt.DisabledAttributes)) - for i, key := range opt.DisabledAttributes { + } else if len(modifier.DisabledAttributes) > 0 { + keys := make([]attribute.Key, len(modifier.DisabledAttributes)) + for i, key := range modifier.DisabledAttributes { keys[i] = attribute.Key(key) } views = append(views, metricsdk.NewView(metricsdk.Instrument{Name: metric}, diff --git a/workflow/metrics/options_test.go b/workflow/metrics/modifiers_test.go similarity index 93% rename from workflow/metrics/options_test.go rename to workflow/metrics/modifiers_test.go index 1914d5b8cfc0..af3532ece93f 100644 --- a/workflow/metrics/options_test.go +++ b/workflow/metrics/modifiers_test.go @@ -11,9 +11,9 @@ import ( func TestViewDisable(t *testing.T) { // Same metric as TestMetrics, but disabled by a view m, te, err := createTestMetrics(&Config{ - Options: map[string]MetricOption{ + Modifiers: map[string]Modifier{ nameOperationDuration: { - Disable: true, + Disabled: true, }, }, }) @@ -27,7 +27,7 @@ func TestViewDisable(t *testing.T) { func TestViewDisabledAttributes(t *testing.T) { // Disable the error cause label m, te, err := createTestMetrics(&Config{ - Options: map[string]MetricOption{ + Modifiers: map[string]Modifier{ nameErrorCount: { DisabledAttributes: []string{labelErrorCause}, }, @@ -53,7 +53,7 @@ func TestViewHistogramBuckets(t *testing.T) { // Same metric as TestMetrics, but buckets changed bounds := []float64{1.0, 3.0, 5.0, 10.0} m, te, err := createTestMetrics(&Config{ - Options: map[string]MetricOption{ + Modifiers: map[string]Modifier{ nameOperationDuration: { HistogramBuckets: bounds, },
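
Taken together, the user-facing effect of this patch on the controller ConfigMap is sketched below; an illustrative before and after, using `pod_missing` as an example metric:

```yaml
# Before this patch (`options` / `disable`):
metricsConfig: |
  options:
    pod_missing:
      disable: true

# After this patch (`modifiers` / `disabled`):
metricsConfig: |
  modifiers:
    pod_missing:
      disabled: true
```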