diff --git a/charts/pyrometer/README.md b/charts/pyrometer/README.md index 80c51fed4..134887e07 100644 --- a/charts/pyrometer/README.md +++ b/charts/pyrometer/README.md @@ -1,3 +1,27 @@ ## Pyrometer chart A chart to deploy the [pyrometer](https://gitlab.com/tezos-kiln/pyrometer) Tezos monitoring tool. + +Pass a complete pyrometer configuration with the `config` key in `values.yaml`; it will be transparently applied to pyrometer. + +### Prometheus exporter + +Pyrometer is a self-sustaining tool that manages its own alerts and alerting channels. + +Quoting pyrometer [architecture doc](https://gitlab.com/tezos-kiln/pyrometer/-/blob/main/doc/monitoring.md): + +> Primary installation target for initial monitoring implementation is a +personal computer. Consequently, implementation should prioritize +simplicity when it comes to number of individual, isolated components, +processes, their runtime dependencies, +administration/configuration. + + +The Prometheus exporter for Pyrometer consumes pyrometer events using webhooks and monitors only one of them: baker health status. It then aggregates the number of unhealthy bakers and exposes this as a prometheus metric. + +The ServiceMonitor and PrometheusRule are also included in the chart. + +This gives you: + +* the concept of an active alert that can be fed into an incident management system such as pagerduty. +* the ability to monitor a baker baking for several addresses, where it is not desirable to alert for an individual unhealthy address, but only when all the configured bakers are unhealthy. The threshold is configurable in the chart. 
diff --git a/charts/pyrometer/scripts/pyrometer_exporter.py b/charts/pyrometer/scripts/pyrometer_exporter.py new file mode 100644 index 000000000..4c2b61fbe --- /dev/null +++ b/charts/pyrometer/scripts/pyrometer_exporter.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +from flask import Flask, request, jsonify +import requests +import datetime + +import logging +log = logging.getLogger('werkzeug') +log.setLevel(logging.ERROR) + +application = Flask(__name__) + +unhealthy_bakers = set() + +@application.route('/pyrometer_webhook', methods=['POST']) +def pyrometer_webhook(): + ''' + Receive the pyrometer events posted as JSON and track, in the in-memory set unhealthy_bakers, which bakers are currently unhealthy + ''' + for msg in request.get_json(): + if msg["kind"] == "baker_unhealthy": + print(f"Baker {msg['baker']} is unhealthy") + unhealthy_bakers.add(msg["baker"]) + if msg["kind"] == "baker_recovered": + print(f"Baker {msg['baker']} recovered") + unhealthy_bakers.discard(msg["baker"])  # discard, not remove: a recovery for a baker that was never recorded (e.g. after an exporter restart) must not raise KeyError + + return "Webhook received" + +@application.route('/metrics', methods=['GET']) +def prometheus_metrics(): + ''' + Prometheus endpoint + ''' + return f'''# total number of monitored bakers that are currently unhealthy +pyrometer_unhealthy_bakers_total {len(unhealthy_bakers)} +''' + +if __name__ == "__main__": + application.run(host = "0.0.0.0", port = 31732, debug = False) diff --git a/charts/pyrometer/templates/deployment.yaml b/charts/pyrometer/templates/deployment.yaml index 6b133078d..ea8f73171 100644 --- a/charts/pyrometer/templates/deployment.yaml +++ b/charts/pyrometer/templates/deployment.yaml @@ -30,6 +30,18 @@ spec: volumeMounts: - name: config-volume mountPath: /config/ + - name: prom-exporter + image: {{ .Values.tezos_k8s_images.utils }} + ports: + - name: metrics + containerPort: 31732 + protocol: TCP + command: + - /usr/local/bin/python + args: + - "-c" + - | +{{ tpl ($.Files.Get (print "scripts/pyrometer_exporter.py")) $ | indent 12 }} volumes: - name: config-volume configMap: diff --git a/charts/pyrometer/templates/prometheusrule.yaml 
b/charts/pyrometer/templates/prometheusrule.yaml new file mode 100644 index 000000000..59c876ba6 --- /dev/null +++ b/charts/pyrometer/templates/prometheusrule.yaml @@ -0,0 +1,20 @@ +{{- if .Values.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: +{{- toYaml .Values.prometheusRule.labels | nindent 4 }} + name: baker-external-monitor-alerter +spec: + groups: + - name: pyrometer.rules + rules: + - alert: BakersUnhealthy + annotations: + description: '{{ .Values.prometheusRule.numberOfUnhealthyBakersAlertThreshold }} or more unhealthy bakers' + summary: "{{ .Values.prometheusRule.numberOfUnhealthyBakersAlertThreshold }} or more unhealthy Tezos bakers according to Pyrometer external monitoring" + expr: pyrometer_unhealthy_bakers_total{namespace="{{ .Release.Namespace }}"} >= {{ .Values.prometheusRule.numberOfUnhealthyBakersAlertThreshold }} + for: 1m + labels: + severity: critical +{{- end }} diff --git a/charts/pyrometer/templates/service.yaml b/charts/pyrometer/templates/service.yaml index 25c40ada4..5fae3dbd2 100644 --- a/charts/pyrometer/templates/service.yaml +++ b/charts/pyrometer/templates/service.yaml @@ -3,6 +3,8 @@ kind: Service metadata: name: pyrometer namespace: {{ .Release.Namespace }} + labels: + app: pyrometer spec: type: NodePort ports: @@ -10,5 +12,9 @@ spec: targetPort: http protocol: TCP name: http + - port: 31732 + targetPort: metrics + protocol: TCP + name: metrics selector: app: pyrometer diff --git a/charts/pyrometer/templates/servicemonitor.yaml b/charts/pyrometer/templates/servicemonitor.yaml new file mode 100644 index 000000000..27831e347 --- /dev/null +++ b/charts/pyrometer/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +{{- if .Values.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app: pyrometer + name: pyrometer-service-monitor + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - interval: 15s + port: metrics + 
path: /metrics + selector: + matchLabels: + app: pyrometer +{{- end }} diff --git a/charts/pyrometer/values.yaml b/charts/pyrometer/values.yaml index f7cc943a4..fff2f50b7 100644 --- a/charts/pyrometer/values.yaml +++ b/charts/pyrometer/values.yaml @@ -1,7 +1,15 @@ -# Pass below the pyrometer config, in yaml format (will be converted to toml) -config: {} images: pyrometer: registry.gitlab.com/tezos-kiln/pyrometer:latest +tezos_k8s_images: + utils: tezos-k8s-utils:dev +# Pass below the pyrometer config, in yaml format +config: + node_monitor: + nodes: + - http://tezos-node-rpc:8732 + webhook: + enabled: true + url: http://127.0.0.1:31732/pyrometer_webhook ingress: enabled: false className: "" @@ -13,3 +21,12 @@ ingress: # - secretName: chart-example-tls # hosts: # - chart-example.local + +# Prometheus Operator is required in your cluster in order to enable +# serviceMonitor and prometheusRule below. +serviceMonitor: + enabled: false +prometheusRule: + enabled: false + numberOfUnhealthyBakersAlertThreshold: 1 + labels: {}