From 329a81f0e28ceaf684f8c6623d723fa48eb9276a Mon Sep 17 00:00:00 2001 From: pancho horrillo Date: Tue, 5 Mar 2019 17:02:07 +0100 Subject: [PATCH] auditor.go: don't flood storage with alerts when error in getting proofs Co-authored-by: iknite Co-authored-by: Jose Luis Lucas --- .../files/grafana/dashboards/QED.json | 378 +++++++++++------- gossip/auditor/auditor.go | 4 +- gossip/metrics/metrics.go | 9 + 3 files changed, 240 insertions(+), 151 deletions(-) diff --git a/deploy/aws/provision/files/grafana/dashboards/QED.json b/deploy/aws/provision/files/grafana/dashboards/QED.json index c3351a743..e696f5925 100644 --- a/deploy/aws/provision/files/grafana/dashboards/QED.json +++ b/deploy/aws/provision/files/grafana/dashboards/QED.json @@ -16,140 +16,9 @@ "gnetId": null, "graphTooltip": 2, "id": 3, - "iteration": 1551789062746, + "iteration": 1551801181841, "links": [], "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 34, - "title": "General", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "fill": 1, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(qed_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "QEDs", - "refId": "E" - }, - { - "expr": "sum(qed_sender_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "senders", - "refId": "A" - }, - { - "expr": "sum(qed_auditor_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "auditors", - "refId": "B" - }, - { - "expr": "sum(qed_monitor_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "monitors", - "refId": "C" - }, - { - "expr": "sum(qed_publisher_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "publishers", - "refId": "D" - }, - { - "expr": "sum(qed_store_instances_count)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "store", - "refId": "F" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Instances running", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "cacheTimeout": null, "colorBackground": false, @@ -174,7 +43,7 @@ "h": 5, "w": 4, "x": 12, - "y": 1 + "y": 0 }, "id": 4, "interval": null, @@ -255,7 +124,7 @@ "h": 5, "w": 4, "x": 16, - "y": 1 + "y": 0 }, "id": 41, "interval": null, @@ -334,7 +203,7 @@ "h": 5, "w": 4, "x": 20, - "y": 1 + "y": 0 }, "id": 20, "interval": null, @@ -398,8 +267,139 @@ "h": 1, "w": 24, "x": 0, + "y": 5 + }, + "id": 34, + "title": "General", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, "y": 6 }, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(qed_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "QEDs", + "refId": "E" + }, + { + "expr": "sum(qed_sender_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "senders", + "refId": "A" + }, + { + "expr": "sum(qed_auditor_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "auditors", + "refId": "B" + }, + { + "expr": "sum(qed_monitor_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "monitors", + "refId": "C" + }, + { + "expr": "sum(qed_publisher_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "publishers", + "refId": "D" + }, + { + "expr": "sum(qed_store_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "store", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Instances running", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, "id": 32, "title": "Servers", "type": "row" @@ -414,7 +414,7 @@ "h": 9, "w": 12, "x": 0, - "y": 7 + "y": 12 }, "id": 2, "legend": { @@ -500,7 +500,7 @@ "h": 9, "w": 12, "x": 12, - "y": 7 + "y": 12 }, "id": 10, "legend": { @@ -584,7 +584,7 @@ "h": 9, "w": 12, "x": 0, - "y": 16 + "y": 21 }, "id": 12, "legend": { @@ -668,7 +668,7 @@ "h": 9, "w": 12, "x": 12, - "y": 16 + "y": 21 }, "id": 19, "legend": { @@ -748,7 +748,7 @@ "h": 1, "w": 24, "x": 0, - "y": 25 + "y": 30 }, "id": 30, "title": "Agents", @@ -775,7 +775,7 @@ "h": 5, "w": 6, "x": 0, - "y": 26 + "y": 31 }, "id": 38, "interval": null, @@ -833,6 +833,86 @@ ], "valueName": "delta" }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "description": "Auditor failed get membership proof", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 6, + "y": 31 + }, + "id": 43, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(qed_auditor_get_membership_proof_err_total)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Auditor err", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, { "cacheTimeout": null, "colorBackground": false, @@ -854,7 +934,7 @@ "h": 5, "w": 6, "x": 9, - "y": 26 + "y": 31 }, "id": 37, "interval": null, @@ -933,7 +1013,7 @@ "h": 5, "w": 6, "x": 17, - "y": 26 + "y": 31 }, "id": 39, "interval": null, @@ -1001,7 +1081,7 @@ "h": 9, "w": 12, "x": 0, - "y": 31 + "y": 36 }, "id": 17, "legend": { @@ -1110,7 +1190,7 @@ "h": 9, "w": 12, "x": 12, - "y": 31 + "y": 36 }, "id": 18, "legend": { @@ -1204,7 +1284,7 @@ "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 45 }, "id": 28, "title": "Store", @@ -1231,7 +1311,7 @@ "h": 8, "w": 8, "x": 0, - "y": 41 + "y": 46 }, "id": 25, "interval": null, @@ -1311,7 +1391,7 @@ "h": 8, "w": 8, "x": 8, - "y": 41 + "y": 46 }, "id": 26, "interval": null, @@ -1391,7 +1471,7 @@ "h": 8, "w": 8, "x": 16, - "y": 41 + "y": 46 }, "id": 35, "interval": null, diff --git a/gossip/auditor/auditor.go b/gossip/auditor/auditor.go index b5af15762..0d3ee656b 100644 --- a/gossip/auditor/auditor.go +++ b/gossip/auditor/auditor.go @@ -179,8 +179,8 @@ func (t MembershipTask) Do() { proof, err := t.qed.MembershipDigest(t.s.Snapshot.EventDigest, t.s.Snapshot.Version) if err != nil { // retry - t.sendAlert(fmt.Sprintf("Unable to verify snapshot %v", t.s.Snapshot)) - log.Infof("Error executing membership query: %v", err) + metrics.QedAuditorGetMembershipProofErrTotal.Inc() + log.Infof("Unable to get membership proof from QED server: %s", err.Error()) return } diff --git a/gossip/metrics/metrics.go b/gossip/metrics/metrics.go index c7e1a488f..4d7c66593 100644 --- a/gossip/metrics/metrics.go +++ b/gossip/metrics/metrics.go @@ -89,6 +89,13 @@ var ( }, ) + QedAuditorGetMembershipProofErrTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_auditor_get_membership_proof_err_total", + Help: "Number of errors trying to get membership proofs by auditors.", + }, + ) + metricsList = []prometheus.Collector{ QedAuditorInstancesCount, QedMonitorInstancesCount, @@ -101,6 +108,8 @@ var ( QedAuditorBatchesProcessSeconds, QedMonitorBatchesProcessSeconds, QedPublisherBatchesProcessSeconds, + + QedAuditorGetMembershipProofErrTotal, } )