From 18d2a35d4acd356058687824bf56b15b97c1814d Mon Sep 17 00:00:00 2001
From: mabasian <54101509+mabasian@users.noreply.github.com>
Date: Tue, 3 Sep 2024 15:35:27 +0200
Subject: [PATCH] ADD: CSM Alerts and notif (#2028)

* FIX: frontend csm added to alert boxes

* ADD: the fetch csm to monitoring

* ADD: backend csm func

* ADD: node alert csm fetched

* ADD: CSM to the control alert
---
 launcher/src/backend/Monitoring.js            | 195 ++++++++++++++----
 launcher/src/background.js                    |   4 +
 .../node-page/components/alert/NodeAlert.vue  |  93 ++++++++-
 .../UI/the-control/ControlAlert.vue           |  77 +++++++
 launcher/src/languages/en.json                |   3 +-
 launcher/src/store/ControlService.js          |   3 +
 launcher/src/store/services.js                |  10 +-
 7 files changed, 338 insertions(+), 47 deletions(-)

diff --git a/launcher/src/backend/Monitoring.js b/launcher/src/backend/Monitoring.js
index 527512a749..2ace566b06 100755
--- a/launcher/src/backend/Monitoring.js
+++ b/launcher/src/backend/Monitoring.js
@@ -436,11 +436,11 @@ export class Monitoring {
     var query =
       rpc_method.trim().indexOf("{") < 0
         ? JSON.stringify({
-          jsonrpc: "2.0",
-          method: rpc_method.trim(),
-          params: rpc_params,
-          id: 1,
-        })
+            jsonrpc: "2.0",
+            method: rpc_method.trim(),
+            params: rpc_params,
+            id: 1,
+          })
         : rpc_method;

     // Define default response
@@ -2623,8 +2623,8 @@ export class Monitoring {
     const addr_type = Array.isArray(addr)
       ? "arr"
       : typeof addr === "string" && ["public", "local"].includes(addr)
-      ? "str"
-      : "invalid";
+        ? "str"
+        : "invalid";
     addr = addr_type == "str" ? addr.toLowerCase().trim() : addr;
     if (addr_type == "invalid") {
       return {
@@ -2712,7 +2712,7 @@ export class Monitoring {
     for (let i = 0; i < serviceInfos.length; i++) {
       const hashDependencies =
         serviceInfos[i].config.dependencies.consensusClients.length ||
-          serviceInfos[i].config.dependencies.executionClients.length
+        serviceInfos[i].config.dependencies.executionClients.length
           ? "yes"
           : "no";
       easyInfos.push({
@@ -3249,8 +3249,9 @@ rm -rf diskoutput
       const parsedJson = JSON.parse(stdoutJson);

       let message =
-        `${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${parsedJson?.stacktraces || ""
-        }`.trim() || output;
+        `${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${
+          parsedJson?.stacktraces || ""
+        }`.trim() || output;

       return {
         pubkey: pubkey,
@@ -3340,6 +3341,7 @@ rm -rf diskoutput
       ];
     }
   }
+
   /**
    * Will gather metrics from Prometheus and evaluate.
    * If thresholds are exceeded, an alert will be generated and added to the returned array.
@@ -3353,100 +3355,217 @@ rm -rf diskoutput
       }
       const queries = {
         app_monitoring_readyz: "max((app_monitoring_readyz)) by (cluster_name, cluster_hash, cluster_peer)",
-        cluster_missed_attestations: "max(increase(core_tracker_failed_duties_total[10m])) by (cluster_hash, cluster_name)",
-        cluster_failure_rate: "floor(100 * (max(increase(core_tracker_success_duties_total[15m])) by (cluster_hash, cluster_name) / max(increase(core_tracker_expect_duties_total[15m])) by (cluster_hash, cluster_name)))",
-        percentage_failed_sync_message_duty: "(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer)\n) \n/ \n(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n + \n sum(increase(core_bcast_broadcast_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n)",
+        cluster_missed_attestations:
+          "max(increase(core_tracker_failed_duties_total[10m])) by (cluster_hash, cluster_name)",
+        cluster_failure_rate:
+          "floor(100 * (max(increase(core_tracker_success_duties_total[15m])) by (cluster_hash, cluster_name) / max(increase(core_tracker_expect_duties_total[15m])) by (cluster_hash, cluster_name)))",
+        percentage_failed_sync_message_duty:
+          "(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer)\n) \n/ \n(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n + \n sum(increase(core_bcast_broadcast_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n)",
         connected_relays: "group (p2p_relay_connections) by (cluster_peer)",
         peer_ping_latency: "histogram_quantile(0.90, sum(rate(p2p_ping_latency_secs_bucket[2m])) by (le,peer))",
-      }
+      };

       const queryPromises = Object.entries(queries).map(([key, query]) => {
-        return this.queryPrometheus(encodeURIComponent(query)).then(result => ({ key, result }));
+        return this.queryPrometheus(encodeURIComponent(query)).then((result) => ({ key, result }));
       });

       const results = await Promise.all(queryPromises);

-      let alerts = results.map((metric) => {
-        if (metric.result.status != "success") {
-          return;
-        }
-        if (metric.key === "peer_ping_latency") {
-          let value = Math.max(...metric.result.data.result.map((r) => r.value[1]));
+      let alerts = results
+        .map((metric) => {
+          if (metric.result.status != "success") {
+            return;
+          }
+          if (metric.key === "peer_ping_latency") {
+            let value = Math.max(...metric.result.data.result.map((r) => r.value[1]));
+            return this.parseObolCharonAlerts(metric.key, value);
+          }
+          let value = metric.result.data.result[0].value[1];
           return this.parseObolCharonAlerts(metric.key, value);
-        }
-        let value = metric.result.data.result[0].value[1];
-        return this.parseObolCharonAlerts(metric.key, value);
-      }).filter((alert) => alert);
+        })
+        .filter((alert) => alert);

       return alerts;
-
     } catch (error) {
       log.error("Fetching Obol Charon Alerts Failed:\n" + error);
-      return []
+      return [];
     }
   }

   parseObolCharonAlerts(key, value) {
-    value = 0
+    value = 0;
     //app_monitoring_readyz
     if (key === "app_monitoring_readyz") {
       switch (value) {
         case 0:
           return {
             name: "Cluster in Unknown Status",
-            level: "warning"
+            level: "warning",
           };
         case 2:
           return {
             name: "Beacon Node Down",
-            level: "critical"
+            level: "critical",
           };
         case 4:
           return {
             name: "Cluster Insufficient Peers",
-            level: "warning"
+            level: "warning",
           };
         case 6:
           return {
             name: "Cluster Missing Validators",
-            level: "critical"
+            level: "critical",
           };
         case 7:
           return {
             name: "Beacon Node Zero Peers",
-            level: "critical"
+            level: "critical",
           };
       }
     }
     if (key === "cluster_missed_attestations" && value > 0) {
       return {
         name: "Cluster Missed Attestations",
-        level: "critical"
+        level: "critical",
       };
     }
     if (key === "cluster_failure_rate" && value < 95) {
       return {
         name: "Cluster Failure Rate",
-        level: "critical"
+        level: "critical",
       };
     }
     if (key === "percentage_failed_sync_message_duty" && value > 0.1) {
       return {
         name: "Failed Sync Msg Duty",
-        level: "critical"
+        level: "critical",
       };
     }
     if (key === "connected_relays" && value < 1) {
       return {
         name: "Num. Connected Relays",
-        level: "warning"
+        level: "warning",
       };
     }
     if (key === "peer_ping_latency" && value > 0.4) {
       return {
         name: "Peer Ping Latency",
-        level: "warning"
+        level: "warning",
       };
     }
   }
+
+  /**
+   * Will gather metrics from Prometheus and evaluate.
+   * If thresholds are exceeded, an alert will be generated and added to the returned array.
+   * @returns {Object[]} Array of alerts e.g. [{name: "Cluster in Unknown Status", level: "warning"}, {name: "Beacon Node Down", level: "critical"}]
+   */
+  async fetchCsmAlerts() {
+    try {
+      const serviceInfos = await this.getServiceInfos("LCOMService");
+      if (serviceInfos.length < 1) {
+        return [];
+      }
+
+      const queries = {
+        lcoms_initial_slashing_submitted: "lcoms_initial_slashing_submitted",
+        lcoms_withdrawal_submitted: "lcoms_withdrawal_submitted",
+        lcoms_stealing_penalty: "lcoms_stealing_penalty",
+        lcoms_stealing_penalty_stolenAmount: "lcoms_stealing_penalty_stolenAmount",
+        lcoms_exit_request: "lcoms_exit_request",
+        lcoms_exit_request_timestamp: "lcoms_exit_request_timestamp",
+        lcoms_fee_to_distribute: "lcoms_fee_to_distribute",
+        lcoms_node_operator_status: "lcoms_node_operator_status",
+        lcoms_current_bond: "lcoms_current_bond",
+        lcoms_required_bond: "lcoms_required_bond",
+      };
+
+      const queryPromises = Object.entries(queries).map(([key, query]) => {
+        return this.queryPrometheus(encodeURIComponent(query)).then((result) => ({ key, result }));
+      });
+
+      const results = await Promise.all(queryPromises);
+
+      let currentBond = null;
+      let requiredBond = null;
+
+      let alerts = results
+        .map((metric) => {
+          if (metric.result.status !== "success") {
+            return [];
+          }
+
+          const value = parseFloat(metric.result.data.result[0].value[1]);
+
+          if (metric.key === "lcoms_current_bond") {
+            currentBond = value;
+          } else if (metric.key === "lcoms_required_bond") {
+            requiredBond = value;
+          }
+
+          return this.parseCsmAlerts(metric.key, value);
+        })
+        .filter((alert) => alert !== null);
+
+      if (currentBond !== null && requiredBond !== null) {
+        const bondDifference = currentBond - requiredBond;
+        const bondAlert = this.parseCsmAlerts("bond_difference", bondDifference);
+        if (bondAlert) {
+          alerts.push(bondAlert);
+        }
+      }
+
+      return alerts;
+    } catch (error) {
+      log.error("Fetching CSM Alerts Failed:\n" + error);
+      return [];
+    }
+  }
+
+  parseCsmAlerts(key, value) {
+    if (key === "lcoms_initial_slashing_submitted" && value > 0) {
+      return {
+        name: "slashing event",
+        level: "critical",
+      };
+    }
+    if (key === "lcoms_withdrawal_submitted" && value > 0) {
+      return {
+        name: "withdrawal submitted",
+        level: "notification",
+      };
+    }
+    if (key === "lcoms_stealing_penalty" && value > 0) {
+      return {
+        name: "EL stealing penalty",
+        level: "critical",
+      };
+    }
+    if (key === "lcoms_exit_request" && value > 0) {
+      return {
+        name: "exit request",
+        level: "critical",
+      };
+    }
+    if (key === "lcoms_fee_to_distribute" && value > 0) {
+      return {
+        name: "none-claimed rewards",
+        level: "notification",
+      };
+    }
+    if (key === "lcoms_node_operator_status" && value < 1) {
+      return {
+        name: "node operator inactive",
+        level: "critical",
+      };
+    }
+    if (key === "bond_difference" && value < 0) {
+      return {
+        name: "Insufficient Bond",
+        level: "critical",
+      };
+    }
+    return [];
+  }
 }
diff --git a/launcher/src/background.js b/launcher/src/background.js
index 9e606f6334..8dcfcd53c8 100755
--- a/launcher/src/background.js
+++ b/launcher/src/background.js
@@ -745,6 +745,10 @@ ipcMain.handle("fetchObolCharonAlerts", async () => {
   return await monitoring.fetchObolCharonAlerts();
 });

+ipcMain.handle("fetchCsmAlerts", async () => {
+  return await monitoring.fetchCsmAlerts();
+});
+
 // Scheme must be registered before the app is ready
 protocol.registerSchemesAsPrivileged([{ scheme: "app", privileges: { secure: true, standard: true } }]);

diff --git a/launcher/src/components/UI/node-page/components/alert/NodeAlert.vue b/launcher/src/components/UI/node-page/components/alert/NodeAlert.vue
index 03953c4544..04d1fca8e5 100755
--- a/launcher/src/components/UI/node-page/components/alert/NodeAlert.vue
+++ b/launcher/src/components/UI/node-page/components/alert/NodeAlert.vue
@@ -159,6 +159,55 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
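Note on the data shape the new backend code relies on: both fetchObolCharonAlerts and the new fetchCsmAlerts read metric values out of the standard Prometheus instant-query response, which is why fetchCsmAlerts indexes data.result[0].value[1] and wraps it in parseFloat (Prometheus returns sample values as strings). The sketch below shows that assumed response shape with a fabricated payload; it is an illustration, not output from a real node, and it assumes queryPrometheus resolves with the parsed JSON body.

// Fabricated example of a Prometheus instant-query response as consumed by fetchCsmAlerts.
const exampleResult = {
  status: "success",
  data: {
    resultType: "vector",
    result: [
      {
        metric: { __name__: "lcoms_exit_request" },
        value: [1725370000, "1"], // [unix timestamp, sample value as a string]
      },
    ],
  },
};

// Mirrors the extraction done inside the map() in fetchCsmAlerts:
const value = parseFloat(exampleResult.data.result[0].value[1]); // -> 1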
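The bond alert is the one case where two metrics are combined: while mapping over the query results, fetchCsmAlerts remembers lcoms_current_bond and lcoms_required_bond and only afterwards feeds their difference into parseCsmAlerts under the synthetic key "bond_difference". Below is a standalone illustration of that threshold logic, trimmed to two keys from the patch; the bond numbers are invented example values, not data from a real operator.

// Trimmed re-statement of two threshold rules from parseCsmAlerts, for illustration only.
function parseCsmAlerts(key, value) {
  if (key === "lcoms_stealing_penalty" && value > 0) {
    return { name: "EL stealing penalty", level: "critical" };
  }
  if (key === "bond_difference" && value < 0) {
    return { name: "Insufficient Bond", level: "critical" };
  }
  return []; // below threshold or unknown key
}

// Invented example values.
const currentBond = 12.4; // lcoms_current_bond
const requiredBond = 12.9; // lcoms_required_bond

console.log(parseCsmAlerts("lcoms_stealing_penalty", 0)); // [] -> no alert
console.log(parseCsmAlerts("lcoms_stealing_penalty", 2)); // { name: "EL stealing penalty", level: "critical" }
console.log(parseCsmAlerts("bond_difference", currentBond - requiredBond)); // { name: "Insufficient Bond", level: "critical" }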
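The diffstat also lists a three-line change to launcher/src/store/ControlService.js that is not included above; based on the ipcMain.handle("fetchCsmAlerts", ...) registration in background.js, the renderer side presumably gains a matching wrapper. The following is a minimal sketch of such a wrapper, assuming plain ipcRenderer.invoke is available in the renderer; the function name, import path, and the defensive filter are illustrative assumptions, not the actual file contents.

// Hypothetical renderer-side wrapper -- the real ControlService.js addition is not shown in this patch.
import { ipcRenderer } from "electron";

export async function fetchCsmAlerts() {
  // Invokes the channel registered with ipcMain.handle("fetchCsmAlerts", ...) in background.js
  // and resolves with the alert array produced by Monitoring.fetchCsmAlerts().
  const alerts = await ipcRenderer.invoke("fetchCsmAlerts");

  // Defensive cleanup: parseCsmAlerts returns [] for metrics below their thresholds and the
  // backend only filters out null, so empty placeholder entries may still be present.
  return alerts.filter((alert) => alert && alert.name);
}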