Skip to content

Commit

Permalink
ADD: CSM Alerts and notif (stereum-dev#2028)
Browse files Browse the repository at this point in the history
* FIX: frontend csm added to alert boxes

* ADD: the fetch csm to monitoring

* ADD: backend csm func

* ADD: node alert csm fetched

* ADD: CSM to the control alert
  • Loading branch information
mabasian authored Sep 3, 2024
1 parent f17d1a9 commit 18d2a35
Show file tree
Hide file tree
Showing 7 changed files with 338 additions and 47 deletions.
195 changes: 157 additions & 38 deletions launcher/src/backend/Monitoring.js
Original file line number Diff line number Diff line change
Expand Up @@ -436,11 +436,11 @@ export class Monitoring {
var query =
rpc_method.trim().indexOf("{") < 0
? JSON.stringify({
jsonrpc: "2.0",
method: rpc_method.trim(),
params: rpc_params,
id: 1,
})
jsonrpc: "2.0",
method: rpc_method.trim(),
params: rpc_params,
id: 1,
})
: rpc_method;

// Define default response
Expand Down Expand Up @@ -2623,8 +2623,8 @@ export class Monitoring {
const addr_type = Array.isArray(addr)
? "arr"
: typeof addr === "string" && ["public", "local"].includes(addr)
? "str"
: "invalid";
? "str"
: "invalid";
addr = addr_type == "str" ? addr.toLowerCase().trim() : addr;
if (addr_type == "invalid") {
return {
Expand Down Expand Up @@ -2712,7 +2712,7 @@ export class Monitoring {
for (let i = 0; i < serviceInfos.length; i++) {
const hashDependencies =
serviceInfos[i].config.dependencies.consensusClients.length ||
serviceInfos[i].config.dependencies.executionClients.length
serviceInfos[i].config.dependencies.executionClients.length
? "yes"
: "no";
easyInfos.push({
Expand Down Expand Up @@ -3249,8 +3249,9 @@ rm -rf diskoutput
const parsedJson = JSON.parse(stdoutJson);

let message =
`${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${parsedJson?.stacktraces || ""
}`.trim() || output;
`${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${
parsedJson?.stacktraces || ""
}`.trim() || output;

return {
pubkey: pubkey,
Expand Down Expand Up @@ -3340,6 +3341,7 @@ rm -rf diskoutput
];
}
}

/**
* Will gather metrics from Prometheus and evaluate.
* If thresholds are exceeded, an alert will be generated and added to the returned array.
Expand All @@ -3353,100 +3355,217 @@ rm -rf diskoutput
}
const queries = {
app_monitoring_readyz: "max((app_monitoring_readyz)) by (cluster_name, cluster_hash, cluster_peer)",
cluster_missed_attestations: "max(increase(core_tracker_failed_duties_total[10m])) by (cluster_hash, cluster_name)",
cluster_failure_rate: "floor(100 * (max(increase(core_tracker_success_duties_total[15m])) by (cluster_hash, cluster_name) / max(increase(core_tracker_expect_duties_total[15m])) by (cluster_hash, cluster_name)))",
percentage_failed_sync_message_duty: "(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer)\n) \n/ \n(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n + \n sum(increase(core_bcast_broadcast_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n)",
cluster_missed_attestations:
"max(increase(core_tracker_failed_duties_total[10m])) by (cluster_hash, cluster_name)",
cluster_failure_rate:
"floor(100 * (max(increase(core_tracker_success_duties_total[15m])) by (cluster_hash, cluster_name) / max(increase(core_tracker_expect_duties_total[15m])) by (cluster_hash, cluster_name)))",
percentage_failed_sync_message_duty:
"(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer)\n) \n/ \n(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n + \n sum(increase(core_bcast_broadcast_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n)",
connected_relays: "group (p2p_relay_connections) by (cluster_peer)",
peer_ping_latency: "histogram_quantile(0.90, sum(rate(p2p_ping_latency_secs_bucket[2m])) by (le,peer))",
}
};

const queryPromises = Object.entries(queries).map(([key, query]) => {
return this.queryPrometheus(encodeURIComponent(query)).then(result => ({ key, result }));
return this.queryPrometheus(encodeURIComponent(query)).then((result) => ({ key, result }));
});

const results = await Promise.all(queryPromises);

let alerts = results.map((metric) => {
if (metric.result.status != "success") {
return;
}
if (metric.key === "peer_ping_latency") {
let value = Math.max(...metric.result.data.result.map((r) => r.value[1]));
let alerts = results
.map((metric) => {
if (metric.result.status != "success") {
return;
}
if (metric.key === "peer_ping_latency") {
let value = Math.max(...metric.result.data.result.map((r) => r.value[1]));
return this.parseObolCharonAlerts(metric.key, value);
}
let value = metric.result.data.result[0].value[1];
return this.parseObolCharonAlerts(metric.key, value);
}
let value = metric.result.data.result[0].value[1];
return this.parseObolCharonAlerts(metric.key, value);
}).filter((alert) => alert);
})
.filter((alert) => alert);

return alerts;

} catch (error) {
log.error("Fetching Obol Charon Alerts Failed:\n" + error);
return []
return [];
}
}

parseObolCharonAlerts(key, value) {
value = 0
value = 0;
//app_monitoring_readyz
if (key === "app_monitoring_readyz") {
switch (value) {
case 0:
return {
name: "Cluster in Unknown Status",
level: "warning"
level: "warning",
};
case 2:
return {
name: "Beacon Node Down",
level: "critical"
level: "critical",
};
case 4:
return {
name: "Cluster Insufficient Peers",
level: "warning"
level: "warning",
};
case 6:
return {
name: "Cluster Missing Validators",
level: "critical"
level: "critical",
};
case 7:
return {
name: "Beacon Node Zero Peers",
level: "critical"
level: "critical",
};
}
}
if (key === "cluster_missed_attestations" && value > 0) {
return {
name: "Cluster Missed Attestations",
level: "critical"
level: "critical",
};
}
if (key === "cluster_failure_rate" && value < 95) {
return {
name: "Cluster Failure Rate",
level: "critical"
level: "critical",
};
}
if (key === "percentage_failed_sync_message_duty" && value > 0.1) {
return {
name: "Failed Sync Msg Duty",
level: "critical"
level: "critical",
};
}
if (key === "connected_relays" && value < 1) {
return {
name: "Num. Connected Relays",
level: "warning"
level: "warning",
};
}
if (key === "peer_ping_latency" && value > 0.4) {
return {
name: "Peer Ping Latency",
level: "warning"
level: "warning",
};
}
}

/**
* Will gather metrics from Prometheus and evaluate.
* If thresholds are exceeded, an alert will be generated and added to the retuned array.
* @returns {Object[]} Array of alerts e.g. [{name: "Cluster in Unknown Status", level: "warning"}, {name: "Beacon Node Down", level: "critical"}]
*/
async fetchCsmAlerts() {
try {
const serviceInfos = await this.getServiceInfos("LCOMService");
if (serviceInfos.length < 1) {
return [];
}

const queries = {
lcoms_initial_slashing_submitted: "lcoms_initial_slashing_submitted",
lcoms_withdrawal_submitted: "lcoms_withdrawal_submitted",
lcoms_stealing_penalty: "lcoms_stealing_penalty",
lcoms_stealing_penalty_stolenAmount: "lcoms_stealing_penalty_stolenAmount",
lcoms_exit_request: "lcoms_exit_request",
lcoms_exit_request_timestamp: "lcoms_exit_request_timestamp",
lcoms_fee_to_distribute: "lcoms_fee_to_distribute",
lcoms_node_operator_status: "lcoms_node_operator_status",
lcoms_current_bond: "lcoms_current_bond",
lcoms_required_bond: "lcoms_required_bond",
};

const queryPromises = Object.entries(queries).map(([key, query]) => {
return this.queryPrometheus(encodeURIComponent(query)).then((result) => ({ key, result }));
});

const results = await Promise.all(queryPromises);

let currentBond = null;
let requiredBond = null;

let alerts = results
.map((metric) => {
if (metric.result.status !== "success") {
return [];
}

const value = parseFloat(metric.result.data.result[0].value[1]);

if (metric.key === "lcoms_current_bond") {
currentBond = value;
} else if (metric.key === "lcoms_required_bond") {
requiredBond = value;
}

return this.parseCsmAlerts(metric.key, value);
})
.filter((alert) => alert !== null);

if (currentBond !== null && requiredBond !== null) {
const bondDifference = currentBond - requiredBond;
const bondAlert = this.parseCsmAlerts("bond_difference", bondDifference);
if (bondAlert) {
alerts.push(bondAlert);
}
}

return alerts;
} catch (error) {
log.error("Fetching CSM Alerts Failed:\n" + error);
return [];
}
}

parseCsmAlerts(key, value) {
if (key === "lcoms_initial_slashing_submitted" && value > 0) {
return {
name: "slashing event",
level: "critical",
};
}
if (key === "lcoms_withdrawal_submitted" && value > 0) {
return {
name: "withdrawal submitted",
level: "notification",
};
}
if (key === "lcoms_stealing_penalty" && value > 0) {
return {
name: "EL stealing penalty",
level: "critical",
};
}
if (key === "lcoms_exit_request" && value > 0) {
return {
name: "exit request",
level: "critical",
};
}
if (key === "lcoms_fee_to_distribute" && value > 0) {
return {
name: "none-claimed rewards",
level: "notification",
};
}
if (key === "lcoms_node_operator_status" && value < 1) {
return {
name: "node operator inactive",
level: "critical",
};
}
if (key === "bond_difference" && value < 0) {
return {
name: "Insufficient Bond",
level: "critical",
};
}
return [];
}
}
4 changes: 4 additions & 0 deletions launcher/src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,10 @@ ipcMain.handle("fetchObolCharonAlerts", async () => {
return await monitoring.fetchObolCharonAlerts();
});

// IPC endpoint: hands the result of monitoring.fetchCsmAlerts() back to the invoking side.
ipcMain.handle("fetchCsmAlerts", async () => {
  const csmAlerts = await monitoring.fetchCsmAlerts();
  return csmAlerts;
});

// Scheme must be registered before the app is ready
protocol.registerSchemesAsPrivileged([{ scheme: "app", privileges: { secure: true, standard: true } }]);

Expand Down
Loading

0 comments on commit 18d2a35

Please sign in to comment.