Skip to content

Commit

Permalink
feat: add PagerDuty alert integration for the case when epoch registr…
Browse files Browse the repository at this point in the history
…ation fails after maximum retries. (#1367)
  • Loading branch information
sergeytimoshin authored Dec 3, 2024
1 parent 8815582 commit 30dfade
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 4 deletions.
5 changes: 5 additions & 0 deletions forester/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ pub struct StartArgs {
#[arg(long, env = "FORESTER_PUSH_GATEWAY_URL")]
pub push_gateway_url: Option<String>,

#[arg(long, env = "FORESTER_PAGERDUTY_ROUTING_KEY")]
pub pagerduty_routing_key: Option<String>,

#[arg(long, env = "FORESTER_WS_RPC_URL")]
pub ws_rpc_url: Option<String>,

Expand Down Expand Up @@ -117,6 +120,8 @@ pub struct StatusArgs {

#[arg(long, env = "FORESTER_PUSH_GATEWAY_URL")]
pub push_gateway_url: Option<String>,
#[arg(long, env = "FORESTER_PAGERDUTY_ROUTING_KEY")]
pub pagerduty_routing_key: Option<String>,
/// Select to run compressed token program tests.
#[clap(long)]
pub full: bool,
Expand Down
3 changes: 3 additions & 0 deletions forester/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub struct ExternalServicesConfig {
pub prover_url: Option<String>,
pub photon_api_key: Option<String>,
pub pushgateway_url: Option<String>,
pub pagerduty_routing_key: Option<String>,
}

#[derive(Debug, Clone, Copy)]
Expand Down Expand Up @@ -132,6 +133,7 @@ impl ForesterConfig {
prover_url: args.prover_url.clone(),
photon_api_key: args.photon_api_key.clone(),
pushgateway_url: args.push_gateway_url.clone(),
pagerduty_routing_key: args.pagerduty_routing_key.clone(),
},
retry_config: RetryConfig {
max_retries: args.max_retries,
Expand Down Expand Up @@ -179,6 +181,7 @@ impl ForesterConfig {
prover_url: None,
photon_api_key: None,
pushgateway_url: args.push_gateway_url.clone(),
pagerduty_routing_key: args.pagerduty_routing_key.clone(),
},
retry_config: RetryConfig::default(),
queue_config: QueueConfig::default(),
Expand Down
19 changes: 19 additions & 0 deletions forester/src/epoch_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::{ForesterConfig, ForesterEpochInfo};
use light_client::rpc_pool::SolanaRpcPool;

use crate::metrics::{push_metrics, queue_metric_update, update_forester_sol_balance};
use crate::pagerduty::send_pagerduty_alert;
use crate::tree_finder::TreeFinder;
use dashmap::DashMap;
use forester_utils::forester_epoch::{
Expand Down Expand Up @@ -476,6 +477,24 @@ impl<R: RpcConnection, I: Indexer<R>> EpochManager<R, I> {
if attempt < max_retries - 1 {
sleep(retry_delay).await;
} else {
// The PagerDuty routing key is optional configuration. Only attempt to page
// when it is set; unwrapping a missing key here would panic and the
// registration error below would never be returned to the caller.
if let Some(routing_key) = self
    .config
    .external_services
    .pagerduty_routing_key
    .as_deref()
{
    if let Err(alert_err) = send_pagerduty_alert(
        routing_key,
        &format!(
            "Forester failed to register for epoch {} after {} attempts",
            epoch, max_retries
        ),
        "critical",
        &format!("Forester {}", self.config.payer_keypair.pubkey()),
    )
    .await
    {
        // Alerting is best-effort: log the failure and carry on so the
        // registration error is still propagated.
        error!("Failed to send PagerDuty alert: {:?}", alert_err);
    }
}
return Err(e);
}
}
Expand Down
1 change: 1 addition & 0 deletions forester/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pub mod epoch_manager;
pub mod errors;
pub mod forester_status;
pub mod metrics;
pub mod pagerduty;
pub mod photon_indexer;
pub mod pubsub_client;
pub mod queue_helpers;
Expand Down
53 changes: 53 additions & 0 deletions forester/src/pagerduty.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
use reqwest::Client;
use serde::Serialize;
use std::time::Duration;

/// Top-level request body that `send_pagerduty_alert` serializes to JSON and
/// posts to the PagerDuty `v2/enqueue` events endpoint.
#[derive(Debug, Serialize)]
struct PagerDutyPayload {
/// Routing key, copied from the caller-supplied `routing_key` argument.
routing_key: String,
/// Event action; `send_pagerduty_alert` always sets this to `"trigger"`.
event_action: String,
/// Nested alert details (summary, severity, source).
payload: PagerDutyAlertPayload,
}

/// Alert details nested under the `payload` field of `PagerDutyPayload`.
#[derive(Debug, Serialize)]
struct PagerDutyAlertPayload {
/// Human-readable description of the alert, copied from the `summary` argument.
summary: String,
/// Severity label, copied verbatim from the `severity` argument (no validation is done here).
severity: String,
/// Identifier of the component raising the alert, copied from the `source` argument.
source: String,
}

/// Sends a `trigger` event to the PagerDuty Events API (`v2/enqueue`).
///
/// # Arguments
///
/// * `routing_key` - Integration routing key identifying the PagerDuty service.
/// * `summary` - Human-readable description of the problem.
/// * `severity` - Severity label forwarded verbatim (callers in this crate
///   pass `"critical"`). NOTE(review): PagerDuty presumably restricts this to
///   a fixed set of values; it is not validated here.
/// * `source` - Identifier of the component raising the alert.
///
/// # Errors
///
/// Returns an error if the HTTP client cannot be built, if the request fails
/// or times out, or if PagerDuty answers with a non-success status. In the
/// last case the error message carries the status code and the response body.
pub async fn send_pagerduty_alert(
    routing_key: &str,
    summary: &str,
    severity: &str,
    source: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    // Apply a 10-second request timeout on this client.
    let client = Client::builder().timeout(Duration::from_secs(10)).build()?;

    let payload = PagerDutyPayload {
        routing_key: routing_key.to_string(),
        event_action: "trigger".to_string(),
        payload: PagerDutyAlertPayload {
            summary: summary.to_string(),
            severity: severity.to_string(),
            source: source.to_string(),
        },
    };

    let response = client
        .post("https://events.pagerduty.com/v2/enqueue")
        .json(&payload)
        .send()
        .await?;

    // Capture the status before `text()` consumes the response.
    let status = response.status();
    if !status.is_success() {
        // If the body cannot be read, still report the HTTP status instead of
        // replacing the whole error with the body-read failure.
        let body = response
            .text()
            .await
            .unwrap_or_else(|err| format!("<failed to read response body: {}>", err));
        return Err(format!(
            "Failed to send PagerDuty alert. Status: {}, Body: {}",
            status, body
        )
        .into());
    }

    Ok(())
}
1 change: 1 addition & 0 deletions forester/tests/test_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ pub fn forester_config() -> ForesterConfig {
prover_url: Some("http://localhost:3001".to_string()),
photon_api_key: None,
pushgateway_url: None,
pagerduty_routing_key: None,
},
retry_config: Default::default(),
queue_config: Default::default(),
Expand Down
4 changes: 0 additions & 4 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 30dfade

Please sign in to comment.