diff --git a/crates/sui-core/src/traffic_controller/metrics.rs b/crates/sui-core/src/traffic_controller/metrics.rs index b9264b3163c9e3..1762f69c08fc78 100644 --- a/crates/sui-core/src/traffic_controller/metrics.rs +++ b/crates/sui-core/src/traffic_controller/metrics.rs @@ -18,6 +18,7 @@ pub struct TrafficControllerMetrics { pub num_dry_run_blocked_requests: IntCounter, pub tally_handled: IntCounter, pub error_tally_handled: IntCounter, + pub deadmans_switch_enabled: IntGauge, } impl TrafficControllerMetrics { @@ -85,6 +86,13 @@ impl TrafficControllerMetrics { registry ) .unwrap(), + deadmans_switch_enabled: register_int_gauge_with_registry!( + "deadmans_switch_enabled", + "If 1, the deadman's switch is enabled and all traffic control + should be getting bypassed", + registry + ) + .unwrap(), } } diff --git a/crates/sui-core/src/traffic_controller/mod.rs b/crates/sui-core/src/traffic_controller/mod.rs index 933bb1b68f6c89..a880ce0d9dcee3 100644 --- a/crates/sui-core/src/traffic_controller/mod.rs +++ b/crates/sui-core/src/traffic_controller/mod.rs @@ -80,6 +80,9 @@ impl TrafficController { .as_ref() .map(|config| config.drain_path.exists()) .unwrap_or(false); + metrics + .deadmans_switch_enabled + .set(mem_drainfile_present as i64); let ret = Self { tally_channel: tx, @@ -90,15 +93,15 @@ impl TrafficController { metrics: metrics.clone(), dry_run_mode: policy_config.dry_run, }; - let blocklists = ret.blocklists.clone(); spawn_monitored_task!(run_tally_loop( rx, policy_config, fw_config, - blocklists, - metrics, + ret.blocklists.clone(), + metrics.clone(), mem_drainfile_present, )); + spawn_monitored_task!(run_clear_blocklists_loop(ret.blocklists.clone(), metrics)); ret } @@ -206,6 +209,29 @@ impl TrafficController { } } +/// Although we clear IPs from the blocklist lazily when they are checked, +/// it's possible that over time we may accumulate a large number of stale +/// IPs in the blocklist for clients that are added, then once blocked, +/// never checked again. This function runs periodically to clear out any +/// such stale IPs. This also ensures that the blocklist length metric +/// accurately reflects TTL. +async fn run_clear_blocklists_loop(blocklists: Blocklists, metrics: Arc) { + loop { + tokio::time::sleep(Duration::from_secs(3)).await; + let now = SystemTime::now(); + blocklists.clients.retain(|_, expiration| now < *expiration); + blocklists + .proxied_clients + .retain(|_, expiration| now < *expiration); + metrics + .connection_ip_blocklist_len + .set(blocklists.clients.len() as i64); + metrics + .proxy_ip_blocklist_len + .set(blocklists.proxied_clients.len() as i64); + } +} + async fn run_tally_loop( mut receiver: mpsc::Receiver, policy_config: PolicyConfig, @@ -279,6 +305,7 @@ async fn run_tally_loop( warn!("Draining Node firewall."); File::create(&fw_config.drain_path) .expect("Failed to touch nodefw drain file"); + metrics.deadmans_switch_enabled.set(1); } } }