From 8d571bb6f5ae51f87592b7dea79f9ba3ba7beb67 Mon Sep 17 00:00:00 2001 From: William Smith Date: Mon, 24 Jun 2024 18:31:01 -0700 Subject: [PATCH] [TrafficControl] Make blocklist clearing periodic; Metrics improvements --- .../src/traffic_controller/metrics.rs | 8 +++++ crates/sui-core/src/traffic_controller/mod.rs | 35 +++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/crates/sui-core/src/traffic_controller/metrics.rs b/crates/sui-core/src/traffic_controller/metrics.rs index b9264b3163c9e3..1762f69c08fc78 100644 --- a/crates/sui-core/src/traffic_controller/metrics.rs +++ b/crates/sui-core/src/traffic_controller/metrics.rs @@ -18,6 +18,7 @@ pub struct TrafficControllerMetrics { pub num_dry_run_blocked_requests: IntCounter, pub tally_handled: IntCounter, pub error_tally_handled: IntCounter, + pub deadmans_switch_enabled: IntGauge, } impl TrafficControllerMetrics { @@ -85,6 +86,13 @@ impl TrafficControllerMetrics { registry ) .unwrap(), + deadmans_switch_enabled: register_int_gauge_with_registry!( + "deadmans_switch_enabled", + "If 1, the deadman's switch is enabled and all traffic control + should be getting bypassed", + registry + ) + .unwrap(), } } diff --git a/crates/sui-core/src/traffic_controller/mod.rs b/crates/sui-core/src/traffic_controller/mod.rs index 933bb1b68f6c89..6db5a8d5f0dbbb 100644 --- a/crates/sui-core/src/traffic_controller/mod.rs +++ b/crates/sui-core/src/traffic_controller/mod.rs @@ -80,6 +80,9 @@ impl TrafficController { .as_ref() .map(|config| config.drain_path.exists()) .unwrap_or(false); + metrics + .deadmans_switch_enabled + .set(mem_drainfile_present as i64); let ret = Self { tally_channel: tx, @@ -90,15 +93,17 @@ impl TrafficController { metrics: metrics.clone(), dry_run_mode: policy_config.dry_run, }; - let blocklists = ret.blocklists.clone(); + let tally_loop_blocklists = ret.blocklists.clone(); + let clear_loop_blocklists = ret.blocklists.clone(); spawn_monitored_task!(run_tally_loop( rx, policy_config, fw_config, - blocklists, - metrics, + tally_loop_blocklists, + metrics.clone(), mem_drainfile_present, )); + spawn_monitored_task!(run_clear_blocklists_loop(clear_loop_blocklists, metrics)); ret } @@ -206,6 +211,29 @@ impl TrafficController { } } +/// Although we clear IPs from the blocklist lazily when they are checked, +/// it's possible that over time we may accumulate a large number of stale +/// IPs in the blocklist for clients that are added, then once blocked, +/// never checked again. This function runs periodically to clear out any +/// such stale IPs. This also ensures that the blocklist length metric +/// accurately reflects TTL. +async fn run_clear_blocklists_loop(blocklists: Blocklists, metrics: Arc) { + loop { + tokio::time::sleep(Duration::from_secs(3)).await; + let now = SystemTime::now(); + blocklists.clients.retain(|_, expiration| now < *expiration); + blocklists + .proxied_clients + .retain(|_, expiration| now < *expiration); + metrics + .connection_ip_blocklist_len + .set(blocklists.clients.len() as i64); + metrics + .proxy_ip_blocklist_len + .set(blocklists.proxied_clients.len() as i64); + } +} + async fn run_tally_loop( mut receiver: mpsc::Receiver, policy_config: PolicyConfig, @@ -279,6 +307,7 @@ async fn run_tally_loop( warn!("Draining Node firewall."); File::create(&fw_config.drain_path) .expect("Failed to touch nodefw drain file"); + metrics.deadmans_switch_enabled.set(1); } } }