Skip to content

Commit

Permalink
[TrafficControl] Make blocklist clearing periodic; Metrics improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
williampsmith committed Jun 25, 2024
1 parent 25335da commit 09a5fa5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
8 changes: 8 additions & 0 deletions crates/sui-core/src/traffic_controller/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub struct TrafficControllerMetrics {
pub num_dry_run_blocked_requests: IntCounter,
pub tally_handled: IntCounter,
pub error_tally_handled: IntCounter,
pub deadmans_switch_enabled: IntGauge,
}

impl TrafficControllerMetrics {
Expand Down Expand Up @@ -85,6 +86,13 @@ impl TrafficControllerMetrics {
registry
)
.unwrap(),
deadmans_switch_enabled: register_int_gauge_with_registry!(
"deadmans_switch_enabled",
"If 1, the deadman's switch is enabled and all traffic control
should be getting bypassed",
registry
)
.unwrap(),
}
}

Expand Down
31 changes: 31 additions & 0 deletions crates/sui-core/src/traffic_controller/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ impl TrafficController {
.as_ref()
.map(|config| config.drain_path.exists())
.unwrap_or(false);
metrics
.deadmans_switch_enabled
.set(mem_drainfile_present as i64);

let ret = Self {
tally_channel: tx,
Expand All @@ -99,6 +102,10 @@ impl TrafficController {
metrics,
mem_drainfile_present,
));
spawn_monitored_task!(run_clear_blocklists_loop(
ret.blocklists.clone(),
metrics.clone(),
));
ret
}

Expand Down Expand Up @@ -206,6 +213,29 @@ impl TrafficController {
}
}

/// Background sweeper that periodically evicts expired entries from both
/// blocklists and republishes their sizes as metrics.
///
/// Blocklist entries are otherwise only removed lazily, at the moment a client
/// is checked. A client that is blocked and then never seen again would
/// therefore linger indefinitely, so stale entries could pile up over time.
/// Sweeping on a timer bounds that growth and keeps the blocklist length
/// gauges in line with the entries' TTLs.
///
/// Each pass waits three seconds, then keeps only the entries whose
/// expiration time is strictly later than the current system time. The loop
/// has no exit condition, so this future never completes on its own.
async fn run_clear_blocklists_loop(blocklists: Blocklists, metrics: Arc<TrafficControllerMetrics>) {
    let sweep_period = Duration::from_secs(3);
    loop {
        tokio::time::sleep(sweep_period).await;

        // Take one timestamp per pass so both lists are judged against the
        // same instant.
        let sweep_time = SystemTime::now();
        blocklists
            .clients
            .retain(|_, expires_at| *expires_at > sweep_time);
        blocklists
            .proxied_clients
            .retain(|_, expires_at| *expires_at > sweep_time);

        // Publish the post-sweep sizes.
        let connection_entries = blocklists.clients.len() as i64;
        metrics.connection_ip_blocklist_len.set(connection_entries);
        let proxy_entries = blocklists.proxied_clients.len() as i64;
        metrics.proxy_ip_blocklist_len.set(proxy_entries);
    }
}

async fn run_tally_loop(
mut receiver: mpsc::Receiver<TrafficTally>,
policy_config: PolicyConfig,
Expand Down Expand Up @@ -279,6 +309,7 @@ async fn run_tally_loop(
warn!("Draining Node firewall.");
File::create(&fw_config.drain_path)
.expect("Failed to touch nodefw drain file");
metrics.deadmans_switch_enabled.set(1);
}
}
}
Expand Down

0 comments on commit 09a5fa5

Please sign in to comment.