Skip to content

Commit

Permalink
Add /live health endpoint to admin server
Browse files Browse the repository at this point in the history
Create a `Health` module which tracks if a panic occurs
anywhere in the code base (which may or may not be on the main thread),
and moves the system to unhealthy.

In the future we could add extra checks to this module as we discover
more things that impact proxy health.

Closes #73
  • Loading branch information
markmandel committed Mar 30, 2021
1 parent b5e088f commit 2445b15
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 6 deletions.
7 changes: 7 additions & 0 deletions docs/admin.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ admin:
The admin interface provides the following endpoints:
## /live
This provides a liveness probe endpoint, most commonly used in
[Kubernetes-based systems](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request).
It returns an HTTP status of 200 when all health checks pass, and 500 otherwise.
## /metrics
Outputs [Prometheus](https://prometheus.io/) formatted metrics for this proxy.
Expand Down
19 changes: 15 additions & 4 deletions src/proxy/admin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,41 @@ use hyper::{Body, Method, Request, Response, Server as HyperServer, StatusCode};
use slog::{error, info, o, Logger};
use tokio::sync::watch;

use crate::proxy::Metrics;
use crate::proxy::{Health, Metrics};

/// State for the admin HTTP server: its logger, the address it listens on,
/// and the shared metrics and health handles used to answer requests.
pub struct Admin {
/// Logger derived in `new` with the "source" key set to "proxy::Admin".
log: Logger,
/// The address that the Admin server starts on
addr: SocketAddr,
/// Shared metrics handle; `GET /metrics` is answered via `collect_metrics`.
metrics: Arc<Metrics>,
/// Shared health handle; `GET /live` is answered via `check_healthy`.
health: Arc<Health>,
}

impl Admin {
pub fn new(base: &Logger, addr: SocketAddr, metrics: Arc<Metrics>) -> Self {
/// Creates the admin server state.
///
/// * `base` - parent logger; a child logger tagged with
///   `"source" => "proxy::Admin"` is derived from it.
/// * `addr` - the address the admin server will listen on.
/// * `metrics` - shared metrics handle served on `/metrics`.
/// * `health` - health tracker served on `/live`; it is wrapped in an `Arc`
///   here so it can be cloned into each request handler.
pub fn new(base: &Logger, addr: SocketAddr, metrics: Arc<Metrics>, health: Health) -> Self {
    Admin {
        log: base.new(o!("source" => "proxy::Admin")),
        addr,
        metrics,
        health: Arc::new(health),
    }
}

pub fn run(&self, mut shutdown_rx: watch::Receiver<()>) {
info!(self.log, "Starting admin endpoint"; "address" => self.addr.to_string());

let metrics = self.metrics.clone();
let health = self.health.clone();
let make_svc = make_service_fn(move |_conn| {
let metrics = metrics.clone();
let health = health.clone();
async move {
let metrics = metrics.clone();
let health = health.clone();
Ok::<_, Infallible>(service_fn(move |req| {
let metrics = metrics.clone();
async move { Ok::<_, Infallible>(handle_request(req, metrics)) }
let health = health.clone();
async move { Ok::<_, Infallible>(handle_request(req, metrics, health)) }
}))
}
});
Expand All @@ -71,9 +77,14 @@ impl Admin {
}
}

fn handle_request(request: Request<Body>, metrics: Arc<Metrics>) -> Response<Body> {
fn handle_request(
request: Request<Body>,
metrics: Arc<Metrics>,
health: Arc<Health>,
) -> Response<Body> {
match (request.method(), request.uri().path()) {
(&Method::GET, "/metrics") => metrics.collect_metrics(),
(&Method::GET, "/live") => health.check_healthy(),
(_, _) => {
let mut response = Response::new(Body::empty());
*response.status_mut() = StatusCode::NOT_FOUND;
Expand Down
5 changes: 3 additions & 2 deletions src/proxy/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use crate::config::{
};
use crate::extensions::{default_registry, CreateFilterError, FilterChain, FilterRegistry};
use crate::proxy::server::metrics::Metrics as ProxyMetrics;
use crate::proxy::{Admin as ProxyAdmin, Metrics, Server};
use crate::proxy::{Admin as ProxyAdmin, Health, Metrics, Server};

pub(super) enum ValidatedSource {
Static {
Expand Down Expand Up @@ -120,7 +120,8 @@ impl From<Arc<Config>> for Builder<PendingValidation> {
fn from(config: Arc<Config>) -> Self {
let log = logger();
let metrics = Arc::new(Metrics::new(&log, Registry::default()));
let admin = ProxyAdmin::new(&log, config.admin.address, metrics.clone());
let health = Health::new(&log);
let admin = ProxyAdmin::new(&log, config.admin.address, metrics.clone(), health);
Builder {
config,
filter_registry: default_registry(&log),
Expand Down
83 changes: 83 additions & 0 deletions src/proxy/health.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright 2021 Google LLC All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::sync::atomic::AtomicBool;

use hyper::{Body, Response, StatusCode};
use slog::{error, o, Logger};
use std::panic;
use std::sync::atomic::Ordering::Relaxed;
use std::sync::Arc;

/// Tracks whether the proxy is considered healthy and builds the HTTP
/// response for the health check.
pub struct Health {
/// Logger derived in `new` with the "source" key set to "proxy::Health".
log: Logger,
/// Starts as `true`; set to `false` by the panic hook installed in `new`,
/// which holds a clone of this `Arc`.
healthy: Arc<AtomicBool>,
}

impl Health {
pub fn new(base: &Logger) -> Self {
let health = Self {
log: base.new(o!("source" => "proxy::Health")),
healthy: Arc::new(AtomicBool::new(true)),
};

let log = health.log.clone();
let healthy = health.healthy.clone();
let default_hook = panic::take_hook();
panic::set_hook(Box::new(move |panic_info| {
error!(log, "Panic has occurred. Moving to Unhealthy");
healthy.swap(false, Relaxed);
default_hook(panic_info);
}));

health
}

/// returns a HTTP 200 response if the proxy is healthy.
pub fn check_healthy(&self) -> Response<Body> {
if self.healthy.load(Relaxed) {
return Response::new("ok".into());
};

let mut response = Response::new(Body::empty());
*response.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
response
}
}

#[cfg(test)]
mod tests {
    use crate::proxy::health::Health;
    use crate::test_utils::logger;
    use hyper::StatusCode;
    use std::panic;

    /// The health check reports 200 until a panic occurs, and 500 afterwards.
    #[test]
    fn panic_hook() {
        let base_log = logger();
        let tracker = Health::new(&base_log);

        // Nothing has panicked yet, so the check must pass.
        assert_eq!(tracker.check_healthy().status(), StatusCode::OK);

        // Trigger a panic and contain it so the test itself keeps running;
        // the panic hook still fires.
        let _ = panic::catch_unwind(|| {
            panic!("oh no!");
        });

        assert_eq!(
            tracker.check_healthy().status(),
            StatusCode::INTERNAL_SERVER_ERROR
        );
    }
}
2 changes: 2 additions & 0 deletions src/proxy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@

pub(crate) use admin::Admin;
pub use builder::{logger, Builder, PendingValidation, Validated};
pub(crate) use health::Health;
pub(crate) use metrics::Metrics;
pub use server::Server;

mod admin;
mod builder;
mod health;
mod metrics;
mod server;
mod sessions;
56 changes: 56 additions & 0 deletions tests/health.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright 2021 Google LLC All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#[cfg(test)]
mod tests {
    use quilkin::config::{Admin, Builder, EndPoint};
    use quilkin::proxy::Builder as ProxyBuilder;
    use quilkin::test_utils::TestHelper;
    use std::panic;
    use std::sync::Arc;

    /// Runs a proxy with the admin server on port 9093 and checks that
    /// `/live` answers "ok" before a panic and a server error after one.
    #[tokio::test]
    async fn health_server() {
        let mut helper = TestHelper::default();

        // Proxy configuration: one static endpoint plus the admin server.
        let proxy_port = 12349;
        let endpoints = vec![EndPoint::new("127.0.0.1:0".parse().unwrap())];
        let admin = Admin {
            address: "[::]:9093".parse().unwrap(),
        };
        let config = Builder::empty()
            .with_port(proxy_port)
            .with_static(vec![], endpoints)
            .with_admin(admin)
            .build();
        helper.run_server_with_builder(ProxyBuilder::from(Arc::new(config)));

        // Before any panic the liveness endpoint reports "ok".
        let first = reqwest::get("http://localhost:9093/live").await.unwrap();
        let body = first.text().await.unwrap();
        assert_eq!("ok", body);

        // Trigger a panic and contain it; the panic hook still fires.
        let _ = panic::catch_unwind(|| {
            panic!("oh no!");
        });

        // After the panic the same endpoint must report a 5xx status.
        let second = reqwest::get("http://localhost:9093/live").await.unwrap();
        assert!(second.status().is_server_error(), "Should be unhealthy");
    }
}

0 comments on commit 2445b15

Please sign in to comment.