Skip to content

Commit

Permalink
inbound: Improve policy metrics (#1237)
Browse files Browse the repository at this point in the history
We recently introduced metrics to help surface inbound policy decisions,
but in practice these haven't been as useful as we might hope.
Specifically, error metrics do not include the `target_addr` label so
these metrics can't be correlated with servers, etc. This change
improves error metrics and also introduces new metrics to describe
authorization decisions: authorization denials shouldn't be classified
as errors, really, anyway.

This change also improves TCP forwarding authorization so that policy
changes can be honored at runtime: previously authorized connections may
dropped if the policy is updated so that the connection is no longer
authorized.

The gateway is also updated to enforce HTTP policies at runtime as well
so that policy changes can be honored after the connection has been
established.

This change introduces new metrics:

* `inbound_http_authz_allow_total`
* `inbound_http_authz_deny_total`
* `inbound_tcp_authz_allow_total`
* `inbound_tcp_authz_deny_total`
* `inbound_tcp_authz_terminate_total`

_allow_ metrics include `target_addr`, `srv_name`, and `saz_name`
labels. _deny_ and _terminate_ metics include only `target_addr` and
`srv_name` labels.

Authorization denials are no longer reflected in inbound_tcp_error or
inbound_http_error metrics.

A number of internal changes have been made to support this:

* The `inbound::policy::authorize` module includes middlewares for TCP
  and HTTP authorization, replacing the prior method of enforcing policy
  in the stack/router. This module ensures that metrics are recorded for
  policy decisions.
* The `error-metrics` crate has been removed. In its place a `monitor`
  type has been added to the `stack` crate, supporting a general way to
  observe errors, decoupled from the metrics registry.
* Inbound and outbound error metrics are now tracked in the inbound and
  outbound crates, respectively. Inbound- and outbound-specific error
  types are also moved into their rspective crates.
* The `app_core::errors` module has been updated to only define the
  types it needs to instrument the error response layer. Error responses
  are now primarily instrumented via the `HttpError` type so that errors
  that should be handled can be configured where the error is thrown.
  The error type now holds an underlying source error so that the error
  metrics layer can see through this wrapper type to track the
  underlying error cause.
* Server & Authorization labels are no longer handled as a free-form
  maps. We currently read only the `name` label from each; and this
  label is required.
  • Loading branch information
olix0r authored Sep 3, 2021
1 parent 6adffd2 commit 29c22af
Show file tree
Hide file tree
Showing 66 changed files with 1,940 additions and 1,204 deletions.
14 changes: 2 additions & 12 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,6 @@ dependencies = [
"linkerd-duplex",
"linkerd-errno",
"linkerd-error",
"linkerd-error-metrics",
"linkerd-error-respond",
"linkerd-exp-backoff",
"linkerd-http-classify",
Expand Down Expand Up @@ -750,6 +749,7 @@ dependencies = [
"linkerd-tonic-watch",
"linkerd-tracing",
"linkerd2-proxy-api",
"parking_lot",
"thiserror",
"tokio",
"tokio-test",
Expand Down Expand Up @@ -801,6 +801,7 @@ dependencies = [
"linkerd-identity",
"linkerd-io",
"linkerd-tracing",
"parking_lot",
"pin-project",
"thiserror",
"tokio",
Expand Down Expand Up @@ -910,17 +911,6 @@ dependencies = [
"futures",
]

[[package]]
name = "linkerd-error-metrics"
version = "0.1.0"
dependencies = [
"futures",
"linkerd-metrics",
"parking_lot",
"pin-project",
"tower",
]

[[package]]
name = "linkerd-error-respond"
version = "0.1.0"
Expand Down
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ members = [
"linkerd/duplex",
"linkerd/error",
"linkerd/errno",
"linkerd/error-metrics",
"linkerd/error-respond",
"linkerd/exp-backoff",
"linkerd/http-box",
Expand Down
39 changes: 28 additions & 11 deletions linkerd/app/admin/src/stack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ use linkerd_app_core::{
serve,
svc::{self, ExtractParam, InsertParam, Param},
tls, trace,
transport::{self, listen::Bind, ClientAddr, Local, Remote, ServerAddr},
Error,
transport::{self, listen::Bind, ClientAddr, Local, OrigDstAddr, Remote, ServerAddr},
Error, Result,
};
use linkerd_app_inbound as inbound;
use std::{pin::Pin, time::Duration};
use thiserror::Error;
use tokio::sync::mpsc;
Expand Down Expand Up @@ -64,7 +65,7 @@ impl Config {
bind: B,
identity: Option<LocalCrtKey>,
report: R,
metrics: metrics::Proxy,
metrics: inbound::Metrics,
trace: trace::Handle,
drain: drain::Watch,
shutdown: mpsc::UnboundedSender<()>,
Expand All @@ -79,11 +80,11 @@ impl Config {
let (ready, latch) = crate::server::Readiness::new();
let admin = crate::server::Admin::new(report, ready, shutdown, trace);
let admin = svc::stack(move |_| admin.clone())
.push(metrics.http_endpoint.to_layer::<classify::Response, _, Http>())
.push(metrics.proxy.http_endpoint.to_layer::<classify::Response, _, Http>())
.push(metrics.http_errors.to_layer())
.push_on_service(
svc::layers()
.push(metrics.http_errors.clone())
.push(errors::layer())
.push(errors::respond::layer())
.push(http::BoxResponse::layer()),
)
.push(http::NewServeHttp::layer(Default::default(), drain.clone()))
Expand Down Expand Up @@ -130,8 +131,10 @@ impl Config {
)
.push(svc::BoxNewService::layer())
.push(detect::NewDetectService::layer(detect::Config::<http::DetectHttp>::from_timeout(DETECT_TIMEOUT)))
.push(transport::metrics::NewServer::layer(metrics.transport))
.push_map_target(|(tls, addrs): (tls::ConditionalServerTls, B::Addrs)| {
.push(transport::metrics::NewServer::layer(metrics.proxy.transport))
.push_map_target(move |(tls, addrs): (tls::ConditionalServerTls, B::Addrs)| {
// TODO(ver): We should enforce policy here; but we need to permit liveness probes
// for destination pods to startup...
Tcp {
tls,
client: addrs.param(),
Expand Down Expand Up @@ -161,8 +164,7 @@ impl Param<transport::labels::Key> for Tcp {
self.tls.clone(),
self.addr.into(),
// TODO(ver) enforce policies on the proxy's admin port.
Default::default(),
Default::default(),
metrics::ServerLabel("default:admin".to_string()),
)
}
}
Expand All @@ -175,13 +177,28 @@ impl Param<http::Version> for Http {
}
}

impl Param<OrigDstAddr> for Http {
fn param(&self) -> OrigDstAddr {
OrigDstAddr(self.tcp.addr.into())
}
}

impl Param<metrics::ServerLabel> for Http {
fn param(&self) -> metrics::ServerLabel {
metrics::ServerLabel("default:admin".to_string())
}
}

impl Param<metrics::EndpointLabels> for Http {
fn param(&self) -> metrics::EndpointLabels {
metrics::InboundEndpointLabels {
tls: self.tcp.tls.clone(),
authority: None,
target_addr: self.tcp.addr.into(),
policy: Default::default(),
policy: metrics::AuthzLabels {
server: self.param(),
authz: "default:all-unauthenticated".to_string(),
},
}
.into()
}
Expand Down
1 change: 0 additions & 1 deletion linkerd/app/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ linkerd-detect = { path = "../../detect" }
linkerd-duplex = { path = "../../duplex" }
linkerd-errno = { path = "../../errno" }
linkerd-error = { path = "../../error" }
linkerd-error-metrics = { path = "../../error-metrics" }
linkerd-error-respond = { path = "../../error-respond" }
linkerd-exp-backoff = { path = "../../exp-backoff" }
linkerd-http-classify = { path = "../../http-classify" }
Expand Down
2 changes: 1 addition & 1 deletion linkerd/app/core/src/control.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl Config {

svc::stack(ConnectTcp::new(self.connect.keepalive))
.push(tls::Client::layer(identity))
.push_timeout(self.connect.timeout)
.push_connect_timeout(self.connect.timeout)
.push(self::client::layer())
.push_on_service(svc::MapErrLayer::new(Into::into))
.into_new_service()
Expand Down
44 changes: 44 additions & 0 deletions linkerd/app/core/src/errors/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
pub mod respond;

use linkerd_error::Error;
pub use linkerd_timeout::{FailFastError, ResponseTimeout};
use thiserror::Error;

#[derive(Debug, Error)]
#[error("connect timed out after {0:?}")]
pub(crate) struct ConnectTimeout(pub std::time::Duration);

#[derive(Debug, Error)]
#[error("{source}")]
pub struct HttpError {
#[source]
source: Error,
http_status: http::StatusCode,
grpc_status: tonic::Code,
}

impl HttpError {
pub fn bad_request(source: impl Into<Error>) -> Self {
Self {
source: source.into(),
http_status: http::StatusCode::BAD_REQUEST,
grpc_status: tonic::Code::InvalidArgument,
}
}

pub fn forbidden(source: impl Into<Error>) -> Self {
Self {
source: source.into(),
http_status: http::StatusCode::FORBIDDEN,
grpc_status: tonic::Code::PermissionDenied,
}
}

pub fn loop_detected(source: impl Into<Error>) -> Self {
Self {
source: source.into(),
http_status: http::StatusCode::LOOP_DETECTED,
grpc_status: tonic::Code::Aborted,
}
}
}
Loading

0 comments on commit 29c22af

Please sign in to comment.