From 947bef4fdf236d37bd9f8793202ef63c97fabfab Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 26 Apr 2024 20:03:48 +0100 Subject: [PATCH] Review discussion: Cap size of routing cache. This also includes more details on the relative costs we're trying to offset, and the futur ease for impling/finding a suitable hashmap. --- crates/opte-api/src/lib.rs | 2 +- xde/src/lib.rs | 2 +- xde/src/route.rs | 59 +++++++++++++++++++++++++++++++++++--- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/crates/opte-api/src/lib.rs b/crates/opte-api/src/lib.rs index c82d1cb2..60b1c1d7 100644 --- a/crates/opte-api/src/lib.rs +++ b/crates/opte-api/src/lib.rs @@ -47,7 +47,7 @@ pub use ulp::*; /// /// We rely on CI and the check-api-version.sh script to verify that /// this number is incremented anytime the oxide-api code changes. -pub const API_VERSION: u64 = 28; +pub const API_VERSION: u64 = 29; /// Major version of the OPTE package. pub const MAJOR_VERSION: u64 = 0; diff --git a/xde/src/lib.rs b/xde/src/lib.rs index e6026582..d7ff6301 100644 --- a/xde/src/lib.rs +++ b/xde/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2022 Oxide Computer Company +// Copyright 2024 Oxide Computer Company // xde - A mac provider for OPTE-based network implementations. #![feature(extern_types)] diff --git a/xde/src/route.rs b/xde/src/route.rs index 0b13e029..8e9e92ea 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -1,3 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2024 Oxide Computer Company + use crate::ip; use crate::sys; use crate::xde::xde_underlay_port; @@ -18,6 +24,9 @@ use opte::engine::ip6::Ipv6Addr; /// The duration a cached route remains valid for. const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); +/// Maximum cache size, set to prevent excessive map modification latency. +const MAX_CACHE_ENTRIES: usize = 512; + extern "C" { pub fn __dtrace_probe_next__hop( dst: uintptr_t, @@ -144,7 +153,7 @@ fn netstack_rele(ns: *mut ip::netstack_t) { // // Let's say this host (sled1) wants to send a packet to sled2. Our // sled1 host lives on network `fd00:::/64` while our -// sled2 host lives on `fd00:::/64` -- the key point +// sled2 host lives on `fd00:::/64` -- the key point // being they are two different networks and thus must be routed to // talk to each other. For sled1 to send this packet it will attempt // to look up destination `fd00:::7777` (in this case @@ -387,6 +396,35 @@ fn next_hop<'a>( } /// A simple caching layer over `next_hop`. +/// +/// [`next_hop`] has a latency distribution which roughly looks like this: +/// ```text +/// t(ns) Count +/// 1024 | 337 +/// 1280 | 108 +/// 1536 |@@@@@@@@@@@@@@@@@@@@@ 376883 +/// 1792 |@@@@@@@@@@@@@@@ 264693 +/// 2048 |@ 17798 +/// 2304 |@ 14791 +/// 2560 |@@ 32901 +/// 2816 |@ 10730 +/// 3072 | 3459 +/// ``` +/// +/// Naturally, bringing this down to O(ns) is desirable. Usually, illumos +/// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained +/// with DDM -- so we need a tradeoff between 'asking about the best route +/// per-packet' and 'holding a route until it is expired'. We choose, for noe, +/// to hold a route for 100ms. +/// +/// Note, this uses a `BTreeMap`, but we would prefer the more consistent +/// (faster) add/remove costs of a `HashMap`. As `BTreeMap` modification costs +/// outpace the cost of `next_hop` between 256--512 entries, we currently set 512 +/// as a cap on cache size to prevent significant packet stalls. This may be tricky +/// to tune. +/// +/// (See: https://github.com/oxidecomputer/opte/pull/499#discussion_r1581164767 +/// for some performance numbers.) #[derive(Clone)] pub struct RouteCache(Arc>>); @@ -432,15 +470,28 @@ impl RouteCache { _ => {} } + // We've had a definitive flow miss, but we need to cap the cache + // size to prevent excessive modification latencies at high flow + // counts. + // If full and we have no old entry to update, drop the lock and do + // not insert. + // XXX: Want to profile in future to see if LRU expiry is + // affordable/sane here. + // XXX: A HashMap would exchange insert cost for lookup. + let route_cache = (maybe_route.is_some() + || route_cache.len() <= MAX_CACHE_ENTRIES) + .then_some(route_cache); + // `next_hop` might fail for myriad reasons, but we still // send the packet on an underlay device depending on our // progress. However, we do not want to cache bad mappings. - match next_hop(&key, xde) { - Ok(route) => { + match (route_cache, next_hop(&key, xde)) { + (Some(mut route_cache), Ok(route)) => { route_cache.insert(key, route.cached(xde, t)); route } - Err(underlay_dev) => Route { + (None, Ok(route)) => route, + (_, Err(underlay_dev)) => Route { src: EtherAddr::zero(), dst: EtherAddr::zero(), underlay_dev,