From 6fa27a335a58181262e85824922bdf0dc68cfc9b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 10 Nov 2022 15:11:47 -0500 Subject: [PATCH] [sled-agent] Monitor for Tofino driver as factor in 'are we scrimlet' decision (#1918) - Adds `hardware` module to Sled Agent for minimally monitoring `devinfo` output. This will certainly evolve, but this PR includes a "bare minimum" real example of tracking the tofino driver. - Stop relying on `rss-config.toml` to decide if we're running on a scrimlet. Instead... - (General case) Rely on monitoring hardware - (Testing, manual case) Provide a `force-scrimlet` option to make the sled agent assume that it is a scrimlet Fixes https://github.com/oxidecomputer/minimum-upgradable-product/issues/19 Part of https://github.com/oxidecomputer/omicron/issues/1917 Part of https://github.com/oxidecomputer/omicron/issues/823 Pre-requisite for https://github.com/oxidecomputer/minimum-upgradable-product/issues/16 Pre-requisite for https://github.com/oxidecomputer/minimum-upgradable-product/issues/18 --- Cargo.lock | 11 + sled-agent/Cargo.toml | 1 + sled-agent/src/bootstrap/agent.rs | 30 +-- sled-agent/src/bootstrap/server.rs | 2 +- sled-agent/src/config.rs | 3 + sled-agent/src/hardware/illumos/mod.rs | 228 +++++++++++++++++++++ sled-agent/src/hardware/mod.rs | 28 +++ sled-agent/src/hardware/non_illumos/mod.rs | 37 ++++ sled-agent/src/lib.rs | 1 + sled-agent/src/nexus.rs | 43 ++++ sled-agent/src/server.rs | 61 +----- sled-agent/src/sled_agent.rs | 219 ++++++++++++++++++-- smf/sled-agent/config.toml | 9 + 13 files changed, 576 insertions(+), 97 deletions(-) create mode 100644 sled-agent/src/hardware/illumos/mod.rs create mode 100644 sled-agent/src/hardware/mod.rs create mode 100644 sled-agent/src/hardware/non_illumos/mod.rs diff --git a/Cargo.lock b/Cargo.lock index b4d9873d73..ccfd503648 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2450,6 +2450,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "illumos-devinfo" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/illumos-devinfo?rev=8fca0709e5137a3758374cb41ab1bfc60b03e6a9#8fca0709e5137a3758374cb41ab1bfc60b03e6a9" +dependencies = [ + "anyhow", + "libc", + "num_enum", +] + [[package]] name = "illumos-sys-hdrs" version = "0.1.0" @@ -3455,6 +3465,7 @@ dependencies = [ "expectorate", "futures", "http", + "illumos-devinfo", "internal-dns-client", "ipnetwork", "libc", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index c0fd3cc29c..44691d9bb7 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -59,6 +59,7 @@ vsss-rs = { version = "2.0.0", default-features = false, features = ["std"] } zone = "0.1" [target.'cfg(target_os = "illumos")'.dependencies] +illumos-devinfo = { git = "https://github.com/oxidecomputer/illumos-devinfo", rev = "8fca0709e5137a3758374cb41ab1bfc60b03e6a9" } opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "23fdf5856f10f23e2d26865d2d7e2d3bc537bca3" } [dev-dependencies] diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index b5b964edb0..252b3789d5 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -272,19 +272,11 @@ impl Agent { *self.share.lock().await = Some(share); } - // TODO(https://github.com/oxidecomputer/omicron/issues/823): - // Currently, the prescence or abscence of RSS is our signal - // for "is this a scrimlet or not". 
- // Longer-term, we should make this call based on the underlying - // hardware. - let is_scrimlet = self.rss.lock().await.is_some(); - // Server does not exist, initialize it. let server = SledServer::start( &self.sled_config, self.parent_log.clone(), sled_address, - is_scrimlet, request.clone(), ) .await @@ -464,9 +456,13 @@ impl Agent { Ok(rack_secret) } - // Initializes the Rack Setup Service. - async fn start_rss(&self, config: &Config) -> Result<(), BootstrapError> { + /// Initializes the Rack Setup Service, if requested by `config`. + pub async fn start_rss( + &self, + config: &Config, + ) -> Result<(), BootstrapError> { if let Some(rss_config) = &config.rss_config { + info!(&self.log, "bootstrap service initializing RSS"); let rss = RssHandle::start_rss( &self.parent_log, rss_config.clone(), @@ -484,20 +480,6 @@ impl Agent { } Ok(()) } - - /// Performs device initialization: - /// - /// - Verifies, unpacks, and launches other services. - pub async fn initialize( - &self, - config: &Config, - ) -> Result<(), BootstrapError> { - info!(&self.log, "bootstrap service initializing"); - - self.start_rss(config).await?; - - Ok(()) - } } // We intentionally DO NOT derive `Debug` or `Serialize`; both provide avenues diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index 35193f6c33..d79d82e47b 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -113,7 +113,7 @@ impl Server { // This ordering allows the bootstrap agent to communicate with // other bootstrap agents on the rack during the initialization // process. - if let Err(e) = server.bootstrap_agent.initialize(&config).await { + if let Err(e) = server.bootstrap_agent.start_rss(&config).await { server.inner.abort(); return Err(e.to_string()); } diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index e1042f3b4b..7986b3b647 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -19,6 +19,9 @@ pub struct Config { pub id: Uuid, /// Configuration for the sled agent debug log pub log: ConfigLogging, + /// Optionally force the sled to self-identify as a scrimlet (or gimlet, + /// if set to false). + pub stub_scrimlet: Option, /// Optional VLAN ID to be used for tagging guest VNICs. pub vlan: Option, /// Optional list of zpools to be used as "discovered disks". diff --git a/sled-agent/src/hardware/illumos/mod.rs b/sled-agent/src/hardware/illumos/mod.rs new file mode 100644 index 0000000000..e9a54cfc77 --- /dev/null +++ b/sled-agent/src/hardware/illumos/mod.rs @@ -0,0 +1,228 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use illumos_devinfo::DevInfo; +use slog::Logger; +use std::sync::Arc; +use std::sync::Mutex; +use tokio::sync::broadcast; +use tokio::task::JoinHandle; + +// A snapshot of information about the underlying Tofino device +struct TofinoSnapshot { + exists: bool, + driver_loaded: bool, +} + +impl TofinoSnapshot { + fn new() -> Self { + Self { exists: false, driver_loaded: false } + } +} + +// A snapshot of information about the underlying hardware +struct HardwareSnapshot { + tofino: TofinoSnapshot, +} + +impl HardwareSnapshot { + fn new() -> Self { + Self { tofino: TofinoSnapshot::new() } + } +} + +// Describes a view of the Tofino switch. +enum TofinoView { + // The view of the Tofino switch exactly matches the snapshot of hardware. 
+ Real(TofinoSnapshot), + // The Tofino switch has been "stubbed out", and the underlying hardware is + // being ignored. + Stub { active: bool }, +} + +// A cached copy of "our latest view of what hardware exists". +// +// This struct can be expanded arbitrarily, as it's useful for the Sled Agent +// to perceive hardware. +// +// Q: Why bother caching this information at all? Why not rely on devinfo for +// all queries? +// A: By keeping an in-memory representation, we can "diff" with the information +// reported from libdevinfo to decide when to send notifications and change +// which services are currently executing. +struct HardwareView { + tofino: TofinoView, + // TODO: Add U.2s, M.2s, other devices. +} + +impl HardwareView { + fn new() -> Self { + Self { tofino: TofinoView::Real(TofinoSnapshot::new()) } + } + + fn new_stub_tofino(active: bool) -> Self { + Self { tofino: TofinoView::Stub { active } } + } +} + +const TOFINO_SUBSYSTEM_VID: i32 = 0x1d1c; +const TOFINO_SUBSYSTEM_ID: i32 = 0x100; + +fn node_name(subsystem_vid: i32, subsystem_id: i32) -> String { + format!("pci{subsystem_vid:x},{subsystem_id:x}") +} + +// Performs a single walk of the device info tree, updating our view of hardware +// and sending notifications to any subscribers. +fn poll_device_tree( + log: &Logger, + inner: &Arc>, + tx: &broadcast::Sender, +) -> Result<(), String> { + // Construct a view of hardware by walking the device tree. + let mut device_info = DevInfo::new().map_err(|e| e.to_string())?; + let mut node_walker = device_info.walk_node(); + let mut polled_hw = HardwareSnapshot::new(); + while let Some(node) = + node_walker.next().transpose().map_err(|e| e.to_string())? + { + if node.node_name() + == node_name(TOFINO_SUBSYSTEM_VID, TOFINO_SUBSYSTEM_ID) + { + polled_hw.tofino.exists = true; + polled_hw.tofino.driver_loaded = + node.driver_name().as_deref() == Some("tofino"); + } + } + + // After inspecting the device tree, diff with the old view, and provide + // necessary updates. + let mut updates = vec![]; + { + let mut inner = inner.lock().unwrap(); + match inner.tofino { + TofinoView::Real(TofinoSnapshot { driver_loaded, exists }) => { + use super::HardwareUpdate::*; + // Identify if the Tofino device changed power states. + if exists != polled_hw.tofino.exists { + updates.push(TofinoDeviceChange); + } + + // Identify if the Tofino driver was recently loaded/unloaded. + match (driver_loaded, polled_hw.tofino.driver_loaded) { + (false, true) => updates.push(TofinoLoaded), + (true, false) => updates.push(TofinoUnloaded), + _ => (), + }; + + // Update our view of the underlying hardware + inner.tofino = TofinoView::Real(polled_hw.tofino); + } + TofinoView::Stub { .. } => (), + } + }; + + for update in updates.into_iter() { + info!(log, "Update from polling device tree: {:?}", update); + let _ = tx.send(update); + } + + Ok(()) +} + +async fn hardware_tracking_task( + log: Logger, + inner: Arc>, + tx: broadcast::Sender, +) { + loop { + if let Err(err) = poll_device_tree(&log, &inner, &tx) { + warn!(log, "Failed to query device tree: {err}"); + } + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + } +} + +/// A representation of the underlying hardware. +/// +/// This structure provides interfaces for both querying and for receiving new +/// events. 
+pub struct HardwareManager { + log: Logger, + inner: Arc>, + tx: broadcast::Sender, + _worker: JoinHandle<()>, +} + +impl HardwareManager { + /// Creates a new representation of the underlying hardware, and initializes + /// a task which periodically updates that representation. + /// + /// Arguments: + /// - `stub_scrimlet`: Identifies if we should ignore the attached Tofino + /// device, and assume the device is a scrimlet (true) or gimlet (false). + /// If this argument is not supplied, we assume the device is a gimlet until + /// device scanning informs us otherwise. + pub fn new( + log: Logger, + stub_scrimlet: Option, + ) -> Result { + let log = log.new(o!("component" => "HardwareManager")); + + // The size of the broadcast channel is arbitrary, but bounded. + // If the channel fills up, old notifications will be dropped, and the + // receiver will receive a tokio::sync::broadcast::error::RecvError::Lagged + // error, indicating they should re-scan the hardware themselves. + let (tx, _) = broadcast::channel(1024); + let hw = match stub_scrimlet { + None => HardwareView::new(), + Some(active) => HardwareView::new_stub_tofino(active), + }; + let inner = Arc::new(Mutex::new(hw)); + + // Force the device tree to be polled at least once before returning. + // This mitigates issues where the Sled Agent could try to propagate + // an "empty" view of hardware to other consumers before the first + // query. + poll_device_tree(&log, &inner, &tx) + .map_err(|err| format!("Failed to poll device tree: {err}"))?; + + let log2 = log.clone(); + let inner2 = inner.clone(); + let tx2 = tx.clone(); + let _worker = tokio::task::spawn(async move { + hardware_tracking_task(log2, inner2, tx2).await + }); + + Ok(Self { log, inner, tx, _worker }) + } + + pub fn is_scrimlet(&self) -> bool { + let inner = self.inner.lock().unwrap(); + match inner.tofino { + TofinoView::Real(TofinoSnapshot { exists, .. }) => exists, + TofinoView::Stub { active } => active, + } + } + + pub fn is_scrimlet_driver_loaded(&self) -> bool { + let inner = self.inner.lock().unwrap(); + match inner.tofino { + TofinoView::Real(TofinoSnapshot { driver_loaded, .. }) => { + driver_loaded + } + TofinoView::Stub { active } => active, + } + } + + pub fn monitor(&self) -> broadcast::Receiver { + info!(self.log, "Monitoring for hardware updates"); + self.tx.subscribe() + // TODO: Do we want to send initial messages, based on the existing + // state? Or should we leave this responsibility to the caller, to + // start monitoring, and then query for the initial state? + // + // This could simplify the `SledAgent::monitor` function? + } +} diff --git a/sled-agent/src/hardware/mod.rs b/sled-agent/src/hardware/mod.rs new file mode 100644 index 0000000000..b678087a8e --- /dev/null +++ b/sled-agent/src/hardware/mod.rs @@ -0,0 +1,28 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +cfg_if::cfg_if! { + if #[cfg(target_os = "illumos")] { + mod illumos; + pub(crate) use illumos::*; + } else { + mod non_illumos; + pub(crate) use non_illumos::*; + } +} + +/// Provides information from the underlying hardware about updates +/// which may require action on behalf of the Sled Agent. +/// +/// These updates should generally be "non-opinionated" - the higher +/// layers of the sled agent can make the call to ignore these updates +/// or not. 
+#[derive(Clone, Debug)] +#[allow(dead_code)] +pub enum HardwareUpdate { + TofinoDeviceChange, + TofinoLoaded, + TofinoUnloaded, + // TODO: Notify about disks being added / removed, etc. +} diff --git a/sled-agent/src/hardware/non_illumos/mod.rs b/sled-agent/src/hardware/non_illumos/mod.rs new file mode 100644 index 0000000000..18e89c4dba --- /dev/null +++ b/sled-agent/src/hardware/non_illumos/mod.rs @@ -0,0 +1,37 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use slog::Logger; +use tokio::sync::broadcast; + +/// An unimplemented, stub representation of the underlying hardware. +/// +/// This is intended for non-illumos systems to have roughly the same interface +/// as illumos systems - it allows compilation to "work" on non-illumos +/// platforms, which can be handy for editor support. +/// +/// If you're actually trying to run the Sled Agent on non-illumos platforms, +/// use the simulated sled agent, which does not attempt to abstract hardware. +pub struct HardwareManager {} + +impl HardwareManager { + pub fn new( + _log: Logger, + _stub_scrimlet: Option, + ) -> Result { + unimplemented!("Accessing hardware unsupported on non-illumos"); + } + + pub fn is_scrimlet(&self) -> bool { + unimplemented!("Accessing hardware unsupported on non-illumos"); + } + + pub fn is_scrimlet_driver_loaded(&self) -> bool { + unimplemented!("Accessing hardware unsupported on non-illumos"); + } + + pub fn monitor(&self) -> broadcast::Receiver { + unimplemented!("Accessing hardware unsupported on non-illumos"); + } +} diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index d36880b06d..73dcdca062 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -21,6 +21,7 @@ pub mod common; // Modules for the non-simulated sled agent. pub mod bootstrap; pub mod config; +mod hardware; mod http_entrypoints; pub mod illumos; mod instance; diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 738371efd4..9f9206545a 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -13,8 +13,12 @@ use internal_dns_client::{ }; use omicron_common::address::NEXUS_INTERNAL_PORT; use slog::Logger; +use std::future::Future; use std::net::Ipv6Addr; +use std::pin::Pin; use std::sync::Arc; +use tokio::sync::mpsc; +use tokio::task::JoinHandle; struct Inner { log: Logger, @@ -59,3 +63,42 @@ impl LazyNexusClient { )) } } + +type NexusRequestFut = dyn Future + Send; +type NexusRequest = Pin>; + +/// A queue of futures which represent requests to Nexus. +pub struct NexusRequestQueue { + tx: mpsc::UnboundedSender, + _worker: JoinHandle<()>, +} + +impl NexusRequestQueue { + /// Creates a new request queue, along with a worker which executes + /// any incoming tasks. + pub fn new() -> Self { + // TODO(https://github.com/oxidecomputer/omicron/issues/1917): + // In the future, this should basically just be a wrapper around a + // generation number, and we shouldn't be serializing requests to Nexus. + // + // In the meanwhile, we're using an unbounded_channel for simplicity, so + // that we don't need to cope with dropped notifications / + // retransmissions. + let (tx, mut rx) = mpsc::unbounded_channel(); + + let _worker = tokio::spawn(async move { + while let Some(fut) = rx.recv().await { + fut.await; + } + }); + + Self { tx, _worker } + } + + /// Gets access to the sending portion of the request queue. 
+ /// + /// Callers can use this to add their own requests. + pub fn sender(&self) -> &mpsc::UnboundedSender { + &self.tx + } +} diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 4319f7019a..4aa9078890 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -9,9 +9,6 @@ use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::bootstrap::params::SledAgentRequest; use crate::nexus::LazyNexusClient; -use omicron_common::backoff::{ - internal_service_policy_with_max, retry_notify, BackoffError, -}; use slog::Logger; use std::net::{SocketAddr, SocketAddrV6}; use uuid::Uuid; @@ -21,7 +18,6 @@ use uuid::Uuid; pub struct Server { /// Dropshot server for the API. http_server: dropshot::HttpServer, - _nexus_notifier_handle: tokio::task::JoinHandle<()>, } impl Server { @@ -38,7 +34,6 @@ impl Server { config: &Config, log: Logger, addr: SocketAddrV6, - is_scrimlet: bool, request: SledAgentRequest, ) -> Result { info!(log, "setting up sled agent server"); @@ -71,61 +66,7 @@ impl Server { .map_err(|error| format!("initializing server: {}", error))? .start(); - let sled_address = http_server.local_addr(); - let sled_id = config.id; - let nexus_notifier_handle = tokio::task::spawn(async move { - // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped exponential - // backoff. - // - // TODO-robustness if this returns a 400 error, we probably want to - // return a permanent error from the `notify_nexus` closure. - let notify_nexus = || async { - info!( - log, - "contacting server nexus, registering sled: {}", sled_id - ); - let role = if is_scrimlet { - nexus_client::types::SledRole::Scrimlet - } else { - nexus_client::types::SledRole::Gimlet - }; - - let nexus_client = lazy_nexus_client - .get() - .await - .map_err(|err| BackoffError::transient(err.to_string()))?; - nexus_client - .sled_agent_put( - &sled_id, - &nexus_client::types::SledAgentStartupInfo { - sa_address: sled_address.to_string(), - role, - }, - ) - .await - .map_err(|err| BackoffError::transient(err.to_string())) - }; - let log_notification_failure = |err, delay| { - warn!( - log, - "failed to notify nexus about sled agent: {}, will retry in {:?}", err, delay; - ); - }; - retry_notify( - internal_service_policy_with_max( - std::time::Duration::from_secs(1), - ), - notify_nexus, - log_notification_failure, - ) - .await - .expect("Expected an infinite retry loop contacting Nexus"); - }); - Ok(Server { - http_server, - _nexus_notifier_handle: nexus_notifier_handle, - }) + Ok(Server { http_server }) } /// Wait for the given server to shut down diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index c5e11a1f4b..4cc5d65ad7 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -6,6 +6,7 @@ use crate::bootstrap::params::SledAgentRequest; use crate::config::Config; +use crate::hardware::HardwareManager; use crate::illumos::vnic::VnicKind; use crate::illumos::zfs::{ Mountpoint, ZONE_ZFS_DATASET, ZONE_ZFS_DATASET_MOUNTPOINT, @@ -13,7 +14,7 @@ use crate::illumos::zfs::{ use crate::illumos::zone::IPADM; use crate::illumos::{execute, PFEXEC}; use crate::instance_manager::InstanceManager; -use crate::nexus::LazyNexusClient; +use crate::nexus::{LazyNexusClient, NexusRequestQueue}; use crate::params::{ DatasetKind, DiskStateRequested, InstanceHardware, InstanceMigrateParams, InstanceRuntimeStateRequested, InstanceSerialConsoleData, @@ -27,9 +28,13 @@ use 
omicron_common::api::{ internal::nexus::DiskRuntimeState, internal::nexus::InstanceRuntimeState, internal::nexus::UpdateArtifact, }; +use omicron_common::backoff::{ + internal_service_policy_with_max, retry_notify, BackoffError, +}; use slog::Logger; use std::net::SocketAddrV6; use std::process::Command; +use std::sync::Arc; use uuid::Uuid; use crucible_client_types::VolumeConstructionRequest; @@ -92,6 +97,9 @@ pub enum Error { #[error("Error managing guest networking: {0}")] Opte(#[from] crate::opte::Error), + #[error("Error monitoring hardware: {0}")] + Hardware(String), + #[error("Error resolving DNS name: {0}")] ResolveError(#[from] internal_dns_client::multiclient::ResolveError), } @@ -140,20 +148,35 @@ impl From for dropshot::HttpError { /// Describes an executing Sled Agent object. /// /// Contains both a connection to the Nexus, as well as managed instances. -pub struct SledAgent { +struct SledAgentInner { // ID of the Sled id: Uuid, + // Sled underlay address + addr: SocketAddrV6, + // Component of Sled Agent responsible for storage and dataset management. storage: StorageManager, // Component of Sled Agent responsible for managing Propolis instances. instances: InstanceManager, - lazy_nexus_client: LazyNexusClient, + // Component of Sled Agent responsible for monitoring hardware. + hardware: HardwareManager, // Other Oxide-controlled services running on this Sled. services: ServiceManager, + + // Lazily-acquired connection to Nexus. + lazy_nexus_client: LazyNexusClient, + + // A serialized request queue for operations interacting with Nexus. + nexus_request_queue: NexusRequestQueue, +} + +#[derive(Clone)] +pub struct SledAgent { + inner: Arc, } impl SledAgent { @@ -298,6 +321,11 @@ impl SledAgent { gateway_address: request.gateway.address, ..Default::default() }; + + let hardware = + HardwareManager::new(parent_log.clone(), config.stub_scrimlet) + .map_err(|e| Error::Hardware(e))?; + let services = ServiceManager::new( parent_log.clone(), etherstub.clone(), @@ -309,11 +337,170 @@ impl SledAgent { ) .await?; - Ok(SledAgent { id, storage, instances, lazy_nexus_client, services }) + let sled_agent = SledAgent { + inner: Arc::new(SledAgentInner { + id, + addr: sled_address, + storage, + instances, + hardware, + services, + lazy_nexus_client, + + // TODO(https://github.com/oxidecomputer/omicron/issues/1917): + // Propagate usage of this request queue throughout the Sled Agent. + // + // Also, we could maybe de-dup some of the backoff code in the request queue? + nexus_request_queue: NexusRequestQueue::new(), + }), + }; + + // We immediately add a notification to the request queue about our + // existence. If inspection of the hardware later informs us that we're + // actually running on a scrimlet, that's fine, the updated value will + // be received by Nexus eventually. + sled_agent.notify_nexus_about_self(&log); + + // Begin monitoring the underlying hardware, and reacting to changes. + let sa = sled_agent.clone(); + tokio::spawn(async move { + sa.monitor(log).await; + }); + + Ok(sled_agent) + } + + // Observe the current hardware state manually. + // + // We use this when we're monitoring hardware for the first + // time, and if we miss notifications. 
+ async fn full_hardware_scan(&self, log: &Logger) { + info!(log, "Performing full hardware scan"); + self.notify_nexus_about_self(log); + if self.inner.hardware.is_scrimlet_driver_loaded() { + self.ensure_scrimlet_services_active(&log).await; + } else { + self.ensure_scrimlet_services_deactive(&log).await; + } + } + + async fn monitor(&self, log: Logger) { + // Start monitoring the hardware for changes + let mut hardware_updates = self.inner.hardware.monitor(); + + // Scan the system manually for events we have have missed + // before we started monitoring. + self.full_hardware_scan(&log).await; + + // Rely on monitoring for tracking all future updates. + loop { + use tokio::sync::broadcast::error::RecvError; + match hardware_updates.recv().await { + Ok(update) => match update { + crate::hardware::HardwareUpdate::TofinoDeviceChange => { + // Inform Nexus that we're now a scrimlet, instead of a Gimlet. + // + // This won't block on Nexus responding; it may take while before + // Nexus actually comes online. + self.notify_nexus_about_self(&log); + } + crate::hardware::HardwareUpdate::TofinoLoaded => { + self.ensure_scrimlet_services_active(&log).await; + } + crate::hardware::HardwareUpdate::TofinoUnloaded => { + self.ensure_scrimlet_services_deactive(&log).await; + } + }, + Err(RecvError::Lagged(count)) => { + warn!(log, "Hardware monitor missed {count} messages"); + self.full_hardware_scan(&log).await; + } + Err(RecvError::Closed) => { + warn!(log, "Hardware monitor receiver closed; exiting"); + return; + } + } + } + } + + async fn ensure_scrimlet_services_active(&self, log: &Logger) { + // TODO(https://github.com/oxidecomputer/omicron/issues/823): Launch the switch zone, with + // Dendrite, MGS, and any other services we want to enable. + warn!(log, "Activating scrimlet services not yet implemented"); + } + + async fn ensure_scrimlet_services_deactive(&self, log: &Logger) { + // TODO(https://github.com/oxidecomputer/omicron/issues/823): Terminate the switch zone. + warn!(log, "Deactivating scrimlet services not yet implemented"); } pub fn id(&self) -> Uuid { - self.id + self.inner.id + } + + // Sends a request to Nexus informing it that the current sled exists. + fn notify_nexus_about_self(&self, log: &Logger) { + let sled_id = self.inner.id; + let lazy_nexus_client = self.inner.lazy_nexus_client.clone(); + let sled_address = self.inner.addr; + let is_scrimlet = self.inner.hardware.is_scrimlet(); + let log = log.clone(); + let fut = async move { + // Notify the control plane that we're up, and continue trying this + // until it succeeds. We retry with an randomized, capped exponential + // backoff. + // + // TODO-robustness if this returns a 400 error, we probably want to + // return a permanent error from the `notify_nexus` closure. 
+ let notify_nexus = || async { + info!( + log, + "contacting server nexus, registering sled: {}", sled_id + ); + let role = if is_scrimlet { + nexus_client::types::SledRole::Scrimlet + } else { + nexus_client::types::SledRole::Gimlet + }; + + let nexus_client = lazy_nexus_client + .get() + .await + .map_err(|err| BackoffError::transient(err.to_string()))?; + nexus_client + .sled_agent_put( + &sled_id, + &nexus_client::types::SledAgentStartupInfo { + sa_address: sled_address.to_string(), + role, + }, + ) + .await + .map_err(|err| BackoffError::transient(err.to_string())) + }; + let log_notification_failure = |err, delay| { + warn!( + log, + "failed to notify nexus about sled agent: {}, will retry in {:?}", err, delay; + ); + }; + retry_notify( + internal_service_policy_with_max( + std::time::Duration::from_secs(1), + ), + notify_nexus, + log_notification_failure, + ) + .await + .expect("Expected an infinite retry loop contacting Nexus"); + }; + self.inner + .nexus_request_queue + .sender() + .send(Box::pin(fut)) + .unwrap_or_else(|err| { + panic!("Failed to send future to request queue: {err}"); + }); } /// Ensures that particular services should be initialized. @@ -324,7 +511,7 @@ impl SledAgent { &self, requested_services: ServiceEnsureBody, ) -> Result<(), Error> { - self.services.ensure(requested_services).await?; + self.inner.services.ensure(requested_services).await?; Ok(()) } @@ -335,7 +522,8 @@ impl SledAgent { dataset_kind: DatasetKind, address: SocketAddrV6, ) -> Result<(), Error> { - self.storage + self.inner + .storage .upsert_filesystem(zpool_uuid, dataset_kind, address) .await?; Ok(()) @@ -349,7 +537,8 @@ impl SledAgent { target: InstanceRuntimeStateRequested, migrate: Option, ) -> Result { - self.instances + self.inner + .instances .ensure(instance_id, initial, target, migrate) .await .map_err(|e| Error::Instance(e)) @@ -373,7 +562,7 @@ impl SledAgent { &self, artifact: UpdateArtifact, ) -> Result<(), Error> { - let nexus_client = self.lazy_nexus_client.get().await?; + let nexus_client = self.inner.lazy_nexus_client.get().await?; crate::updates::download_artifact(artifact, &nexus_client).await?; Ok(()) } @@ -384,7 +573,8 @@ impl SledAgent { byte_offset: ByteOffset, max_bytes: Option, ) -> Result { - self.instances + self.inner + .instances .instance_serial_console_buffer_data( instance_id, byte_offset, @@ -401,7 +591,8 @@ impl SledAgent { disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - self.instances + self.inner + .instances .instance_issue_disk_snapshot_request( instance_id, disk_id, @@ -430,7 +621,11 @@ impl SledAgent { _vpc_id: Uuid, rules: &[VpcFirewallRule], ) -> Result<(), Error> { - self.instances.firewall_rules_ensure(rules).await.map_err(Error::from) + self.inner + .instances + .firewall_rules_ensure(rules) + .await + .map_err(Error::from) } } diff --git a/smf/sled-agent/config.toml b/smf/sled-agent/config.toml index 9af1db6f2e..cf082e7dfc 100644 --- a/smf/sled-agent/config.toml +++ b/smf/sled-agent/config.toml @@ -2,6 +2,15 @@ id = "fb0f7546-4d46-40ca-9d56-cbb810684ca7" +# Identifies if the sled should be unconditionally treated as a scrimlet. +# +# If this is set to "true", the sled agent treats itself as a scrimlet. +# If this is set to "false", the sled agent treats itself as a gimlet. +# If this is unset: +# - On illumos, the sled automatically detects whether or not it is a scrimlet. +# - On all other platforms, the sled assumes it is a gimlet. 
+# stub_scrimlet = true + # A file-backed zpool can be manually created with the following: # # truncate -s 10GB testpool.vdev # # zpool create oxp_d462a7f7-b628-40fe-80ff-4e4189e2d62b "$PWD/testpool.vdev"
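As a sketch of the consumption pattern this patch sets up (using the `HardwareManager`, `HardwareUpdate`, and slog names introduced above; the wrapper function below is hypothetical, not part of the change): a caller subscribes via `monitor()` *before* querying the initial state, so no update can slip in between, and it handles a `Lagged` error by re-scanning rather than trusting dropped messages — the same pattern `SledAgent::monitor` follows.

```rust
// Hypothetical consumer of the hardware monitor. `HardwareManager`,
// `HardwareUpdate`, and the broadcast error types are as defined in this
// patch; this function itself is illustrative only.
use crate::hardware::{HardwareManager, HardwareUpdate};
use slog::{info, warn, Logger};
use tokio::sync::broadcast::error::RecvError;

async fn watch_hardware(log: Logger, hardware: HardwareManager) {
    // Subscribe before the initial query so no update can be missed
    // between "read the current state" and "start listening".
    let mut updates = hardware.monitor();

    // The monitor does not replay prior state to new subscribers, so
    // establish the initial view manually.
    info!(log, "initial view"; "scrimlet" => hardware.is_scrimlet());

    loop {
        match updates.recv().await {
            Ok(HardwareUpdate::TofinoDeviceChange) => {
                info!(log, "tofino device changed power state");
            }
            Ok(HardwareUpdate::TofinoLoaded) => {
                info!(log, "tofino driver loaded");
            }
            Ok(HardwareUpdate::TofinoUnloaded) => {
                info!(log, "tofino driver unloaded");
            }
            // The channel is bounded; if this receiver lagged, re-query
            // the full state instead of relying on dropped messages.
            Err(RecvError::Lagged(missed)) => {
                warn!(log, "missed {missed} hardware updates; re-scanning");
                info!(log, "current view"; "scrimlet" => hardware.is_scrimlet());
            }
            Err(RecvError::Closed) => return,
        }
    }
}
```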