From 17b18badc30506e8b3fe3b1aa5bfaadc3d6b8478 Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 30 Jun 2022 11:20:53 -0700 Subject: [PATCH 1/2] DLPX-81825 same-checkpoint alloc and free leaks blocks (#500) When a segment is marked as allocated and then gets freed within the same checkpoint it never actually makes it back to the set of allocatable segments effectively leaking its space. --- .../zettacache/src/block_allocator/mod.rs | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs b/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs index b19f310fd904..d77874e5fff6 100644 --- a/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs +++ b/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs @@ -246,13 +246,7 @@ impl SlabTrait for BitmapSlab { "double free at slot {:?}", slot ); - - if self.allocating.contains(slot) { - assert!(!self.freeing.contains(slot)); - self.allocating.remove(slot); - } else { - self.freeing.insert(slot); - } + self.freeing.insert(slot); } fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) { @@ -269,14 +263,12 @@ impl SlabTrait for BitmapSlab { } self.allocating.clear(); + // Space freed during this checkpoint is now available for reallocation. for (slot, run) in self.freeing.iter_ranges() { spacemap.free(Extent { location: self.slot_to_location(slot), size: u64::from(run) * u64::from(self.slot_size), }); - } - // Space freed during this checkpoint is now available for reallocation. - for (slot, run) in self.freeing.iter_ranges() { with_alloctag(Self::ALLOCATABLE_TAG, || { self.allocatable.insert_range(slot..(slot + run)) }); @@ -502,17 +494,7 @@ impl SlabTrait for ExtentSlab { let size = extent.size; self.allocatable.verify_absent(offset, size); - - match self.allocating.overlap(offset, size) { - Some(_) => { - self.freeing.verify_absent(offset, size); - self.allocating.remove(offset, size); - } - None => { - self.allocating.verify_absent(offset, size); - self.freeing.add(offset, size); - } - } + self.freeing.add(offset, size); } fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) { From 786d0dbeac80b28e05fb05202324c40ddd2ad7ee Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Thu, 30 Jun 2022 15:23:28 -0700 Subject: [PATCH 2/2] DLPX-80711 zcache device removal (#419) --- cmd/zfs_object_agent/Cargo.lock | 1 + cmd/zfs_object_agent/util/src/message.rs | 9 + cmd/zfs_object_agent/util/src/vec_ext.rs | 66 ++- .../util/src/zettacache_stats.rs | 8 +- cmd/zfs_object_agent/zcache/Cargo.toml | 1 + cmd/zfs_object_agent/zcache/src/main.rs | 32 +- cmd/zfs_object_agent/zcache/src/remove.rs | 74 +++ cmd/zfs_object_agent/zcache/src/stats.rs | 8 +- .../zettacache/src/base_types.rs | 13 + .../zettacache/src/block_access.rs | 105 ++-- .../zettacache/src/block_allocator/mod.rs | 472 +++++++++++++++--- .../zettacache/src/block_based_log/mod.rs | 43 ++ .../zettacache/src/checkpoint.rs | 3 + .../zettacache/src/features.rs | 5 + cmd/zfs_object_agent/zettacache/src/open.rs | 46 +- .../zettacache/src/slab_allocator.rs | 334 +++++++++++-- .../zettacache/src/space_map.rs | 6 + .../zettacache/src/superblock.rs | 101 ++-- .../zettacache/src/zcachedb.rs | 4 +- .../zettacache/src/zettacache/mod.rs | 339 +++++++++++-- .../zettacache/src/zettacache/removal.rs | 118 +++++ .../zettacache/src/zettacache/zcdb.rs | 38 +- .../zettaobject/src/root_connection.rs | 61 +++ 23 files changed, 1613 insertions(+), 274 deletions(-) create mode 100644 
cmd/zfs_object_agent/zcache/src/remove.rs create mode 100644 cmd/zfs_object_agent/zettacache/src/zettacache/removal.rs diff --git a/cmd/zfs_object_agent/Cargo.lock b/cmd/zfs_object_agent/Cargo.lock index 59a71ffe70c0..da31414a4b63 100644 --- a/cmd/zfs_object_agent/Cargo.lock +++ b/cmd/zfs_object_agent/Cargo.lock @@ -3324,6 +3324,7 @@ dependencies = [ "chrono", "clap", "exitcode", + "git-version", "humantime", "log", "more-asserts", diff --git a/cmd/zfs_object_agent/util/src/message.rs b/cmd/zfs_object_agent/util/src/message.rs index 076e0927b37e..e82d6a2b8fe9 100644 --- a/cmd/zfs_object_agent/util/src/message.rs +++ b/cmd/zfs_object_agent/util/src/message.rs @@ -139,6 +139,10 @@ pub const TYPE_ZCACHE_STATS: &str = "zcache stats"; pub const TYPE_ZCACHE_STATUS: &str = "zcache status"; pub const TYPE_ADD_DISK: &str = "add disk"; pub const TYPE_EXPAND_DISK: &str = "expand disk"; +pub const TYPE_REMOVE_DISK: &str = "remove disk"; +pub const TYPE_CANCEL_DISK_REMOVAL: &str = "cancel disk removal"; +pub const TYPE_PAUSE_DISK_REMOVALS: &str = "pause disk removals"; +pub const TYPE_RESUME_DISK_REMOVALS: &str = "resume disk removals"; pub const TYPE_SYNC_CHECKPOINT: &str = "sync checkpoint"; pub const TYPE_INITIATE_MERGE: &str = "initiate merge"; @@ -152,6 +156,11 @@ pub struct ExpandDiskRequest { pub path: PathBuf, } +#[derive(Serialize, Deserialize, Debug)] +pub struct RemoveDiskRequest { + pub path: PathBuf, +} + #[derive(Serialize, Deserialize, Debug)] pub struct ExpandDiskResponse { pub new_size: u64, diff --git a/cmd/zfs_object_agent/util/src/vec_ext.rs b/cmd/zfs_object_agent/util/src/vec_ext.rs index 4617605316ae..a9ad2312f744 100644 --- a/cmd/zfs_object_agent/util/src/vec_ext.rs +++ b/cmd/zfs_object_agent/util/src/vec_ext.rs @@ -243,7 +243,7 @@ pub struct VecMap { impl VecMap where - K: Into + Copy, + K: From + Into + Copy, { /// Returns old value (or None if not present) pub fn insert(&mut self, key: K, value: V) -> Option { @@ -285,6 +285,10 @@ where self.vec[index].take() } + pub fn contains_key(&self, key: &K) -> bool { + self.get(*key).is_some() + } + pub fn values(&self) -> impl Iterator { self.vec.iter().filter_map(|v| v.as_ref()) } @@ -313,6 +317,38 @@ where self.vec[range].iter_mut().filter_map(|v| v.as_mut()) } + pub fn iter(&self) -> impl DoubleEndedIterator { + self.vec + .iter() + .enumerate() + .flat_map(|(k, v)| v.as_ref().map(|v| (k.into(), v))) + } + + pub fn iter_mut(&mut self) -> impl DoubleEndedIterator { + self.vec + .iter_mut() + .enumerate() + .flat_map(|(k, v)| v.as_mut().map(|v| (k.into(), v))) + } + + pub fn keys(&self) -> impl DoubleEndedIterator + '_ { + self.iter().map(|(k, _)| k) + } + + pub fn retain(&mut self, mut f: F) + where + F: FnMut(&K, &mut V) -> bool, + { + for k in 0..self.vec.len() { + let k = k.into(); + if let Some(v) = self.get_mut(k) { + if !f(&k, v) { + self.remove(k); + } + } + } + } + /// Returns the number of elements in the map. 
pub fn len(&self) -> usize { self.num_entries @@ -322,3 +358,31 @@ where self.len() == 0 } } + +impl VecMap +where + K: From + Into + Copy, + V: Default, +{ + pub fn get_mut_or_default(&mut self, key: K) -> &mut V { + if self.contains_key(&key) { + return self.get_mut(key).unwrap(); + } + let old = self.insert(key, Default::default()); + assert!(old.is_none()); + self.get_mut(key).unwrap() + } +} + +impl FromIterator<(K, V)> for VecMap +where + K: From + Into + Copy, +{ + fn from_iter>(iter: T) -> VecMap { + let mut map = VecMap::default(); + for (k, v) in iter.into_iter() { + map.insert(k, v); + } + map + } +} diff --git a/cmd/zfs_object_agent/util/src/zettacache_stats.rs b/cmd/zfs_object_agent/util/src/zettacache_stats.rs index 6e1078f59c2f..63a6e47cd913 100644 --- a/cmd/zfs_object_agent/util/src/zettacache_stats.rs +++ b/cmd/zfs_object_agent/util/src/zettacache_stats.rs @@ -386,8 +386,8 @@ pub enum CacheStatCounter { SpeculativeBufferBytesAvailable, SlabCapacity, AvailableSpace, - AvailableBlocksSize, - AvailableSlabsSize, + FreeBlocksSize, + FreeSlabsSize, } impl Display for CacheStatCounter { @@ -447,8 +447,8 @@ impl Sub<&Self> for &CacheStats { | CacheStatCounter::SpeculativeBufferBytesAvailable | CacheStatCounter::SlabCapacity | CacheStatCounter::AvailableSpace - | CacheStatCounter::AvailableSlabsSize - | CacheStatCounter::AvailableBlocksSize => { + | CacheStatCounter::FreeSlabsSize + | CacheStatCounter::FreeBlocksSize => { *diff_stat = self_stat.clone(); } // Everything else should be subtracted diff --git a/cmd/zfs_object_agent/zcache/Cargo.toml b/cmd/zfs_object_agent/zcache/Cargo.toml index 4290009e0ea6..f9125b59512e 100644 --- a/cmd/zfs_object_agent/zcache/Cargo.toml +++ b/cmd/zfs_object_agent/zcache/Cargo.toml @@ -13,6 +13,7 @@ chrono = "0.4" async-trait = "0.1.51" clap={version="3.1.6",features=["derive"]} exitcode = "1.1.2" +git-version = "0.3.5" humantime = "2.1.0" log = "0.4" more-asserts = "0.2.1" diff --git a/cmd/zfs_object_agent/zcache/src/main.rs b/cmd/zfs_object_agent/zcache/src/main.rs index 7b303fbf2c8b..8c574ea98cfc 100644 --- a/cmd/zfs_object_agent/zcache/src/main.rs +++ b/cmd/zfs_object_agent/zcache/src/main.rs @@ -12,6 +12,7 @@ mod iostat; mod labelclear; mod list; mod remote_channel; +mod remove; mod stats; mod status; mod subcommand; @@ -22,17 +23,24 @@ use std::path::PathBuf; use anyhow::Result; use clap::Parser; use clap::Subcommand; +use git_version::git_version; use hits::ClearHitData; use log::*; use subcommand::ZcacheSubCommand; +static GIT_VERSION: &str = git_version!( + fallback = match option_env!("CARGO_ZOA_GITREV") { + Some(value) => value, + None => "unknown", + } +); + fn main() -> Result<()> { async_main() } #[derive(Parser)] -// XXX other commands use a git derived version here -#[clap(version = "1.1")] +#[clap(version=GIT_VERSION)] #[clap(name = "zcache")] #[clap(about = "ZFS ZettaCache Command")] #[clap(propagate_version = true)] @@ -62,6 +70,7 @@ enum Commands { Stats(stats::Stats), Add(add::Add), Expand(expand::Expand), + Remove(remove::Remove), Sync(sync::Sync), Labelclear(labelclear::Labelclear), Status(status::Status), @@ -80,6 +89,7 @@ impl Commands { Commands::Stats(stats) => stats, Commands::Add(add) => add, Commands::Expand(expand) => expand, + Commands::Remove(remove) => remove, Commands::Sync(sync) => sync, Commands::Labelclear(labelclear) => labelclear, Commands::ClearHitData(clear_hit_data) => clear_hit_data, @@ -104,8 +114,26 @@ mod test_clap { use super::*; + fn neg(s: &str) { + 
assert!(Cli::try_parse_from(s.split_whitespace()).is_err()); + } + #[test] fn test_debug_asserts() { Cli::command().debug_assert(); } + + #[test] + fn test_zcache_remove_conflicts() { + neg("zcache remove --pause --resume"); + neg("zcache remove --pause --cancel"); + neg("zcache remove --pause -s"); + neg("zcache remove --pause disk0"); + neg("zcache remove --resume --cancel"); + neg("zcache remove --resume -s"); + neg("zcache remove --resume -s"); + neg("zcache remove --resume disk0"); + neg("zcache remove --cancel"); + neg("zcache remove"); + } } diff --git a/cmd/zfs_object_agent/zcache/src/remove.rs b/cmd/zfs_object_agent/zcache/src/remove.rs new file mode 100644 index 000000000000..d6cda783415a --- /dev/null +++ b/cmd/zfs_object_agent/zcache/src/remove.rs @@ -0,0 +1,74 @@ +//! `zcache remove` subcommand + +use std::path::PathBuf; + +use anyhow::Result; +use async_trait::async_trait; +use clap::Parser; +use util::message::RemoveDiskRequest; +use util::message::TYPE_CANCEL_DISK_REMOVAL; +use util::message::TYPE_PAUSE_DISK_REMOVALS; +use util::message::TYPE_REMOVE_DISK; +use util::message::TYPE_RESUME_DISK_REMOVALS; +use util::writeln_stdout; + +use crate::remote_channel::RemoteChannel; +use crate::subcommand::ZcacheSubCommand; + +#[derive(Parser)] +#[clap(about = "Remove disks from the ZettaCache.")] +pub struct Remove { + /// Pause ongoing removals + #[clap( + long, + conflicts_with_all = &["resume", "cancel", "path"] + )] + pause: bool, + + /// Resume paused removals + #[clap( + long, + conflicts_with_all = &["cancel", "path"] + )] + resume: bool, + + /// Stop and cancel in-progress removal + #[clap(short = 's', long)] + cancel: bool, + + /// Supplied disk for the operation issued + #[clap(required_unless_present_any = &["pause", "resume"])] + path: Option, +} + +#[async_trait] +impl ZcacheSubCommand for Remove { + async fn invoke(&self) -> Result<()> { + let mut remote = RemoteChannel::new(true).await?; + if self.pause { + remote.call(TYPE_PAUSE_DISK_REMOVALS, None).await?; + writeln_stdout!("Removals paused"); + } else if self.resume { + remote.call(TYPE_RESUME_DISK_REMOVALS, None).await?; + writeln_stdout!("Resuming removals"); + } else if self.cancel { + let path = self.path.clone().unwrap(); + let request = RemoveDiskRequest { path: path.clone() }; + remote + .call( + TYPE_CANCEL_DISK_REMOVAL, + Some(nvpair::to_nvlist(&request).unwrap()), + ) + .await?; + writeln_stdout!("Cancelled the removal of {path:?}"); + } else { + let path = self.path.clone().unwrap(); + let request = RemoveDiskRequest { path: path.clone() }; + remote + .call(TYPE_REMOVE_DISK, Some(nvpair::to_nvlist(&request).unwrap())) + .await?; + writeln_stdout!("Removing {path:?}"); + } + Ok(()) + } +} diff --git a/cmd/zfs_object_agent/zcache/src/stats.rs b/cmd/zfs_object_agent/zcache/src/stats.rs index 3695f1cfdcbc..a6174e58ca51 100644 --- a/cmd/zfs_object_agent/zcache/src/stats.rs +++ b/cmd/zfs_object_agent/zcache/src/stats.rs @@ -256,11 +256,11 @@ impl StatsDisplay { if self.show_block_allocator { let available_space = values.value(AvailableSpace); let slab_capacity = values.value(SlabCapacity); - let free_blocks_size = values.value(AvailableBlocksSize); - let free_slabs_size = values.value(AvailableSlabsSize); - let block_allocator_allocated = slab_capacity - free_blocks_size - free_slabs_size; + let free_blocks_size = values.value(FreeBlocksSize); + let free_slabs_size = values.value(FreeSlabsSize); + let allocated_space = slab_capacity - free_blocks_size - free_slabs_size; - 
self.display_bytes(block_allocator_allocated as f64); + self.display_bytes(allocated_space as f64); self.display_bytes(available_space as f64); self.display_percent(free_blocks_size as f64, slab_capacity as f64); self.display_percent(free_slabs_size as f64, slab_capacity as f64); diff --git a/cmd/zfs_object_agent/zettacache/src/base_types.rs b/cmd/zfs_object_agent/zettacache/src/base_types.rs index f0aa6ffcc1d7..7e715281879f 100644 --- a/cmd/zfs_object_agent/zettacache/src/base_types.rs +++ b/cmd/zfs_object_agent/zettacache/src/base_types.rs @@ -91,6 +91,19 @@ impl DiskId { pub fn index(self) -> usize { self.0 as usize } + pub fn next(&self) -> DiskId { + DiskId(self.0 + 1) + } +} +impl From for usize { + fn from(val: DiskId) -> Self { + val.index() + } +} +impl From for DiskId { + fn from(val: usize) -> Self { + DiskId::new(val) + } } #[derive(Serialize, Deserialize, Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] diff --git a/cmd/zfs_object_agent/zettacache/src/block_access.rs b/cmd/zfs_object_agent/zettacache/src/block_access.rs index f720ea9e5cdc..a64d4ab37e1d 100644 --- a/cmd/zfs_object_agent/zettacache/src/block_access.rs +++ b/cmd/zfs_object_agent/zettacache/src/block_access.rs @@ -49,6 +49,7 @@ use util::DeviceEntry; use util::DeviceList; use util::DeviceStatus; use util::From64; +use util::VecMap; use uuid::Uuid; use crate::base_types::DiskId; @@ -138,7 +139,7 @@ impl<'a> Drop for OpInProgress<'a> { #[derive(Debug)] pub struct BlockAccess { sector_size: usize, - disks: RwLock>, + disks: RwLock>, readonly: bool, timebase: Instant, } @@ -149,7 +150,17 @@ pub struct Disk { file: Arc, path: PathBuf, + + // In most OSs it is common for a device to be represented by multiple files in different + // subdirectories under /dev, and subdirectories created by different cloud providers may + // complicate things even more. Therefore we remember each device path as the path that was + // supplied by the user so that we can display it back to them in a way that they + // understand. That said, we want to stay flexible and allow the user to specify any path + // that resolves to the same device. So we derive the canonical name so devices in the config + // can be matched with devices supplied by the user regardless of the path by which they are + // specified. canonical_path: PathBuf, + size: Mutex, sector_size: usize, #[derivative(Debug = "ignore")] @@ -589,9 +600,14 @@ impl Disk { // we can use "glommio" to use io_uring for much lower overheads. Or SPDK // (which can use io_uring or nvme hardware directly).
impl BlockAccess { - pub fn new(disks: Vec, readonly: bool) -> Self { - let sector_size = disks - .iter() + pub fn new(disks: &BTreeMap, readonly: bool) -> Result { + let mut disk_map: VecMap = Default::default(); + for (&disk_id, disk_path) in disks { + disk_map.insert(disk_id, Disk::new(disk_path, false)?); + } + + let sector_size = disk_map + .values() .reduce(|a, b| { assert_eq!(a.sector_size, b.sector_size); a }) .unwrap() .sector_size; - BlockAccess { + Ok(BlockAccess { sector_size, - disks: RwLock::new(disks), + disks: RwLock::new(disk_map), readonly, timebase: Instant::now(), - } + }) } pub fn add_disk(&self, disk: Disk) -> Result { let mut disks = self.disks.write().unwrap(); - for existing_disk in disks.iter() { - if disk.canonical_path == existing_disk.canonical_path { - return Err(anyhow!( - "disk {:?} ({:?}) is already part of the zettacache", - disk.path, - disk.canonical_path, - )); - } + if disks + .values() + .any(|existing_disk| existing_disk.canonical_path == disk.canonical_path) + { + return Err(anyhow!( + "disk {:?} ({:?}) is already part of the zettacache", + disk.path, + disk.canonical_path, + )); } - let id = DiskId::new(disks.len()); - disks.push(disk); + let id = match disks.keys().next_back() { + Some(id) => id.next(), + None => DiskId::new(0), + }; + disks.insert(id, disk); Ok(id) } // Returns the number of bytes added to the disk. pub fn expand_disk(&self, disk: DiskId) -> Result { let disks = self.disks.read().unwrap(); - let disk = &disks[disk.index()]; + let disk = disks + .get(disk) + .ok_or_else(|| anyhow!("cannot expand removed disk"))?; let (_, new_size) = disk_sizes(&disk.file)?; let mut size = disk.size.lock().unwrap(); let additional_bytes = new_size.checked_sub(*size).ok_or_else(|| { @@ -643,11 +665,21 @@ impl BlockAccess { }) } - /// Note: In the future we'll support device removal in which case the - /// DiskId's will probably not be sequential. By using this accessor we - /// need not assume anything about the values inside the DiskId's. + pub fn remove_disk(&self, disk: DiskId) { + let mut disks = self.disks.write().unwrap(); + let removed = disks.remove(disk); + assert!(removed.is_some()); + } + + /// Accessor function that does not assume anything about the values of the DiskIds. + /// Note: after device removal, DiskIds may not be sequential.
pub fn disks(&self) -> impl Iterator { - (0..self.disks.read().unwrap().len()).map(DiskId::new) + self.disks + .read() + .unwrap() + .keys() + .collect::>() + .into_iter() } pub fn path_to_disk_id(&self, path: &Path) -> Result { @@ -656,8 +688,8 @@ impl BlockAccess { .read() .unwrap() .iter() - .position(|disk| disk.canonical_path == canonical_path) - .map(DiskId::new) + .find(|(_, disk)| disk.canonical_path == canonical_path) + .map(|(disk_id, _)| disk_id) .ok_or_else(|| { anyhow!("disk {path:?} ({canonical_path:?}) is not part of the zettacache") }) @@ -669,7 +701,7 @@ impl BlockAccess { .disks .read() .unwrap() - .iter() + .values() .map(|d| DeviceEntry { name: d.path.clone(), size: *d.size.lock().unwrap(), @@ -683,7 +715,7 @@ impl BlockAccess { self.disks .read() .unwrap() - .iter() + .values() .map(|d| DeviceStatus { path: d.path.clone(), canonical_path: d.canonical_path.clone(), @@ -693,7 +725,12 @@ impl BlockAccess { } pub fn disk_size(&self, disk: DiskId) -> u64 { - *self.disks.read().unwrap()[disk.index()] + *self + .disks + .read() + .unwrap() + .get(disk) + .unwrap() .size .lock() .unwrap() @@ -707,7 +744,7 @@ impl BlockAccess { } pub fn disk_path(&self, disk: DiskId) -> PathBuf { - self.disks.read().unwrap()[disk.index()].path.clone() + self.disks.read().unwrap().get(disk).unwrap().path.clone() } pub fn total_capacity(&self) -> u64 { @@ -721,7 +758,7 @@ impl BlockAccess { self.verify_aligned(extent.size); let disk = extent.location.disk(); - let fut = self.disks.read().unwrap()[disk.index()].read( + let fut = self.disks.read().unwrap().get(disk).unwrap().read( extent.location.offset(), usize::from64(extent.size), io_type, @@ -744,7 +781,13 @@ impl BlockAccess { self.verify_aligned(location.offset()); self.verify_aligned(bytes.len()); let disk = location.disk(); - let fut = self.disks.read().unwrap()[disk.index()].write(location.offset(), bytes, io_type); + let fut = + self.disks + .read() + .unwrap() + .get(disk) + .unwrap() + .write(location.offset(), bytes, io_type); // drop disks RwLock before waiting for io fut.await; } @@ -916,7 +959,7 @@ impl BlockAccess { .disks .read() .unwrap() - .iter() + .values() .map(|disk| disk.io_stats) .collect(), } diff --git a/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs b/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs index d77874e5fff6..2df0048ba995 100644 --- a/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs +++ b/cmd/zfs_object_agent/zettacache/src/block_allocator/mod.rs @@ -27,6 +27,7 @@ use util::with_alloctag; use util::writeln_stdout; use util::BitRange; use util::RangeTree; +use util::VecMap; use self::slabs::Slabs; use crate::base_types::*; @@ -110,12 +111,13 @@ trait SlabTrait { fn import_free(&mut self, extent: Extent); fn allocate(&mut self, size: u32) -> Option; fn free(&mut self, extent: Extent); - fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap); + fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) -> (u64, u64); fn condense_to_spacemap(&self, spacemap: &mut SpaceMap); fn mark_slab_info(&self, id: SlabId, spacemap: &mut SpaceMap); fn max_size(&self) -> u32; fn capacity_bytes(&self) -> u64; fn free_space(&self) -> u64; + fn freeing_space(&self) -> u64; fn allocated_space(&self) -> u64; fn num_segments(&self) -> u64; fn allocated_extents(&self) -> Vec; @@ -249,12 +251,14 @@ impl SlabTrait for BitmapSlab { self.freeing.insert(slot); } - fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) { + fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) -> (u64, u64) { // It could 
happen that a segment was allocated and then freed within the same checkpoint // period at which point it would be part of both `allocating` and `freeing` sets. For // this reason we always record `allocating` first, before `freeing`, on our spacemaps. // Note that segments cannot be freed and then allocated within the same checkpoint // period. + + let allocated_bytes = u64::from(self.allocating.len()) * u64::from(self.slot_size); for (slot, run) in self.allocating.iter_ranges() { spacemap.alloc(Extent { location: self.slot_to_location(slot), @@ -264,6 +268,7 @@ impl SlabTrait for BitmapSlab { } self.allocating.clear(); // Space freed during this checkpoint is now available for reallocation. + let freed_bytes = u64::from(self.freeing.len()) * u64::from(self.slot_size); for (slot, run) in self.freeing.iter_ranges() { spacemap.free(Extent { location: self.slot_to_location(slot), @@ -274,6 +279,8 @@ impl SlabTrait for BitmapSlab { }); } self.freeing.clear(); + + (allocated_bytes, freed_bytes) } fn condense_to_spacemap(&self, spacemap: &mut SpaceMap) { @@ -316,8 +323,13 @@ impl SlabTrait for BitmapSlab { u64::from(self.allocatable.len()) * u64::from(self.slot_size) } + fn freeing_space(&self) -> u64 { + u64::from(self.freeing.len()) * u64::from(self.slot_size) + } + fn allocated_space(&self) -> u64 { - u64::from(self.total_slots - self.allocatable.len()) * u64::from(self.slot_size) + u64::from(self.total_slots - self.allocatable.len() - self.freeing.len()) + * u64::from(self.slot_size) } fn mark_slab_info(&self, id: SlabId, spacemap: &mut SpaceMap) { @@ -497,7 +509,7 @@ impl SlabTrait for ExtentSlab { self.freeing.add(offset, size); } - fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) { + fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) -> (u64, u64) { self.freeing.verify_space(); self.allocating.verify_space(); self.allocatable.verify_space(); @@ -509,6 +521,7 @@ impl SlabTrait for ExtentSlab { // this reason we always record `allocating` first, before `freeing`, on our spacemaps. // Note that segments cannot be freed and then allocated within the same checkpoint // period. + let allocated_bytes = self.allocating.space(); for (&start, &size) in self.allocating.iter() { self.allocatable.verify_absent(start, size); spacemap.alloc(Extent::new(disk, start, size)); } self.allocating.clear(); // Space freed during this checkpoint is now available for reallocation.
+ let freed_bytes = self.freeing.space(); for (&start, &size) in self.freeing.iter() { self.allocating.verify_absent(start, size); spacemap.free(Extent::new(disk, start, size)); with_alloctag(Self::ALLOCATABLE_TAG, || self.allocatable.add(start, size)); } self.freeing.clear(); + + (allocated_bytes, freed_bytes) } fn condense_to_spacemap(&self, spacemap: &mut SpaceMap) { @@ -556,8 +572,12 @@ impl SlabTrait for ExtentSlab { self.allocatable.space() } + fn freeing_space(&self) -> u64 { + self.freeing.space() + } + fn allocated_space(&self) -> u64 { - self.total_space - self.free_space() + self.total_space - self.free_space() - self.freeing_space() } fn mark_slab_info(&self, id: SlabId, spacemap: &mut SpaceMap) { @@ -658,8 +678,8 @@ impl SlabTrait for EvacuatingSlab { ); } - fn flush_to_spacemap(&mut self, _: &mut SpaceMap) { - panic!("attempting to flush evacuating slab",); + fn flush_to_spacemap(&mut self, _: &mut SpaceMap) -> (u64, u64) { + panic!("attempting to flush evacuating slab"); } fn condense_to_spacemap(&self, _: &mut SpaceMap) { @@ -678,6 +698,10 @@ impl SlabTrait for EvacuatingSlab { 0 } + fn freeing_space(&self) -> u64 { + 0 + } + fn allocated_space(&self) -> u64 { 0 } @@ -766,7 +790,8 @@ impl Slab { self.inner.as_dyn().mark_slab_info(self.id, spacemap); } - fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) { + // Returns (bytes_allocated, bytes_freed) + fn flush_to_spacemap(&mut self, spacemap: &mut SpaceMap) -> (u64, u64) { self.is_dirty = false; self.is_allocd = false; self.inner.as_mut_dyn().flush_to_spacemap(spacemap) @@ -787,6 +812,10 @@ impl Slab { self.inner.as_dyn().free_space() } + fn freeing_space(&self) -> u64 { + self.inner.as_dyn().freeing_space() + } + fn allocated_space(&self) -> u64 { self.inner.as_dyn().allocated_space() } @@ -991,6 +1020,7 @@ impl BlockAllocatorBuilder { let phys = self.phys; let slabs = self.slabs; let block_access = self.block_access; + let removing_disks = slab_allocator.removing_disks().collect::>(); let next_slab_to_condense = phys.next_slab_to_condense; @@ -1001,22 +1031,33 @@ impl BlockAllocatorBuilder { phys.spacemap_next, ); - let mut available_space = 0u64; let mut evacuating_slabs = Vec::new(); + let mut noalloc_state = VecMap::<_, HashSet<_>>::default(); + let mut per_disk_stats = block_access + .disks() + .map(|disk| (disk, DiskStats::default())) + .collect::>(); let mut slabs_by_bucket: BTreeMap> = BTreeMap::new(); for slab in slabs.iter() { - available_space += slab.free_space(); + let slab_disk = slab.location().disk(); - match &slab.inner { - SlabEnum::BitmapBased(_) | SlabEnum::ExtentBased(_) => { - slabs_by_bucket - .entry(SlabBucketSize(slab.max_size())) - .or_default() - .push(slab.to_slab_bucket_entry()); - } - SlabEnum::Evacuating(_) => { - evacuating_slabs.push(slab.id); - } + let disk_stats = per_disk_stats.get_mut(slab_disk).unwrap(); + disk_stats.free_bytes += slab.free_space(); + disk_stats.alloc_bytes += slab.allocated_space(); + + if matches!(slab.inner, SlabEnum::Evacuating(_)) { + evacuating_slabs.push(slab.id); + } + + if let Some(&disk) = removing_disks.get(&slab_disk) { + noalloc_state.get_mut_or_default(disk).insert(slab.id); + disk_stats.noalloc_bytes += slab.free_space(); + } else { + assert_eq!(disk_stats.noalloc_bytes, 0); + slabs_by_bucket + .entry(SlabBucketSize(slab.max_size())) + .or_default() + .push(slab.to_slab_bucket_entry()); } } @@ -1031,15 +1072,26 @@ impl BlockAllocatorBuilder { dirty_slabs: Default::default(), slab_allocator, evacuating_slabs, + noalloc_state, slab_buckets, - 
available_space, - freeing_space: 0, + per_disk_stats, checkpoint_allocated_bytes: 0, block_access, } } } +#[derive(Default)] +struct DiskStats { + // The amount of allocated space in the slabs held by the block_allocator + alloc_bytes: u64, + // The amount of free space in the slabs held by the block_allocator (includes noalloc_bytes) + free_bytes: u64, + // The number of bytes that are free in the noalloc_slabs but are not available for + // allocations + noalloc_bytes: u64, +} + pub struct BlockAllocator { slab_size: u32, @@ -1088,10 +1140,34 @@ pub struct BlockAllocator { slab_allocator: Arc, evacuating_slabs: Vec, + // This field contains all the data related to the removal logic. Each disk ID that is used + // as a key in this map is a device marked for removal. The value is the set of slabs that + // are part of that disk (which are managed by the BlockAllocator). These slabs are not in + // the allocatable buckets, so they are not allocatable. The removal logic uses this field + // as follows: + // + // [1] When we mark a device for removal in the block allocator (see `mark_noalloc_disk()`) + // we move any slabs that belong to that disk away from the allocation buckets, into a new + // disk entry in `noalloc_state` as a set (effectively marking the slabs as unavailable for + // future allocations). + // [2] Later, when there is enough free space in the cache for the allocated_space from those + // noalloc slabs, the merge code will submit a remap to move that allocated space to slabs of + // non-removing disks. + // [3] When the remap code empties all the noalloc slabs of the removing disk and gives them + // back to the slab_allocator, then we no longer need that disk ID key and we remove it from + // our map. + // + // For details about cancellation see `unmark_noalloc_disk()`. + // + // This field is essentially a marker. Its keys are disk IDs of devices that are being + // removed AND have allocated space in the block allocator. So if a device marked for removal + // has no slabs currently in-use from the block allocator, it will not be in the map.
+ noalloc_state: VecMap>, + slab_buckets: SlabAllocationBuckets, - available_space: u64, - freeing_space: u64, + // Space statistics per disk + per_disk_stats: VecMap, // used only by incoming rate heuristic for condensing checkpoint_allocated_bytes: u64, @@ -1132,15 +1208,17 @@ impl BlockAllocator { bucket.insert(new_slab.to_slab_bucket_entry()); let extent = new_slab.allocate(request_size).unwrap(); + + self.stats_add_new_slab(&new_slab); + self.stats_track_allocation(extent); + let old = self.slabs.insert(new_id, new_slab); assert!(old.is_none()); - self.available_space += self.slab_allocator.slab_size(); + self.mark_slab_info(new_id); self.dirty_slab_id(new_id); trace!("{new_id:?} added to {bucket_size:?}"); - self.available_space -= extent.size; - self.checkpoint_allocated_bytes += extent.size; Some(extent) } @@ -1191,8 +1269,7 @@ impl BlockAllocator { extent ); self.dirty_slab_id(id); - self.available_space -= extent.size; - self.checkpoint_allocated_bytes += extent.size; + self.stats_track_allocation(extent); return Some(extent); } None => { @@ -1234,10 +1311,127 @@ impl BlockAllocator { let slab_id = self.slab_allocator.extent_to_slab_id(extent); self.slabs.get_mut(slab_id).free(extent); - self.freeing_space += extent.size; + + self.stats_track_free(extent); self.dirty_slab_id(slab_id); } + pub fn add_disk(&mut self, disk: DiskId) { + let inserted = self.per_disk_stats.insert(disk, Default::default()); + assert!(inserted.is_none()); + } + + pub fn remove_disk(&mut self, disk: DiskId) { + self.stats_verify_disk_is_empty(disk); + let removed = self.per_disk_stats.remove(disk); + assert!(removed.is_some()); + assert!(self.noalloc_state.get(disk).is_none()); + } + + /// Remove all slabs that belong to `disk` from our sorted slab buckets, + /// effectively forbidding any future allocations from them. + pub fn mark_noalloc_disk(&mut self, disk: DiskId) { + let disk_stats = self.per_disk_stats.get_mut(disk).unwrap(); + for bucket in self.slab_buckets.0.values_mut() { + bucket.by_freeness.retain(|entry| { + let slab = self.slabs.get(entry.slab_id); + if slab.location().disk() != disk { + true + } else { + self.noalloc_state + .get_mut_or_default(disk) + .insert(entry.slab_id); + + disk_stats.noalloc_bytes += slab.free_space(); + false + } + }); + + // Even if we removed all the disk's slabs from the allocation bucket, the bucket can + // still point to one of those slabs if we allocated from it recently. + if let Some(bucket_current) = bucket.last_allocated { + if let Some(noalloc_slabs) = self.noalloc_state.get(disk) { + if noalloc_slabs.contains(&bucket_current.slab_id) { + bucket.advance(); + } + } + } + } + + // Make sure we also track any slabs from that disk that have already been submitted for + // evacuation. + for &slab_id in self.evacuating_slabs.iter() { + if self.slabs.get(slab_id).location().disk() == disk { + self.noalloc_state.get_mut_or_default(disk).insert(slab_id); + } + } + + // If the supplied `disk` had any free data, the space of that data should equal the + // amount of space marked as non-allocatable. + assert_eq!(disk_stats.free_bytes, disk_stats.noalloc_bytes); + } + + /// Place any slabs marked as non-allocatable back to the sorted slab + /// buckets, allowing us to allocate from them again. + pub fn unmark_noalloc_disk(&mut self, disk: DiskId) { + // Remapping the disk's data may have already finished, at which point there is nothing + // for us to do here.
+ if !self.disk_is_pending_remap(disk) { + self.stats_verify_disk_is_empty(disk); + return; + } + + let disk_stats = self.per_disk_stats.get_mut(disk).unwrap(); + self.noalloc_state + .get_mut(disk) + .unwrap() + .retain(|&slab_id| { + let slab = self.slabs.get(slab_id); + disk_stats.noalloc_bytes -= slab.free_space(); + + // Note: Any noalloc slabs that are marked as evacuating (e.g. they are currently + // being remapped) are not part of the allocation buckets and their free space + // is not accounted in the block allocator. Given that plus the fact that they + // will be released back to the slab allocator once remap is done, we want to + // skip them. + if matches!(slab.inner, SlabEnum::Evacuating(_)) { + return true; + } + + let bucket_size = self + .slab_buckets + .get_bucket_size_for_allocation_size(slab.max_size()); + self.slab_buckets + .get_bucket_for_bucket_size(bucket_size) + .insert(slab.to_slab_bucket_entry()); + + false + }); + // At this point everything is either back to allocatable or is currently evacuating. + // Either way the noalloc_bytes should be zero. + assert_eq!(disk_stats.noalloc_bytes, 0); + + // If the remap for this disk was never submitted, its set of noalloc slabs should be + // empty, in which case we can finish cleaning up its noalloc_state entry here. + if self.noalloc_state.get(disk).unwrap().is_empty() { + self.noalloc_state.remove(disk); + } + } + + fn noalloc_slabs_to_remap(&self, disk: DiskId) -> Vec { + match self.noalloc_state.get(disk) { + Some(slabs) => slabs.iter().copied().collect(), + None => Vec::new(), + } + } + + /// Returns true if the disk is marked for removal and it is waiting for a remap to empty its + /// slabs from the block allocator. Otherwise, the device is either non-removing or it is + /// removing but has no slabs in-use from the block allocator. + pub fn disk_is_pending_remap(&self, disk: DiskId) -> bool { + self.noalloc_state.contains_key(&disk) + } + // This function is the entry-point to starting the cache rebalancing process. This will select // which slab(s) need to be rebalanced, mark those slabs as undergoing evacuation, and // allocate new disk locations for the data currently stored on those slabs. The value @@ -1255,36 +1449,55 @@ impl BlockAllocator { // the rebalance process as finished. This allows the allocator to transition the slabs that // were undergoing evacuation to free slabs, such that the slabs can later be used for // allocation. - pub fn rebalance_init(&mut self) -> Option>> { + pub fn rebalance_init( + &mut self, + removing_disk: Option, + ) -> Option>> { // For now, ensure rebalance_fini() is called before this function can be called a second // time. assert!(self.evacuating_slabs.is_empty()); let begin = Instant::now(); - let slabs = self.slabs_to_rebalance(); + let slabs = match removing_disk { + Some(disk) => self.noalloc_slabs_to_remap(disk), + None => self.slabs_to_rebalance(), + }; if slabs.is_empty() { info!("cache rebalance is not needed"); return None; } - info!("initializing rebalance of {} slabs", slabs.len()); + info!( + "initializing rebalance of {} slabs {}", + slabs.len(), + if removing_disk.is_some() { + "(triggered by removal)" + } else { + "" + } + ); - // In order to ensure the allocations performed in rebalance_slab() (called below) are - // not satisfied by any of the slabs we're going to rebalance, we need to remove these - // slabs from the list of slabs available for allocation.
Further, we must remove all - slabs before we do any allocations, to ensure we don't move an extent multiple times; - otherwise, data corruption could occur, as the data contained in the extents, can be - moved by the caller in any order. - // - // For example, if we mark an extent as moving from disk location A to B, and then again - // from B to C, the final data contained at disk location C could be incorrect, if the - // caller does the move of B to C before the move of A to B. Since we do not enforce the - // order in which the caller will do the copies, we need to ensure this cannot happen, by - // never moving an extent more than once. - for &id in slabs.iter() { - trace!("prepping slab '{:?}' for rebalancing", id); - self.slab_buckets.remove_slab(self.slabs.get(id)); + if removing_disk.is_none() { + // In order to ensure the allocations performed in rebalance_slab() (called below) are + // not satisfied by any of the slabs we're going to rebalance, we need to remove these + // slabs from the list of slabs available for allocation. Further, we must remove all + // slabs before we do any allocations, to ensure we don't move an extent multiple times; + // otherwise, data corruption could occur, as the data contained in the extents can be + // moved by the caller in any order. + // + // For example, if we mark an extent as moving from disk location A to B, and then again + // from B to C, the final data contained at disk location C could be incorrect, if the + // caller does the move of B to C before the move of A to B. Since we do not enforce the + // order in which the caller will do the copies, we need to ensure this cannot happen, + // by never moving an extent more than once. + // + // Note that for removal we've already removed the removing disk's slabs from allocation + // buckets, so there should be nothing to remove. + for &id in slabs.iter() { + trace!("prepping slab '{:?}' for rebalancing", id); + self.slab_buckets.remove_slab(self.slabs.get(id)); + } } let mut merged = 0; @@ -1307,6 +1520,10 @@ impl BlockAllocator { merged, ); + if let Some(disk) = removing_disk { + self.stats_verify_disk_is_empty(disk); + } + assert!(!self.evacuating_slabs.is_empty()); Some(map) } @@ -1344,7 +1561,10 @@ impl BlockAllocator { .slabs .iter() .filter(|&slab| match slab.inner { - SlabEnum::BitmapBased(_) | SlabEnum::ExtentBased(_) => true, + SlabEnum::BitmapBased(_) | SlabEnum::ExtentBased(_) => { + // Skip over slabs from disks being removed + !self.noalloc_state.contains_key(&slab.location().disk()) + } SlabEnum::Evacuating(_) => false, }) .map(|slab| slab.to_slab_bucket_entry()) @@ -1474,8 +1694,12 @@ impl BlockAllocator { // Since evacuating slabs don't have any allocatable space, we must account for that // here; we must do this before we transition to an evacuating slab (evacuating slabs // have no free space). - let slab = self.slabs.get(id); - self.available_space -= slab.free_space(); + self.stats_track_slab_evacuation(id); + + // XXX: Currently it is not possible to call `BlockAllocator.free()` from the + // checkpoint_task in-between the point that we flush a checkpoint and the point that we + // start a rebalance_init(). That is expected to change once DLPX-80824 lands. + assert_eq!(self.slabs.get(id).freeing_space(), 0); trace!("marking slab '{:?}' as evacuating", id); @@ -1498,6 +1722,13 @@ impl BlockAllocator { // evacuating slabs cannot allocate() or free(); thus, they should never be dirty.
assert!(!self.slabs.get(id).is_dirty); + // if slab is part of removing disk, remove it from the noalloc slabs. + let slab_disk = self.slabs.get(id).location().disk(); + if let Some(noalloc_set) = self.noalloc_state.get_mut(slab_disk) { + let removed = noalloc_set.remove(&id); + assert!(removed); + } + self.slab_allocator.free(id); self.slabs.remove(id); @@ -1510,6 +1741,11 @@ impl BlockAllocator { }; target_spacemap.mark_slab_info(id, SlabPhysType::Free); } + + // This could be the end of the remap of a removing disk. If that's the case the set of + // (noalloc) slabs for that disk should be empty, which means we can remove its entry + // from the noalloc_state. + self.noalloc_state.retain(|_, slabs| !slabs.is_empty()); } /// Return number of slabs to condense, based on the "spacemap badness" ratio. Note that the @@ -1606,7 +1842,9 @@ impl BlockAllocator { let begin = Instant::now(); let old_pending = self.spacemap.pending_len() + self.spacemap_next.pending_len(); let ndirty_slabs = self.dirty_slabs.len(); - let mut allocd_slabs: u64 = 0; + let mut allocd_slabs = 0u64; + let mut total_allocated = 0; + let mut total_freed = 0; for slab_id in mem::take(&mut self.dirty_slabs) { if !self.slabs.exists(slab_id) { // This can happen if the slab was evacuated. @@ -1632,16 +1870,20 @@ impl BlockAllocator { } else { &mut self.spacemap_next }; - slab.flush_to_spacemap(target_spacemap); + let (allocated, freed) = slab.flush_to_spacemap(target_spacemap); + + self.stats_flush_freeing(slab_id, freed); + total_allocated += allocated; + total_freed += freed; } debug!( - "flushed {} slabs ({} allocd), {} entries in {}ms", - ndirty_slabs, - allocd_slabs, + "flushed {} entries ({} allocd, {} freed) to {ndirty_slabs} ({allocd_slabs} allocated from) in {}ms", nice_number_count( (self.spacemap.pending_len() + self.spacemap_next.pending_len() - old_pending) as f64 ), + nice_p2size(total_allocated), + nice_p2size(total_freed), begin.elapsed().as_millis() ); } @@ -1691,9 +1933,6 @@ impl BlockAllocator { self.flush_dirty(); let (spacemap, spacemap_next) = self.flush_impl().await; self.resort_buckets(); - - self.available_space += self.freeing_space; - self.freeing_space = 0; self.checkpoint_allocated_bytes = 0; if completed_merge { @@ -1737,14 +1976,110 @@ impl BlockAllocator { phys } - /// Return the amount of space in unallocated blocks. This does not include space in empty - /// slabs. - pub fn available(&self) -> u64 { - self.available_space + /// Return the amount of allocated space in slabs that are marked for removal. + pub fn removing_bytes(&self) -> u64 { + self.noalloc_state + .keys() + .map(|disk| self.per_disk_stats.get(disk).unwrap().alloc_bytes) + .sum() + } + + pub fn disk_allocated_bytes(&self, disk: DiskId) -> u64 { + self.per_disk_stats.get(disk).unwrap().alloc_bytes + } + + /// Return the amount of space in unallocated blocks. This does not include + /// space in empty slabs. + pub fn free_bytes(&self) -> u64 { + self.per_disk_stats + .values() + .map(|stats| stats.free_bytes) + .sum::() + } + + /// Return the amount of space in unallocated blocks that's available for + /// allocations. This does not include space in empty slabs nor space in + /// slabs that are part of a device being removed. 
+ pub fn allocatable_bytes(&self) -> u64 { + let free_bytes = self + .per_disk_stats + .values() + .map(|stats| stats.free_bytes) + .sum::(); + let noalloc_bytes = self + .per_disk_stats + .values() + .map(|stats| stats.noalloc_bytes) + .sum::(); + free_bytes - noalloc_bytes + } + + /// Incorporate this newly-created slab's free space into the BlockAllocator's `per_disk_stats`. + fn stats_add_new_slab(&mut self, slab: &Slab) { + self.per_disk_stats + .get_mut(slab.location().disk()) + .unwrap() + .free_bytes += slab.capacity_bytes(); + } + + /// Track the newly-allocated `extent` in the BlockAllocator's `per_disk_stats`. + fn stats_track_allocation(&mut self, extent: Extent) { + let disk_stats = self.per_disk_stats.get_mut(extent.location.disk()).unwrap(); + disk_stats.free_bytes -= extent.size; + disk_stats.alloc_bytes += extent.size; + assert_eq!( + disk_stats.noalloc_bytes, 0, + "can't allocate from non-allocatable disk" + ); + self.checkpoint_allocated_bytes += extent.size; } - pub fn freeing(&self) -> u64 { - self.freeing_space + /// Track the `extent` that was just freed in the BlockAllocator's `per_disk_stats`. + fn stats_track_free(&mut self, extent: Extent) { + let disk = extent.location.disk(); + + let disk_stats = self.per_disk_stats.get_mut(disk).unwrap(); + disk_stats.alloc_bytes -= extent.size; + if !self.noalloc_state.contains_key(&disk) { + assert_eq!(disk_stats.noalloc_bytes, 0); + } + } + + /// Update the `free_bytes` of the disk where `slab_id` belongs to with the amount that was + /// just `freed` when flushing that slab. If that slab is part of a removing disk then + /// `noalloc_bytes` from that disk is also incremented by `freed`. + fn stats_flush_freeing(&mut self, slab_id: SlabId, freed: u64) { + let disk = self.slabs.get(slab_id).location().disk(); + + let disk_stats = self.per_disk_stats.get_mut(disk).unwrap(); + disk_stats.free_bytes += freed; + if self.noalloc_state.contains_key(&disk) { + disk_stats.noalloc_bytes += freed; + } + } + + /// Remove any space accounted for the supplied slab from `per_disk_stats` as we are + /// preparing it for evacuation. + fn stats_track_slab_evacuation(&mut self, slab_id: SlabId) { + let slab = self.slabs.get(slab_id); + let slab_disk = slab.location().disk(); + + let disk_stats = self.per_disk_stats.get_mut(slab_disk).unwrap(); + disk_stats.free_bytes -= slab.free_space(); + disk_stats.alloc_bytes -= slab.allocated_space(); + if self.noalloc_state.contains_key(&slab_disk) { + disk_stats.noalloc_bytes -= slab.free_space(); + } else { + assert_eq!(disk_stats.noalloc_bytes, 0); + } + } + + /// Verify that the supplied `disk` doesn't contribute any space in `per_disk_stats`. + fn stats_verify_disk_is_empty(&self, disk: DiskId) { + let disk_stats = self.per_disk_stats.get(disk).unwrap(); + assert_eq!(disk_stats.alloc_bytes, 0); + assert_eq!(disk_stats.free_bytes, 0); + assert_eq!(disk_stats.noalloc_bytes, 0); } fn mark_slab_info(&mut self, id: SlabId) { @@ -1756,6 +2091,13 @@ impl BlockAllocator { }; slab.mark_slab_info(target_spacemap); } + + // Returns the number of slabs moved. 
+ pub async fn transfer_metadata_for_removal(&mut self, disk: DiskId) -> u64 { + let mut moved = self.spacemap.transfer_data_for_removal(disk).await; + moved += self.spacemap_next.transfer_data_for_removal(disk).await; + moved + } } #[derive(Debug, Serialize, Deserialize, Clone, Copy)] diff --git a/cmd/zfs_object_agent/zettacache/src/block_based_log/mod.rs b/cmd/zfs_object_agent/zettacache/src/block_based_log/mod.rs index 25dd774e666d..b06f3789a565 100644 --- a/cmd/zfs_object_agent/zettacache/src/block_based_log/mod.rs +++ b/cmd/zfs_object_agent/zettacache/src/block_based_log/mod.rs @@ -126,6 +126,42 @@ impl BlockBasedLogPhys { } } + /// Find slabs used in this log that are located in the `removing_disk` and move their data + /// to newly allocated slabs from non-removing disks. Returns the number of slabs moved. + pub async fn transfer_data_for_removal( + &mut self, + removing_disk: DiskId, + slab_allocator: &SlabAllocator, + block_access: &BlockAccess, + ) -> u64 { + let mut moved = 0u64; + for slab in &mut self.slabs { + let slab_extent = slab_allocator.slab_id_to_extent(*slab); + if slab_extent.location.disk() != removing_disk { + continue; + } + + let new_slab = slab_allocator.allocate_reserved(); + let new_slab_extent = slab_allocator.slab_id_to_extent(new_slab); + assert!(new_slab_extent.location.disk() != removing_disk); + + let bytes = block_access + .read_raw(slab_extent, DiskIoType::MaintenanceRead) + .await; + block_access + .write_raw( + new_slab_extent.location, + bytes, + DiskIoType::MaintenanceWrite, + ) + .await; + slab_allocator.free(*slab); + *slab = new_slab; + moved += 1; + } + moved + } + fn written_extents<'a>( &'a self, slab_access: &'a SlabAccess, @@ -327,6 +363,13 @@ impl BlockBasedLog { self.phys.trimmed = offset; } + // Returns the number of slabs moved. + pub async fn transfer_data_for_removal(&mut self, removing_disk: DiskId) -> u64 { + self.phys + .transfer_data_for_removal(removing_disk, &self.slab_allocator, &self.block_access) + .await + } + async fn flush_impl(&mut self, mut new_chunk_fn: F) where F: FnMut(ChunkId, LogOffset, T), diff --git a/cmd/zfs_object_agent/zettacache/src/checkpoint.rs b/cmd/zfs_object_agent/zettacache/src/checkpoint.rs index e51714d2618f..a3d1bd94c583 100644 --- a/cmd/zfs_object_agent/zettacache/src/checkpoint.rs +++ b/cmd/zfs_object_agent/zettacache/src/checkpoint.rs @@ -25,6 +25,7 @@ use crate::slab_allocator::SlabAllocator; use crate::slab_allocator::SlabAllocatorBuilder; use crate::slab_allocator::SlabAllocatorPhys; use crate::zettacache::merge::MergeProgressPhys; +use crate::zettacache::removal::DeviceRemovalPhys; use crate::zettacache::OperationLogEntry; #[derive(Serialize, Deserialize, Default, Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] @@ -46,6 +47,8 @@ pub struct CheckpointPhys { pub operation_log: BlockBasedLogPhys, pub size_histogram: SizeHistogramPhys, pub merge_progress: Option, + #[serde(default)] + pub device_removal: DeviceRemovalPhys, } impl CheckpointPhys { diff --git a/cmd/zfs_object_agent/zettacache/src/features.rs b/cmd/zfs_object_agent/zettacache/src/features.rs index 362140708411..8b7e7670a518 100644 --- a/cmd/zfs_object_agent/zettacache/src/features.rs +++ b/cmd/zfs_object_agent/zettacache/src/features.rs @@ -29,6 +29,7 @@ lazy_static! { SLAB_SIZE_32MB.clone(), TRIMMED_INDEX.clone(), DISK_GUIDS.clone(), + CACHE_DEVICE_REMOVAL.clone(), ] .map(|feature: Feature| (feature.name, feature.info)) .into_iter() @@ -53,6 +54,10 @@ lazy_static! 
{ name: FeatureName("com.delphix:disk_guids".to_string()), info: FeatureType::Upgradeable }; + pub static ref CACHE_DEVICE_REMOVAL: Feature = Feature { + name: FeatureName("com.delphix:cache_device_removal".to_string()), + info: FeatureType::Upgradeable + }; } #[derive(Debug)] diff --git a/cmd/zfs_object_agent/zettacache/src/open.rs b/cmd/zfs_object_agent/zettacache/src/open.rs index c6ab7cb87237..d738d28960d7 100644 --- a/cmd/zfs_object_agent/zettacache/src/open.rs +++ b/cmd/zfs_object_agent/zettacache/src/open.rs @@ -30,9 +30,15 @@ pub enum CacheOpenMode { } impl CacheOpenMode { - pub async fn device_paths(self) -> Result> { + pub async fn device_paths(self) -> Result> { Ok(match self { - CacheOpenMode::DeviceList(paths) => paths, + CacheOpenMode::DeviceList(paths) => { + let mut disk_map = BTreeMap::new(); + for (id, path) in paths.into_iter().enumerate() { + disk_map.insert(DiskId::new(id), path); + } + disk_map + } CacheOpenMode::DiscoveryDirectory(dir, target_guid) => { discover_devices(&dir, target_guid).await? } @@ -75,7 +81,10 @@ impl DiscoveredDevice { } } -async fn discover_devices(dir_path: &Path, target_guid: Option) -> Result> { +async fn discover_devices( + dir_path: &Path, + target_guid: Option, +) -> Result> { let mut caches = HashMap::<_, BTreeMap<_, _>>::new(); let mut discovery = FuturesUnordered::new(); @@ -143,19 +152,21 @@ async fn discover_devices(dir_path: &Path, target_guid: Option) -> Re caches.retain(|cache_guid, _| *cache_guid == guid); } - match caches.values().next() { - Some(cache) => { - if caches.len() > 1 { - Err(anyhow!( - "multiple valid caches found in {dir_path:?}: {:?}", - caches.keys().collect::>(), - )) - } else { - Ok(cache - .iter() - .map(|(_, dev)| dev.device_path.clone()) - .collect()) - } + if caches.len() > 1 { + return Err(anyhow!( + "multiple valid caches found in {dir_path:?}: {:?}", + caches.keys().collect::>(), + )); + } + + let cache_info = caches.drain().next(); + match cache_info { + Some((cache_guid, cache)) => { + info!("discovery: importing cache '{cache_guid:?}'"); + Ok(cache + .into_iter() + .map(|((disk_id, _), dev)| (disk_id, dev.device_path)) + .collect()) } None => Err(anyhow!("no valid caches found in {dir_path:?}")), } @@ -165,9 +176,10 @@ fn is_valid_cache( cache_guid: CacheGuid, disks: &mut BTreeMap<(DiskId, Option), DiscoveredDevice>, ) -> bool { + // We always pick the latest primary block (determined by checkpoint_id) let primary = disks .values() - .find(|&disk| disk.superblock.primary.is_some()); + .max_by_key(|d| d.superblock.primary.as_ref().map(|p| p.checkpoint_id)); match primary { Some(disk) => { diff --git a/cmd/zfs_object_agent/zettacache/src/slab_allocator.rs b/cmd/zfs_object_agent/zettacache/src/slab_allocator.rs index 0fadd7663cdf..b895720cf95c 100644 --- a/cmd/zfs_object_agent/zettacache/src/slab_allocator.rs +++ b/cmd/zfs_object_agent/zettacache/src/slab_allocator.rs @@ -1,4 +1,4 @@ -//! The disk is divided into equal-size slabs (by default, 16MB each). The SlabAllocator knows +//! The disk is divided into equal-size slabs (by default, 32MB each). The SlabAllocator knows //! which slabs are allocated and which are free. Other subsystems (e.g. BlockBasedLogs, the //! BlockAllocator, and Checkpoints) can call into the SlabAllocator to allocate slabs for their //! own use. @@ -9,8 +9,11 @@ //! Allocator which slabs are allocated, by calling SlabAllocatorBuilder::claim(). 
use std::cmp::max; +use std::collections::BTreeMap; use std::collections::HashSet; +use std::mem; use std::ops::Add; +use std::ops::AddAssign; use std::ops::Bound::*; use std::ops::Sub; use std::sync::Mutex; @@ -28,7 +31,9 @@ use util::measure; use util::tunable; use util::tunable::Percent; use util::From64; +use util::VecMap; +use crate::base_types::DiskId; use crate::base_types::Extent; tunable! { @@ -51,10 +56,25 @@ tunable! { static ref TARGET_AVAILABLE_SLABS_PCT: Percent = Percent::new(2.0); } +// Note: Before device removal we used `Extent`s to indicate a present chunk of +// capacity. The `untagged` and `flatten` serde annotations below are used to +// make new bits backwards compatible with the old on-disk format. +#[derive(Debug, Serialize, Deserialize, Copy, Clone, Eq, Ord, PartialEq, PartialOrd)] +#[serde(untagged)] +enum SlabExtentPhys { + Present { + #[serde(flatten)] + extent: Extent, + }, + Removed { + extent: Extent, + }, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct SlabAllocatorPhys { slab_size: u64, - capacity: Vec, + capacity: Vec, } #[derive(Debug)] @@ -65,8 +85,10 @@ pub struct SlabAccess { #[derive(Debug)] struct SlabAccessInner { - capacity: BiBTreeMap, + capacity: BTreeMap, + active_capacity: BiBTreeMap, num_slabs: u64, + next_slab_id: SlabId, } #[derive(Debug)] @@ -85,9 +107,29 @@ pub struct SlabAllocator { struct Inner { allocatable: Vec, freeing: Vec, + + // If there is an entry in the map below, then the entire disk has been marked as noalloc, + // and none of its slabs will appear in `allocatable`. When the disk is finally removed, its + // key is removed from this map. Note that the slab IDs that belong to a removed disk are + // never re-used for the lifetime of the cache. + noalloc_state: VecMap>, + // Slabs that are released/freed back to the SlabAllocator and belong to a removing disk, end + // up here instead of `freeing`, and eventually make it back to the noalloc_state for that + // disk. 
+ noallocing: Vec, + reserved_slabs: u64, } +impl Inner { + fn num_removing_slabs(&self, access: &SlabAccess) -> u64 { + self.noalloc_state + .keys() + .map(|disk| access.disk_num_slabs(disk)) + .sum::() + } +} + impl SlabAllocatorPhys { pub fn new>(capacity: I) -> Self { let mut this = Self { @@ -105,13 +147,20 @@ impl SlabAllocatorPhys { { for extent in capacity { // capacity is aligned to be a multiple of slabsize - self.capacity - .push(extent.range(0, extent.size - extent.size % self.slab_size)); + self.capacity.push(SlabExtentPhys::Present { + extent: extent.range(0, extent.size - extent.size % self.slab_size), + }); } } - pub fn capacity(&self) -> &[Extent] { - &self.capacity + pub fn active_capacity_bytes(&self) -> u64 { + self.capacity + .iter() + .map(|extent_phys| match extent_phys { + SlabExtentPhys::Present { extent } => extent.size, + SlabExtentPhys::Removed { extent: _ } => 0, + }) + .sum() } #[allow(dead_code)] @@ -124,7 +173,7 @@ impl SlabAccess { pub fn slab_id_to_extent(&self, slab_id: SlabId) -> Extent { let inner = self.inner.read().unwrap(); let (&extent_slab, containing_extent) = inner - .capacity + .active_capacity .left_range((Unbounded, Included(slab_id))) .next_back() .unwrap(); @@ -136,7 +185,7 @@ impl SlabAccess { let inner = self.inner.read().unwrap(); let (&capacity_slab, capacity_extent) = inner - .capacity + .active_capacity .right_range((Unbounded, Included(extent.location))) .next_back() .unwrap(); @@ -145,7 +194,6 @@ impl SlabAccess { let slab_id = capacity_slab + ((extent.location - capacity_extent.location) / self.slab_size); - assert_lt!(slab_id.0, inner.num_slabs); debug_assert!(self.slab_id_to_extent(slab_id).contains(&extent)); slab_id } @@ -163,26 +211,63 @@ impl SlabAccess { let inner = self.inner.read().unwrap(); inner.num_slabs * self.slab_size } + + pub fn disk_capacity(&self, disk: DiskId) -> u64 { + let inner = self.inner.read().unwrap(); + inner + .active_capacity + .iter() + .filter(|(_, &extent)| extent.location.disk() == disk) + .map(|(_, &extent)| extent.size) + .sum() + } + + pub fn disk_num_slabs(&self, disk: DiskId) -> u64 { + self.disk_capacity(disk) / self.slab_size() + } } impl SlabAllocatorBuilder { pub fn new(phys: SlabAllocatorPhys) -> Self { - let mut num_slabs = 0; + let mut allocatable = HashSet::new(); + let mut active_capacity = BiBTreeMap::new(); + + let mut total_slabs = 0; + let mut removed_slabs = 0; let capacity = phys .capacity .into_iter() - .map(|extent| { - let start = SlabId(num_slabs); - num_slabs += extent.size / phys.slab_size; - (start, extent) + .map(|extent_phys| { + let start = SlabId(total_slabs); + match extent_phys { + SlabExtentPhys::Present { extent } => { + active_capacity.insert(start, extent); + let nslabs = extent.size / phys.slab_size; + allocatable.extend( + (total_slabs..(total_slabs + nslabs)) + .map(SlabId) + .collect::>(), + ); + total_slabs += nslabs; + } + SlabExtentPhys::Removed { extent } => { + let nslabs = extent.size / phys.slab_size; + removed_slabs += nslabs; + total_slabs += nslabs; + } + } + (start, extent_phys) }) .collect(); + Self { - allocatable: (0..num_slabs).map(SlabId).collect(), + allocatable, access: SlabAccess { inner: RwLock::new(SlabAccessInner { capacity, - num_slabs, + active_capacity, + num_slabs: total_slabs - removed_slabs, + next_slab_id: SlabId(total_slabs), }), slab_size: phys.slab_size, }, @@ -194,11 +279,28 @@ impl SlabAllocatorBuilder { assert!(removed, "{slab_id:?} claimed twice"); } - pub fn build(self) -> SlabAllocator { + pub fn build(self, 
removing_disks: &[DiskId]) -> SlabAllocator { + let mut noalloc_slabs = removing_disks + .iter() + .copied() + .map(|disk| (disk, Vec::new())) + .collect::>(); + let mut allocatable = Vec::new(); + for slab_id in self.allocatable.into_iter() { + let slab_disk = self.access.slab_id_to_extent(slab_id).location.disk(); + + noalloc_slabs + .get_mut(slab_disk) + .unwrap_or(&mut allocatable) + .push(slab_id); + } + SlabAllocator { inner: Mutex::new(Inner { - allocatable: self.allocatable.into_iter().collect(), + allocatable, freeing: Vec::new(), + noalloc_state: noalloc_slabs, + noallocing: Vec::new(), reserved_slabs: RESERVED_SLABS_PCT.apply(self.access.num_slabs()), }), access: self.access, @@ -240,10 +342,15 @@ impl SlabAllocator { let mut inner = self.inner.lock().unwrap(); let mut access_inner = self.access.inner.write().unwrap(); - let first_new_slab = SlabId(access_inner.num_slabs); + let first_new_slab = access_inner.next_slab_id; // capacity is aligned to be a multiple of slabsize let capacity = capacity.trim_end(capacity.size - capacity.size % self.access.slab_size); - access_inner.capacity.insert(first_new_slab, capacity); + access_inner + .capacity + .insert(first_new_slab, SlabExtentPhys::Present { extent: capacity }); + access_inner + .active_capacity + .insert(first_new_slab, capacity); let new_slabs = capacity.size / self.access.slab_size; // We don't want to allocate and write to the new capacity until the next checkpoint @@ -251,19 +358,100 @@ impl SlabAllocator { // we add the new slabs to `freeing`. inner .freeing - .extend((0..new_slabs).map(|n| SlabId(access_inner.num_slabs + n))); + .extend((0..new_slabs).map(|i| access_inner.next_slab_id + i)); access_inner.num_slabs += new_slabs; + access_inner.next_slab_id += new_slabs; + } + + pub fn remove_disk(&self, disk: DiskId) { + assert!(self.disk_is_fully_evacuated(disk)); + + let mut inner = self.inner.lock().unwrap(); + let mut access_inner_guard = self.access.inner.write().unwrap(); + let access_inner = &mut *access_inner_guard; + let removed_slabs = inner.noalloc_state.remove(disk).unwrap(); + + access_inner.active_capacity.retain(|&slab_id, &extent| { + if extent.location.disk() == disk { + let old = access_inner + .capacity + .insert(slab_id, SlabExtentPhys::Removed { extent }); + assert_eq!(old, Some(SlabExtentPhys::Present { extent })); + trace!( + "removal: removing {:?} from slab allocator capacity", + old.unwrap() + ); + false + } else { + true + } + }); + access_inner.num_slabs -= removed_slabs.len() as u64; + + // Dropping guard as writer so the assertions below can grab guard as reader in + // slab_id_to_extent(). + drop(access_inner_guard); + assert!(!inner.allocatable.iter().any(|&slab_id| self + .slab_id_to_extent(slab_id) + .location + .disk() + == disk)); + assert!(!inner.freeing.iter().any(|&slab_id| self + .slab_id_to_extent(slab_id) + .location + .disk() + == disk)); + assert!(!inner.noallocing.iter().any(|&slab_id| self + .slab_id_to_extent(slab_id) + .location + .disk() + == disk)); + } + + /// Marks all slabs that are part of `disk` as non-allocatable. No further allocations can be + /// made from these slabs and they won't be reused once freed. 
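+    ///
+    /// Illustrative call sequence for a removal, assuming a `disk: DiskId` that is being
+    /// drained (a sketch, not a doctest):
+    ///
+    /// ```ignore
+    /// slab_allocator.mark_noalloc_disk(disk);      // stop handing out this disk's slabs
+    /// // ... the block allocator evacuates the disk's data over one or more merges ...
+    /// assert!(slab_allocator.disk_is_fully_evacuated(disk));
+    /// slab_allocator.remove_disk(disk);            // drop the disk's capacity for good
+    /// // (a cancelled removal would instead call unmark_noalloc_disk(disk))
+    /// ```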
+ pub fn mark_noalloc_disk(&self, disk: DiskId) { + let mut guard = self.inner.lock().unwrap(); + let inner = &mut *guard; + let mut removing_allocatable = Vec::new(); + inner.allocatable.retain(|&slab_id| { + if self.slab_id_to_extent(slab_id).location.disk() == disk { + removing_allocatable.push(slab_id); + false + } else { + true + } + }); + inner.freeing.retain(|&slab_id| { + if self.slab_id_to_extent(slab_id).location.disk() == disk { + inner.noallocing.push(slab_id); + false + } else { + true + } + }); + let replaced = inner.noalloc_state.insert(disk, removing_allocatable); + assert!(replaced.is_none()); + } + + /// Marks all free slabs that are part of `disk` as allocatable. This also enables + /// the reuse of allocated slabs that are freed. + pub fn unmark_noalloc_disk(&self, disk: DiskId) { + let mut guard = self.inner.lock().unwrap(); + let inner = &mut *guard; + inner + .allocatable + .append(&mut inner.noalloc_state.remove(disk).unwrap()); + inner.freeing.append(&mut inner.noallocing); + inner.allocatable.shuffle(&mut thread_rng()); } pub fn get_phys(&self) -> SlabAllocatorPhys { + let access_inner = self.access.inner.read().unwrap(); SlabAllocatorPhys { slab_size: self.access.slab_size, - capacity: self - .access - .inner - .read() - .unwrap() + capacity: access_inner .capacity .iter() .map(|(_, &extent)| extent) @@ -291,15 +479,18 @@ impl SlabAllocator { /// regardless of how much metadata is actually used. pub fn set_reservation(&self, reserved_space: u64) { let mut inner = self.inner.lock().unwrap(); + let removing_slabs = inner.num_removing_slabs(&self.access); inner.reserved_slabs = max( reserved_space / self.access.slab_size, - RESERVED_SLABS_PCT.apply(self.access.num_slabs()), + RESERVED_SLABS_PCT.apply(self.access.num_slabs() - removing_slabs), ); } pub fn allocate_reserved(&self) -> SlabId { let mut inner = self.inner.lock().unwrap(); - if inner.allocatable.len() as u64 <= SUPER_RESERVED_SLABS_PCT.apply(self.access.num_slabs()) + let removing_slabs = inner.num_removing_slabs(&self.access); + if inner.allocatable.len() as u64 + <= SUPER_RESERVED_SLABS_PCT.apply(self.access.num_slabs() - removing_slabs) { panic!("Free slabs exhausted."); } @@ -310,11 +501,20 @@ impl SlabAllocator { pub fn free(&self, slab: SlabId) { trace!("freeing {slab:?}"); - self.inner.lock().unwrap().freeing.push(slab); + let extent_disk = self.access().slab_id_to_extent(slab).location.disk(); + let mut inner = self.inner.lock().unwrap(); + if inner.noalloc_state.contains_key(&extent_disk) { + inner.noallocing.push(slab); + } else { + inner.freeing.push(slab); + } } /// Returns the amount of non-reserved available space, in bytes. i.e. the amount that could /// be allocated by allocate(). + /// + /// Note: Slabs that are part of a disk that's been `mark_noalloc_disk()`-ed are not + /// allocatable, and therefore not included in returned value. pub fn allocatable_bytes(&self) -> u64 { let inner = self.inner.lock().unwrap(); (inner.allocatable.len() as u64).saturating_sub(inner.reserved_slabs) @@ -322,17 +522,23 @@ impl SlabAllocator { } /// Returns the number of slabs that are not currently allocated. This includes reserved and - /// super-reserved slabs. + /// super-reserved slabs and the ones marked as noalloc for removal. 
pub fn free_slabs(&self) -> u64 { let inner = self.inner.lock().unwrap(); - inner.allocatable.len() as u64 + let noalloc_slabs = inner + .noalloc_state + .values() + .map(|slabs| slabs.len()) + .sum::() as u64; + inner.allocatable.len() as u64 + noalloc_slabs } /// Returns the number of slabs that we would like the block allocator to evacuate and free. pub fn num_slabs_to_evacuate(&self) -> u64 { let inner = self.inner.lock().unwrap(); - let target_free_slabs = - inner.reserved_slabs + TARGET_AVAILABLE_SLABS_PCT.apply(self.access.num_slabs()); + let removing_slabs = inner.num_removing_slabs(&self.access); + let target_free_slabs = inner.reserved_slabs + + TARGET_AVAILABLE_SLABS_PCT.apply(self.access.num_slabs() - removing_slabs); let current_free_slabs = inner.allocatable.len() + inner.freeing.len(); target_free_slabs.saturating_sub(current_free_slabs as u64) } @@ -346,6 +552,14 @@ impl SlabAllocator { // `inner.allocatable` and `inner.freeing` below as "split borrows", allowing two // exclusive references to different fields of the same struct. inner.allocatable.append(&mut inner.freeing); + for slab_id in mem::take(&mut inner.noallocing) { + let slab_disk = self.slab_id_to_extent(slab_id).location.disk(); + inner + .noalloc_state + .get_mut(slab_disk) + .unwrap() + .push(slab_id); + } inner.allocatable.shuffle(&mut thread_rng()); } @@ -366,12 +580,50 @@ impl SlabAllocator { } pub fn num_slabs(&self) -> u64 { - self.access.num_slabs() + let removing_slabs = self.inner.lock().unwrap().num_removing_slabs(&self.access); + self.access.num_slabs() - removing_slabs } pub fn capacity(&self) -> u64 { self.access.capacity() } + + pub fn disk_capacity(&self, disk: DiskId) -> u64 { + self.access.disk_capacity(disk) + } + + pub fn disk_is_fully_evacuated(&self, disk: DiskId) -> bool { + let disk_num_slabs = self.access.disk_num_slabs(disk); + let noalloc_slabs = self + .inner + .lock() + .unwrap() + .noalloc_state + .get(disk) + .map(|slabs| slabs.len()) + .unwrap_or_default() as u64; + disk_num_slabs == noalloc_slabs + } + + pub fn removing_capacity(&self) -> u64 { + self.inner + .lock() + .unwrap() + .noalloc_state + .keys() + .map(|disk_id| self.disk_capacity(disk_id)) + .sum() + } + + pub fn removing_disks(&self) -> impl Iterator { + self.inner + .lock() + .unwrap() + .noalloc_state + .keys() + .collect::>() + .into_iter() + } } #[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -389,6 +641,12 @@ impl Add for SlabId { } } +impl AddAssign for SlabId { + fn add_assign(&mut self, other: u64) { + *self = SlabId(self.0 + other); + } +} + impl Sub for SlabId { type Output = u64; fn sub(self, rhs: SlabId) -> u64 { @@ -401,3 +659,9 @@ impl From for usize { usize::from64(val.0) } } + +impl From for SlabId { + fn from(val: usize) -> Self { + SlabId(val as u64) + } +} diff --git a/cmd/zfs_object_agent/zettacache/src/space_map.rs b/cmd/zfs_object_agent/zettacache/src/space_map.rs index 8f09a0037aa1..7a4f5d147bc6 100644 --- a/cmd/zfs_object_agent/zettacache/src/space_map.rs +++ b/cmd/zfs_object_agent/zettacache/src/space_map.rs @@ -5,6 +5,7 @@ use futures::stream::StreamExt; use serde::Deserialize; use serde::Serialize; +use crate::base_types::DiskId; use crate::base_types::Extent; use crate::block_access::BlockAccess; use crate::block_allocator::SlabPhysType; @@ -131,4 +132,9 @@ impl SpaceMap { self.log.clear(); self.alloc_entries = 0; } + + // Returns the number of slabs moved. 
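+    //
+    // Illustrative use (a sketch; the block allocator drives this when it explicitly moves a
+    // removing disk's spacemaps off that disk):
+    //
+    //     let slabs_moved = spacemap.transfer_data_for_removal(disk).await;
+    //     info!("removal: {disk:?}: {slabs_moved} spacemap slabs moved");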
+ pub async fn transfer_data_for_removal(&mut self, disk: DiskId) -> u64 { + self.log.transfer_data_for_removal(disk).await + } } diff --git a/cmd/zfs_object_agent/zettacache/src/superblock.rs b/cmd/zfs_object_agent/zettacache/src/superblock.rs index 5133bd8d0668..a6cc4670bd39 100644 --- a/cmd/zfs_object_agent/zettacache/src/superblock.rs +++ b/cmd/zfs_object_agent/zettacache/src/superblock.rs @@ -57,7 +57,6 @@ impl DiskPhys { } } -/// State that's only needed on the primary disk (currently, always DiskId(0)). #[derive(Serialize, Deserialize, Debug, Clone)] pub struct PrimaryPhys { pub checkpoint_id: CheckpointId, @@ -74,6 +73,8 @@ pub struct PrimaryPhys { /// Subset of PrimaryPhys that's needed to get the feature flags. #[derive(Deserialize, Debug, Clone)] pub struct PrimaryFeaturesPhys { + // The checkpoint_id is needed so we can pick the latest primary + checkpoint_id: CheckpointId, feature_flags: Vec, } @@ -92,19 +93,28 @@ impl PrimaryPhys { } } - /// Write superblocks to all disks. + pub async fn write( + &self, + primary_disk: DiskId, + cache_guid: CacheGuid, + block_access: &BlockAccess, + ) { + let phys = SuperblockPhys { + primary: Some(self.clone()), + disk: primary_disk, + cache_guid, + disk_guid: Some(self.disks.get(&primary_disk).unwrap().guid), + }; + phys.write(block_access, primary_disk).await; + } + + /// Write superblocks to all disks. Used during cache creation. pub async fn write_all( &self, primary_disk: DiskId, cache_guid: CacheGuid, block_access: &BlockAccess, ) { - // Write the non-primary disks first, so that newly-added disks will - // have their superblocks present before the primary superblock is - // updated to indicate that they are part of the cache. If we wrote all - // the disks (including the primary) at once, we could crash after the - // primary was updated but a new disk had not yet been updated. The - // cache would be left in an inconsistent state and could not be opened. block_access .disks() .filter(|&disk| disk != primary_disk) @@ -120,14 +130,7 @@ impl PrimaryPhys { .collect::>() .count() .await; - - let phys = SuperblockPhys { - primary: Some(self.clone()), - disk: primary_disk, - cache_guid, - disk_guid: Some(self.disks.get(&primary_disk).unwrap().guid), - }; - phys.write(block_access, primary_disk).await; + self.write(primary_disk, cache_guid, block_access).await; } pub async fn read_features(block_access: &BlockAccess) -> Result> { @@ -144,14 +147,14 @@ impl PrimaryPhys { let (mut primary, primary_disk, cache_guid) = results .iter() - .find_map(|result| { - if let Ok(phys) = result { - phys.primary - .as_ref() - .map(|primary| (primary.clone(), phys.disk, phys.cache_guid)) - } else { - None - } + .flatten() + .max_by_key(|phys| phys.primary.as_ref().map(|p| p.checkpoint_id)) + .map(|phys| { + ( + phys.primary.as_ref().unwrap().clone(), + phys.disk, + phys.cache_guid, + ) }) .ok_or_else(|| anyhow!("Primary Superblock not found"))?; @@ -164,17 +167,12 @@ impl PrimaryPhys { }) .collect::>(); - for (id, result) in results.iter().enumerate() { - // XXX proper error handling - // XXX we should be able to reorder them? 
- if let Ok(phys) = result { - let disk = DiskId::new(id); - assert_eq!(disk, phys.disk); - assert!(phys.primary.is_none() || phys.disk == primary_disk); - assert_eq!(phys.cache_guid, cache_guid); - if let Some(disk_guid) = phys.disk_guid { - assert_eq!(disk_guid, primary.disks.get(&disk).unwrap().guid); - } + // XXX proper error handling + for phys in results.iter().flatten() { + assert_eq!(phys.cache_guid, cache_guid); + assert!(phys.disk != primary_disk || phys.primary.is_some()); + if let Some(disk_guid) = phys.disk_guid { + assert_eq!(disk_guid, primary.disks.get(&phys.disk).unwrap().guid); } } let sector_size = block_access.round_up_to_sector::(1); @@ -225,7 +223,7 @@ impl SuperblockPhys { .await } - async fn write(&self, block_access: &BlockAccess, disk: DiskId) { + pub async fn write(&self, block_access: &BlockAccess, disk: DiskId) { maybe_die_with(|| format!("before writing {:#?}", self)); debug!("writing {:#?}", self); let raw = block_access.chunk_to_raw(EncodeType::Json, self); @@ -237,6 +235,7 @@ impl SuperblockPhys { DiskIoType::MaintenanceWrite, ) .await; + maybe_die_with(|| format!("after writing {:#?}", self)); } pub async fn dump_all(block_access: &BlockAccess) { @@ -268,33 +267,27 @@ impl SuperblockPhys { } impl PrimaryFeaturesPhys { - /// Return value is (Self, primary_disk, cache_guid, extra_disks) pub async fn read(block_access: &BlockAccess) -> Result { let results = SuperblockFeaturesPhys::read_all(block_access).await; let (primary, primary_disk, cache_guid) = results .iter() - .find_map(|result| { - if let Ok(phys) = result { - phys.primary - .as_ref() - .map(|primary| (primary.clone(), phys.disk, phys.cache_guid)) - } else { - None - } + .flatten() + .max_by_key(|phys| phys.primary.as_ref().map(|p| p.checkpoint_id)) + .map(|phys| { + ( + phys.primary.as_ref().unwrap().clone(), + phys.disk, + phys.cache_guid, + ) }) .ok_or_else(|| anyhow!("Primary Superblock not found"))?; - for (id, result) in results.iter().enumerate() { - // XXX proper error handling - // XXX we should be able to reorder them? 
- if let Ok(phys) = result { - assert_eq!(DiskId::new(id), phys.disk); - assert_eq!(phys.cache_guid, cache_guid); - assert!(phys.primary.is_none() || phys.disk == primary_disk); - } + // XXX proper error handling + for phys in results.iter().flatten() { + assert_eq!(phys.cache_guid, cache_guid); + assert!(phys.disk != primary_disk || phys.primary.is_some()); } - Ok(primary) } } diff --git a/cmd/zfs_object_agent/zettacache/src/zcachedb.rs b/cmd/zfs_object_agent/zettacache/src/zcachedb.rs index 4f734a2a34f4..0a95d5cf9b09 100644 --- a/cmd/zfs_object_agent/zettacache/src/zcachedb.rs +++ b/cmd/zfs_object_agent/zettacache/src/zcachedb.rs @@ -1,5 +1,7 @@ +use std::collections::BTreeMap; use std::path::PathBuf; +use crate::base_types::DiskId; use crate::zettacache::zcdb::ZCacheDBHandle; use crate::CacheOpenMode; @@ -138,7 +140,7 @@ impl ZettaCacheDBCommand { async fn issue_pool_state_command( command: ZettaCacheDBCommand, - paths: Vec, + paths: BTreeMap, ) -> Result<(), anyhow::Error> { let mut handle = ZCacheDBHandle::open(paths).await?; match command { diff --git a/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs b/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs index 2b85340351a0..f09994227a42 100644 --- a/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs +++ b/cmd/zfs_object_agent/zettacache/src/zettacache/mod.rs @@ -1,5 +1,6 @@ pub mod merge; pub mod remap; +pub mod removal; pub mod zcdb; use std::collections::btree_map; @@ -33,6 +34,7 @@ use futures::FutureExt; use log::*; use lru::LruCache; use more_asserts::*; +use rand::seq::IteratorRandom; use rand::Rng; use serde::Deserialize; use serde::Serialize; @@ -72,6 +74,7 @@ use self::merge::MergeMessage; use self::merge::MergeProgressPhys; use self::merge::MergeState; use self::remap::RemapState; +use self::removal::DeviceRemoval; use crate::atime_histogram::AtimeHistogram; use crate::atime_histogram::AtimeHistogramPhys; use crate::base_types::*; @@ -95,6 +98,7 @@ use crate::slab_allocator::SlabAllocatorPhys; use crate::slab_allocator::RESERVED_SLABS_PCT; use crate::superblock::DiskPhys; use crate::superblock::PrimaryPhys; +use crate::superblock::SuperblockPhys; use crate::superblock::SUPERBLOCK_SIZE; use crate::CacheOpenMode; @@ -152,6 +156,13 @@ tunable! { static ref CORRUPTION_FILL: u8 = 0x31; // fill blocks with '1' } +// Max number of merge cycles needed for a device to be removed given that it no longer has +// any user data. This should be at most 2 which is the worst case where we started a removal +// mid-way of a merge and the device had no user data (e.g. data in the block allocator). In +// that case we start counting our merges and the first merge doesn't really count because +// removal started midway so the index metadata was not fully moved away from the device. +const REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION: u8 = 2; + /// A PendingChange is the in-core data structure for tracking changes to the index between merges. /// Two types of events are tracked: insertions and lookup hits (atime update). Note that there is /// no removal event here because we do not allow removes in general. Cache content is only removed @@ -219,7 +230,6 @@ struct Locked { block_access: Arc, primary: PrimaryPhys, guid: CacheGuid, - primary_disk: DiskId, block_allocator: BlockAllocator, pending_changes: PendingChanges, pending_changes_trigger: usize, @@ -240,6 +250,7 @@ struct Locked { // This is needed to ensure that writes complete before we complete the next // checkpoint, so that they are persisted to disk. 
outstanding_writes: ConcurrentBatch, + device_removal: DeviceRemoval, atime: Atime, stats: Arc, @@ -398,6 +409,32 @@ impl ZettaCache { } } + pub async fn remove_disk(&self, path: &Path) -> Result<()> { + match &*self.inner.load() { + Some(inner) => inner.remove_disk(path).await, + None => Err(anyhow!("disk {path:?} is not part of the zettacache")), + } + } + + pub async fn cancel_disk_removal(&self, path: &Path) -> Result<()> { + match &*self.inner.load() { + Some(inner) => inner.cancel_disk_removal(path).await, + None => Err(anyhow!("disk {path:?} is not part of the zettacache")), + } + } + + pub async fn pause_disk_removals(&self) { + if let Some(inner) = &*self.inner.load() { + inner.pause_disk_removals().await; + } + } + + pub async fn resume_disk_removals(&self) { + if let Some(inner) = &*self.inner.load() { + inner.resume_disk_removals().await; + } + } + pub async fn initiate_merge(&self) { if let Some(inner) = &*self.inner.load() { inner.initiate_merge().await; @@ -458,11 +495,12 @@ impl ZettaCache { impl Inner { async fn create(paths: Vec) -> Result<()> { - let mut disks: Vec = Vec::with_capacity(paths.len()); - for path in paths { - disks.push(Disk::new(&path, false)?); - } - let block_access = BlockAccess::new(disks, false); + let paths = paths + .into_iter() + .enumerate() + .map(|(id, path)| (DiskId::new(id), path)) + .collect(); + let block_access = BlockAccess::new(&paths, false)?; let guid = CacheGuid::new(); @@ -490,8 +528,10 @@ impl Inner { ), merge_progress: None, + device_removal: Default::default(), }; - let slab_allocator = SlabAllocatorBuilder::new(checkpoint.slab_allocator.clone()).build(); + let slab_allocator = + SlabAllocatorBuilder::new(checkpoint.slab_allocator.clone()).build(&Vec::new()); let checkpoint_extents = checkpoint.write(&block_access, &slab_allocator).await; PrimaryPhys::new( block_access @@ -548,25 +588,21 @@ impl Inner { if paths.is_empty() { return Err(CacheOpenError::NoDevices); } - let mut disks: Vec = Vec::with_capacity(paths.len()); - for path in &paths { - disks.push(Disk::new(path, false)?); - } - let block_access = Arc::new(BlockAccess::new(disks, false)); + let block_access = Arc::new(BlockAccess::new(&paths, false)?); let feature_flags = match PrimaryPhys::read_features(&block_access).await { Ok(f) => f, Err(_) => { // XXX need proper create CLI - Self::create(paths.clone()).await?; + Self::create(paths.values().cloned().collect()).await?; PrimaryPhys::read_features(&block_access).await.unwrap() } }; - check_features(&feature_flags) - .map_err(|e| CacheOpenError::IncompatibleFeatures(paths, e))?; + check_features(&feature_flags).map_err(|e| { + CacheOpenError::IncompatibleFeatures(paths.values().cloned().collect(), e) + })?; - let (mut primary, primary_disk, guid, extra_disks) = - PrimaryPhys::read(&block_access).await.unwrap(); + let (mut primary, _, guid, extra_disks) = PrimaryPhys::read(&block_access).await.unwrap(); let mut checkpoint = CheckpointPhys::read(&block_access, &primary.checkpoint).await?; assert_eq!(checkpoint.id, primary.checkpoint_id); @@ -640,7 +676,7 @@ impl Inner { .await; block_builder.claim(&mut slab_builder); - let slab_allocator = Arc::new(slab_builder.build()); + let slab_allocator = Arc::new(slab_builder.build(&checkpoint.device_removal.pending)); let block_allocator = block_builder.build(slab_allocator.clone()); let operation_log = BlockBasedLog::open( @@ -742,10 +778,10 @@ impl Inner { size_histogram: checkpoint.size_histogram, operation_log, primary, - primary_disk, guid, outstanding_reads: Default::default(), 
             outstanding_writes: Default::default(),
+            device_removal: DeviceRemoval::open(checkpoint.device_removal),
             atime: checkpoint.last_atime,
             block_allocator,
             slab_allocator,
@@ -1005,6 +1041,7 @@ impl Inner {
                     let mut locked = lock_non_send_measured!(&self.locked).await;
                     locked.rotate_index(&mut indices.old, new_index);
                     locked.block_allocator.rebalance_fini();
+                    locked.device_removal.complete_merge_cycle();
                     indices.new = None;
                     merging = None;
                     completed_merge = true;
@@ -1491,7 +1528,7 @@ impl Inner {
     }
 
     async fn add_disk(&self, path: &Path) -> Result<()> {
-        lock_non_send_measured!(&self.locked).await.add_disk(path)?;
+        lock_measured!(&self.locked).await.add_disk(path).await?;
         self.sync_checkpoint().await;
         Ok(())
     }
@@ -1505,6 +1542,28 @@ impl Inner {
         Ok(additional_bytes)
     }
 
+    async fn remove_disk(&self, path: &Path) -> Result<()> {
+        self.locked.lock().await.remove_disk(path)?;
+        self.sync_checkpoint().await;
+        Ok(())
+    }
+
+    async fn cancel_disk_removal(&self, path: &Path) -> Result<()> {
+        self.locked.lock().await.cancel_disk_removal(path)?;
+        self.sync_checkpoint().await;
+        Ok(())
+    }
+
+    async fn pause_disk_removals(&self) {
+        self.locked.lock().await.pause_disk_removals();
+        self.sync_checkpoint().await;
+    }
+
+    async fn resume_disk_removals(&self) {
+        self.locked.lock().await.resume_disk_removals();
+        self.sync_checkpoint().await;
+    }
+
     async fn initiate_merge(&self) {
         lock_non_send_measured!(&self.locked).await.request_merge();
         self.sync_checkpoint().await;
@@ -1808,6 +1867,75 @@ impl Locked {
         completed_merge: bool,
         pool_guids: PoolGuidMappingPhys,
     ) {
+        if let Some(removal_entry) = self.device_removal.removing_disk_entry() {
+            let disk = removal_entry.disk;
+
+            // The stages of the removal are the following:
+            //
+            // [1] We wait for the merge logic to evict enough blocks so that there's sufficient
+            // space available to the block allocator to move all blocks off the target disk.
+            //
+            // [2] Once we have enough free space to start the evacuation for the first disk
+            // being removed, we issue a rebalance in the block allocator to move its data to
+            // non-removing disks.
+            //
+            // [3] Once that evacuation is done, we want to move any metadata away from that disk
+            // so that it's fully evacuated. The first piece of metadata that we move is anything
+            // that's not refreshed every merge cycle by the index (currently this means only the
+            // block allocator's spacemaps) - we explicitly move those.
+            //
+            // [4] Once that is done, we have the expectation that all other metadata (index,
+            // operation_log, etc.) will evacuate implicitly over the next couple of merge cycles
+            // as we free their old data which may reside in the removing disks and allocate new
+            // ones in allocatable disks. Since we don't have explicit control of the merge tasks
+            // that we spawn from here, we bound this removal stage to a specific number of
+            // cycles (see REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION). If that bound is ever
+            // crossed we panic, as it implies that the removal is not making progress.
+            //
+            // [5] Once the disk is fully evacuated we remove it from our in-memory structures
+            // which will later be flushed to disk, effectively finishing the removal of said
+            // disk.
+            //
+            // XXX: The REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION mechanism is not ideal for
+            // now as we don't store the progress that we've made in terms of merge cycles
+            // on disk for each removal. This means that if the agent restarts, we'll have
+            // to wait for another N merges before detecting that we aren't making progress.
+            if !self.block_allocator.disk_is_pending_remap(disk) {
+                if !self.slab_allocator.disk_is_fully_evacuated(disk) {
+                    match removal_entry.merge_cycles_left {
+                        Some(cycles_left) => {
+                            assert_gt!(
+                                cycles_left,
+                                0,
+                                "took more than {} merge cycles to remove the device's metadata",
+                                REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION
+                            );
+                        }
+                        None => {
+                            let slabs_moved = self
+                                .block_allocator
+                                .transfer_metadata_for_removal(disk)
+                                .await;
+                            removal_entry.merge_cycles_left =
+                                Some(REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION);
+                            info!(
+                                "removal: {disk:?}: {slabs_moved} metadata slabs moved; awaiting at most {} merge cycles",
+                                REMOVAL_MAX_MERGES_FOR_METADATA_EVACUATION
+                            );
+                        }
+                    }
+                } else {
+                    self.slab_allocator.remove_disk(disk);
+                    self.block_allocator.remove_disk(disk);
+                    self.block_access.remove_disk(disk);
+                    self.device_removal.complete_removal(disk);
+                    let removed = self.primary.disks.remove(&disk);
+                    assert!(removed.is_some());
+                    info!("removal: completed for {disk:?}");
+                }
+            }
+        }
+
         debug!(
             "flushing checkpoint {:?}",
             self.primary.checkpoint_id.next()
@@ -1853,6 +1981,7 @@ impl Locked {
             block_allocator: self.block_allocator.flush(completed_merge).await,
             size_histogram: self.size_histogram.clone(),
             merge_progress: merge_progress_phys,
+            device_removal: self.device_removal.to_phys(),
         };
 
         let checkpoint_extents = checkpoint
@@ -1865,9 +1994,17 @@
         }
         self.primary.checkpoint_id = checkpoint.id;
         self.primary.feature_flags = SUPPORTED_FEATURES.keys().cloned().collect();
-        // We need to write all the disks' superblocks in case new disks have been added.
+
+        // The following call is the last step in saving the cache's state. Any crash that
+        // happens after this and up until the next checkpoint will have us restart from the
+        // state recorded here.
+        let primary_disk = self
+            .block_access
+            .disks()
+            .choose(&mut rand::thread_rng())
+            .unwrap();
         self.primary
-            .write_all(self.primary_disk, self.guid, &self.block_access)
+            .write(primary_disk, self.guid, &self.block_access)
            .await;

        self.slab_allocator.set_reservation(
@@ -1942,24 +2079,67 @@ impl Locked {
     fn space_to_evict(&self) -> u64 {
         let allocatable_from_slabs = self.slab_allocator.allocatable_bytes();
-        let allocatable_from_blocks = self.block_allocator.available();
-        let target_allocatable = TARGET_FREE_BLOCKS_PCT.apply(self.slab_allocator.capacity());
+        let allocatable_from_blocks = self.block_allocator.allocatable_bytes();
+        let slab_allocator_capacity =
+            self.slab_allocator.capacity() - self.slab_allocator.removing_capacity();
+
+        // If we are removing devices, make sure that we have enough free space to evacuate their
+        // data to the rest of the cache, on top of our TARGET_FREE_BLOCKS_PCT.
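+        //
+        // Worked example with made-up numbers (and assuming TARGET_FREE_BLOCKS_PCT were 2%):
+        // with 1 TiB of non-removing capacity and 100 GiB still allocated on the removing
+        // disk, the target is 2% * 1 TiB + 100 GiB ~= 120 GiB, and we only evict whatever
+        // part of that is not already allocatable from slabs or blocks.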
+ let bytes_to_relocate = self.block_allocator.removing_bytes(); + + let target_allocatable = + TARGET_FREE_BLOCKS_PCT.apply(slab_allocator_capacity) + bytes_to_relocate; let reduction = target_allocatable .saturating_sub(allocatable_from_slabs) .saturating_sub(allocatable_from_blocks); info!( - "want to evict {} of allocated blocks ({} allocatable slabs; {} allocatable blocks; {} target; {} freeing; {} histogram)", + "want to evict {} of allocated blocks ({} allocatable slabs; {} allocatable blocks; {} target; {} histogram)", nice_p2size(reduction), nice_p2size(allocatable_from_slabs), nice_p2size(allocatable_from_blocks), nice_p2size(target_allocatable), - nice_p2size(self.block_allocator.freeing()), nice_p2size(self.atime_histogram.sum_live()), ); reduction } + /// Returns true if there is space to evacuate the allocated space of the first removing + /// device plus some slop space. Otherwise, false. + fn removal_can_evacuate(&self) -> bool { + if self.device_removal.paused { + return false; + } + + match self.device_removal.removing_disk() { + Some(disk) => { + if !self.block_allocator.disk_is_pending_remap(disk) { + return false; + } + + let space_to_evacuate = self.block_allocator.disk_allocated_bytes(disk); + if space_to_evacuate == 0 { + return false; + } + + let allocatable_space = self.slab_allocator.allocatable_bytes() + + self.block_allocator.allocatable_bytes(); + let slab_allocator_capacity = + self.slab_allocator.capacity() - self.slab_allocator.removing_capacity(); + let slop = TARGET_FREE_BLOCKS_PCT.apply(slab_allocator_capacity); + + debug!( + "removal: {} to evacuate (plus slop: {}); {} available", + nice_p2size(space_to_evacuate), + nice_p2size(slop), + nice_p2size(allocatable_space) + ); + allocatable_space > (space_to_evacuate + slop) + } + None => false, + } + } + fn need_merge(&self) -> bool { let mut need_merge = false; @@ -1983,6 +2163,16 @@ impl Locked { } } + { + if self.removal_can_evacuate() { + debug!( + "starting merge due to data evacuation of {:?}", + self.device_removal.removing_disk().unwrap() + ); + need_merge = true; + } + } + { let slabs = self.slab_allocator.num_slabs_to_evacuate(); if slabs > EVACUATION_MIN_BATCH_PCT.apply(self.slab_allocator.num_slabs()) { @@ -1991,6 +2181,17 @@ impl Locked { } } + { + // If we've evacuated the disk's user data (i.e. block_allocator slabs) and misc + // metadata explicitly but there is no need for us to start a merge (e.g. no need to + // evict space) make sure that we force a merge anyway to evacuate any index metadata + // that are part of the removing disk. 
+ if self.device_removal.need_index_evacuation() { + info!("starting merge due to pending removal"); + need_merge = true; + } + } + if self.merge_requested { debug!("starting merge due to user request"); need_merge = true; @@ -2043,7 +2244,12 @@ impl Locked { ) .await; - let remap = match self.block_allocator.rebalance_init() { + let removing_disk = if self.removal_can_evacuate() { + self.device_removal.removing_disk() + } else { + None + }; + let remap = match self.block_allocator.rebalance_init(removing_disk) { None => None, Some(map) => { // We need to ensure that rebalance() won't copy from blocks that we're still in @@ -2179,14 +2385,14 @@ impl Locked { self.stats .track_instantaneous(SlabCapacity, self.slab_allocator.capacity()); self.stats - .track_instantaneous(AvailableBlocksSize, self.block_allocator.available()); + .track_instantaneous(FreeBlocksSize, self.block_allocator.free_bytes()); self.stats.track_instantaneous( - AvailableSlabsSize, + FreeSlabsSize, self.slab_allocator.free_slabs() * self.slab_allocator.slab_size(), ); self.stats.track_instantaneous( AvailableSpace, - self.block_allocator.available() + self.slab_allocator.allocatable_bytes(), + self.block_allocator.allocatable_bytes() + self.slab_allocator.allocatable_bytes(), ); let old_pending = match &self.merge { Some(ms) => ms.old_pending_changes.len() as u64, @@ -2198,7 +2404,7 @@ impl Locked { ); } - fn add_disk(&mut self, path: &Path) -> Result<()> { + async fn add_disk(&mut self, path: &Path) -> Result<()> { // We hold the state lock across all these operations to ensure that we're always // adding the last DiskId to the SlabAllocator and Primary, in the case of concurrent // calls to add_disk(). @@ -2210,11 +2416,20 @@ impl Locked { .disk_extent(disk_id) .trim_start(SUPERBLOCK_SIZE), ); + self.block_allocator.add_disk(disk_id); self.primary .disks .insert(disk_id, DiskPhys::new(self.block_access.disk_size(disk_id))); + let superblock = SuperblockPhys { + primary: None, + disk: disk_id, + cache_guid: self.guid, + disk_guid: Some(self.primary.disks.get(&disk_id).unwrap().guid), + }; + superblock.write(&self.block_access, disk_id).await; + // The hit data isn't accurate across cache size changes, so clear // it, which also updates the histogram parameters to reflect the // new cache size. @@ -2227,6 +2442,11 @@ impl Locked { // Returns the amount of additional space, in bytes. fn expand_disk(&mut self, path: &Path) -> Result { let disk = self.block_access.path_to_disk_id(path)?; + + if self.device_removal.disk_is_pending_removal(disk) { + return Err(anyhow!("Cannot expand {path:?} - disk is being removed")); + } + let response = self.block_access.expand_disk(disk)?; if response.additional_bytes > 0 { let phys = self.primary.disks.get_mut(&disk).unwrap(); @@ -2247,6 +2467,63 @@ impl Locked { Ok(response) } + fn remove_disk(&mut self, path: &Path) -> Result<()> { + let begin = Instant::now(); + + let disk = self.block_access.path_to_disk_id(path)?; + if self.device_removal.disk_is_pending_removal(disk) { + return Err(anyhow!("{path:?} is already being removed")); + } + + // TODO: DLPX-81553 + // If we get rid of all the devices in the cache we should make sure that + // all the threads related to Inner and Locked together with the references + // that they contain to various structs are dropped correctly and we don't + // leak anything. 
+ if self.device_removal.disks_in_queue() + 1 == self.block_access.disks().count() { + return Err(anyhow!("removal of all disks is not allowed")); + } + + self.slab_allocator.mark_noalloc_disk(disk); + self.block_allocator.mark_noalloc_disk(disk); + self.device_removal.add_to_queue(disk); + + info!( + "issued removal of {path:?} in {}ms", + begin.elapsed().as_millis(), + ); + Ok(()) + } + + fn cancel_disk_removal(&mut self, path: &Path) -> Result<()> { + let begin = Instant::now(); + + let disk = self.block_access.path_to_disk_id(path)?; + if !self.device_removal.disk_is_pending_removal(disk) { + return Err(anyhow!("{path:?} not currently being removed")); + } + + self.slab_allocator.unmark_noalloc_disk(disk); + self.block_allocator.unmark_noalloc_disk(disk); + self.device_removal.remove_from_queue(disk); + + info!( + "cancelled removal of {path:?} in {}ms", + begin.elapsed().as_millis(), + ); + Ok(()) + } + + fn pause_disk_removals(&mut self) { + self.device_removal.paused = true; + info!("pausing removals"); + } + + fn resume_disk_removals(&mut self) { + self.device_removal.paused = false; + info!("resuming removals"); + } + fn request_merge(&mut self) { self.merge_requested = true; } diff --git a/cmd/zfs_object_agent/zettacache/src/zettacache/removal.rs b/cmd/zfs_object_agent/zettacache/src/zettacache/removal.rs new file mode 100644 index 000000000000..7bbab455748d --- /dev/null +++ b/cmd/zfs_object_agent/zettacache/src/zettacache/removal.rs @@ -0,0 +1,118 @@ +use log::info; +use serde::Deserialize; +use serde::Serialize; + +use crate::base_types::DiskId; + +#[derive(Default, Serialize, Deserialize, Debug, Clone)] +pub struct DeviceRemovalPhys { + pub pending: Vec, + paused: bool, +} + +pub struct DeviceRemovalEntry { + pub disk: DiskId, + // Indicates the number of merge cycles that we expect for the removal of all index-related + // metadata from the removing device. Value is None if we are still waiting for the removal + // of BlockAllocator's data and miscellaneous metadata not related to the index (see comment + // in Locked::flush_checkpoint). + pub merge_cycles_left: Option, +} + +pub struct DeviceRemoval { + pending: Vec, + pub paused: bool, +} + +impl DeviceRemoval { + pub fn open(phys: DeviceRemovalPhys) -> Self { + DeviceRemoval { + pending: phys + .pending + .iter() + .map(|&disk| DeviceRemovalEntry { + disk, + merge_cycles_left: None, + }) + .collect(), + paused: phys.paused, + } + } + + pub fn to_phys(&self) -> DeviceRemovalPhys { + DeviceRemovalPhys { + pending: self + .pending + .iter() + .map(|removal_entry| removal_entry.disk) + .collect(), + paused: self.paused, + } + } + + /// Return the DiskId that we are actively removing. + pub fn removing_disk(&self) -> Option { + self.pending.get(0).map(|removal_entry| removal_entry.disk) + } + + /// Return the DeviceRemovalEntry of the disk we are actively removing. 
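+    ///
+    /// The head of `pending` is the removal that is actively being worked on. An illustrative
+    /// lifecycle, assuming the queue starts out empty (a sketch, not a doctest):
+    ///
+    /// ```ignore
+    /// device_removal.add_to_queue(disk);                // `zcache remove` was issued
+    /// assert_eq!(device_removal.removing_disk(), Some(disk));
+    /// // ... checkpoints advance the removal; once the disk is fully evacuated ...
+    /// device_removal.complete_removal(disk);            // pops the head of the queue
+    /// ```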
+ pub fn removing_disk_entry(&mut self) -> Option<&mut DeviceRemovalEntry> { + self.pending.get_mut(0) + } + + pub fn disks_in_queue(&self) -> usize { + self.pending.len() + } + + pub fn add_to_queue(&mut self, disk: DiskId) { + self.pending.push(DeviceRemovalEntry { + disk, + merge_cycles_left: None, + }); + } + + pub fn remove_from_queue(&mut self, disk: DiskId) { + let pos = self + .pending + .iter() + .position(|removal_entry| removal_entry.disk == disk) + .unwrap(); + self.pending.remove(pos); + } + + pub fn disk_is_pending_removal(&self, disk: DiskId) -> bool { + self.pending + .iter() + .any(|removal_entry| removal_entry.disk == disk) + } + + /// Returns true if we are in the last stage of removal where we are waiting for + /// the index metadata to be evacuated from our currently removing disk. False + /// otherwise. + pub fn need_index_evacuation(&self) -> bool { + if let Some(removal_entry) = self.pending.get(0) { + return removal_entry.merge_cycles_left.is_some(); + } + false + } + + /// Called at the completion of each merge cycle, we decrement the `merge_cycles_left` + /// counter of the removal in progress as a sign for tracking our progress (see comment in + /// Locked::flush_checkpoint). + pub fn complete_merge_cycle(&mut self) { + if let Some(removal_entry) = self.pending.get_mut(0) { + if let Some(cycles_left) = removal_entry.merge_cycles_left.as_mut() { + *cycles_left = cycles_left.checked_sub(1).unwrap(); + info!( + "removal: {cycles_left} merge cycles left for {:?}", + removal_entry.disk + ) + } + } + } + + pub fn complete_removal(&mut self, disk: DiskId) { + let removal_entry = self.pending.remove(0); + assert_eq!(removal_entry.disk, disk) + } +} diff --git a/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs b/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs index b999ba1e5ace..37de2d9e0602 100644 --- a/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs +++ b/cmd/zfs_object_agent/zettacache/src/zettacache/zcdb.rs @@ -1,17 +1,16 @@ +use std::collections::BTreeMap; use std::path::PathBuf; use std::sync::Arc; use anyhow::Result; use futures::StreamExt; use util::nice_p2size; -use util::writeln_stderr; use util::writeln_stdout; use super::CheckpointPhys; use crate::base_types::CacheGuid; use crate::base_types::DiskId; use crate::block_access::BlockAccess; -use crate::block_access::Disk; use crate::block_allocator::zcdb::zcachedb_dump_slabs; use crate::block_allocator::zcdb::zcachedb_dump_spacemaps; use crate::features::check_features; @@ -33,32 +32,19 @@ pub struct ZCacheDBHandle { } impl ZCacheDBHandle { - pub async fn dump_superblocks(paths: Vec) -> Result<()> { - let mut disks: Vec = Vec::with_capacity(paths.len()); - for path in paths { - match Disk::new(&path, true) { - Ok(disk) => disks.push(disk), - Err(err) => writeln_stderr!("error: {}", err), - } - } - if disks.is_empty() { - return Ok(()); - } - let block_access = BlockAccess::new(disks, true); + pub async fn dump_superblocks(paths: BTreeMap) -> Result<()> { + let block_access = BlockAccess::new(&paths, true)?; SuperblockPhys::dump_all(&block_access).await; Ok(()) } - pub async fn open(paths: Vec) -> Result { - let mut disks: Vec = Vec::with_capacity(paths.len()); - for path in &paths { - disks.push(Disk::new(path, true)?); - } - let block_access = Arc::new(BlockAccess::new(disks, true)); + pub async fn open(paths: BTreeMap) -> Result { + let block_access = Arc::new(BlockAccess::new(&paths, true)?); let feature_flags = PrimaryPhys::read_features(&block_access).await?; - 
check_features(&feature_flags) - .map_err(|e| CacheOpenError::IncompatibleFeatures(paths, e))?; + check_features(&feature_flags).map_err(|e| { + CacheOpenError::IncompatibleFeatures(paths.values().cloned().collect(), e) + })?; let (primary, primary_disk, guid, _extra_disks) = PrimaryPhys::read(&block_access).await?; let checkpoint = Arc::new(CheckpointPhys::read(&block_access, &primary.checkpoint).await?); @@ -84,13 +70,7 @@ impl ZCacheDBHandle { writeln_stdout!(" Primary {:?}, {:?}", self.primary_disk, self.guid); writeln_stdout!(); - let slabs_capacity = self - .checkpoint - .slab_allocator - .capacity() - .iter() - .map(|extent| extent.size) - .sum(); + let slabs_capacity = self.checkpoint.slab_allocator.active_capacity_bytes(); writeln_stdout!("Slabs Region: {}", nice_p2size(slabs_capacity)); writeln_stdout!("-------------------------------"); let mut total_used_bytes = 0; diff --git a/cmd/zfs_object_agent/zettaobject/src/root_connection.rs b/cmd/zfs_object_agent/zettaobject/src/root_connection.rs index 906d2bcb5721..9285f40c2b8d 100644 --- a/cmd/zfs_object_agent/zettaobject/src/root_connection.rs +++ b/cmd/zfs_object_agent/zettaobject/src/root_connection.rs @@ -122,6 +122,19 @@ impl RootConnectionState { server.register_handler(TYPE_CLEAR_HIT_DATA, Box::new(Self::clear_hit_data)); server.register_handler(TYPE_ADD_DISK, Box::new(Self::add_disk)); server.register_handler(TYPE_EXPAND_DISK, Box::new(Self::expand_disk)); + server.register_handler(TYPE_REMOVE_DISK, Box::new(Self::remove_disk)); + server.register_handler( + TYPE_CANCEL_DISK_REMOVAL, + Box::new(Self::cancel_disk_removals), + ); + server.register_handler( + TYPE_PAUSE_DISK_REMOVALS, + Box::new(Self::pause_disk_removals), + ); + server.register_handler( + TYPE_RESUME_DISK_REMOVALS, + Box::new(Self::resume_disk_removals), + ); server.register_handler(TYPE_SYNC_CHECKPOINT, Box::new(Self::sync_checkpoint)); server.register_handler(TYPE_INITIATE_MERGE, Box::new(Self::initiate_merge)); server.register_struct_handler(MessageType::ReadBlock, Box::new(Self::read_block)); @@ -606,6 +619,54 @@ impl RootConnectionState { })) } + fn remove_disk(&mut self, nvl: NvList) -> HandlerReturn { + let cache = self.cache.clone(); + Ok(Box::pin(async move { + let request: RemoveDiskRequest = nvpair::from_nvlist(&nvl)?; + debug!("got {:?}", request); + + let result = cache + .remove_disk(&request.path) + .await + .map_err(FailureMessage::new); + return_result((), result, true) + })) + } + + fn cancel_disk_removals(&mut self, nvl: NvList) -> HandlerReturn { + let cache = self.cache.clone(); + Ok(Box::pin(async move { + let request: RemoveDiskRequest = nvpair::from_nvlist(&nvl)?; + debug!("got {:?}", request); + + let result = cache + .cancel_disk_removal(&request.path) + .await + .map_err(FailureMessage::new); + return_result((), result, true) + })) + } + + fn pause_disk_removals(&mut self, _nvl: NvList) -> HandlerReturn { + let cache = self.cache.clone(); + Ok(Box::pin(async move { + debug!("got pause_disk_removals"); + + cache.pause_disk_removals().await; + return_ok((), true) + })) + } + + fn resume_disk_removals(&mut self, _nvl: NvList) -> HandlerReturn { + let cache = self.cache.clone(); + Ok(Box::pin(async move { + debug!("got resume_disk_removals"); + + cache.resume_disk_removals().await; + return_ok((), true) + })) + } + fn sync_checkpoint(&mut self, nvl: NvList) -> HandlerReturn { let cache = self.cache.clone(); Ok(Box::pin(async move {