diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index e879646b637b..d4bca59c7b35 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap if attach_req.pageserver_id.is_some() { tenant_state.generation += 1; } + tenant_state.pageserver = attach_req.pageserver_id; let generation = tenant_state.generation; locked.save().await.map_err(ApiError::InternalServerError)?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f354296be223..68620787bbc2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -363,8 +363,15 @@ pub struct TimelineInfo { pub latest_gc_cutoff_lsn: Lsn, #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, + + /// The LSN that we have successfully uploaded to remote storage #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, + + /// The LSN that we are advertising to safekeepers + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn_visible: Lsn, + pub current_logical_size: Option<u64>, // is None when timeline is Unloaded /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1ddd156a087e..a92b87632bda 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -20,6 +20,7 @@ use std::{ use anyhow::{bail, Context}; +use serde::{Deserialize, Serialize}; use tokio::io; use toml_edit::Item; use tracing::info; @@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None; +/// As defined in S3 docs +pub const MAX_KEYS_PER_DELETE: usize = 1000; + const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; /// Path on the remote storage, relative to some inner prefix. @@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct RemotePath(PathBuf); +impl Serialize for RemotePath { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + serializer.collect_str(self) + } +} + +impl<'de> Deserialize<'de> for RemotePath { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + let str = String::deserialize(deserializer)?; + Ok(Self(PathBuf::from(&str))) + } +} + impl std::fmt::Display for RemotePath { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0.display()) @@ -88,6 +111,10 @@ impl RemotePath { pub fn extension(&self) -> Option<&str> { self.0.extension()?.to_str() } + + pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> { + self.0.strip_prefix(&p.0) + } } /// Storage (potentially remote) API to manage its state.
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 9262f1e88f15..acab9539042d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -33,11 +33,10 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000; - pub(super) mod metrics; use self::metrics::{AttemptOutcome, RequestKind}; @@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket { delete_objects.push(obj_id); } - for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) { + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); let resp = self diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 163c8c0467f7..88d50905c644 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -89,6 +89,22 @@ impl Generation { Self::Broken => panic!("Attempted to use a broken generation"), } } + + pub fn next(&self) -> Generation { + match self { + Self::Valid(n) => Self::Valid(*n + 1), + Self::None => Self::Valid(1), + Self::Broken => panic!("Attempted to use a broken generation"), + } + } + + pub fn into(self) -> Option { + if let Self::Valid(v) = self { + Some(v) + } else { + None + } + } } impl Serialize for Generation { diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 527e486fd0df..dd54cd6ecd6c 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -24,6 +24,9 @@ pub enum ApiError { #[error("Precondition failed: {0}")] PreconditionFailed(Box), + #[error("Shutting down")] + ShuttingDown, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -52,6 +55,10 @@ impl ApiError { self.to_string(), StatusCode::PRECONDITION_FAILED, ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b6a2117f9cfc..90c7c11194bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; @@ -20,6 +21,7 @@ use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, context::{DownloadBehavior, RequestContext}, + deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, @@ -346,9 +348,22 @@ fn start_pageserver( } }; + // Top-level cancellation token for the process + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Set up deletion queue + 
let (deletion_queue, deletion_workers) = DeletionQueue::new( + remote_storage.clone(), + ControlPlaneClient::new(conf, &shutdown_pageserver), + conf, + ); + if let Some(deletion_workers) = deletion_workers { + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); + } + // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. startup_checkpoint("initial", "Starting loading tenants"); @@ -379,13 +394,13 @@ fn start_pageserver( }; // Scan the local 'tenants/' directory and start loading the tenants - let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); - + let deletion_queue_client = deletion_queue.new_client(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), + deletion_queue_client, }, order, shutdown_pageserver.clone(), @@ -481,9 +496,10 @@ fn start_pageserver( http::routes::State::new( conf, http_auth.clone(), - remote_storage, + remote_storage.clone(), broker_client.clone(), disk_usage_eviction_state, + deletion_queue.new_client(), ) .context("Failed to initialize router state")?, ); @@ -611,7 +627,12 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); unreachable!() } }) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ee7f28c1175..ed767b764e23 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -475,8 +475,8 @@ impl PageServerConfigBuilder { self.background_task_maximum_delay = BuilderValue::Set(delay); } - pub fn control_plane_api(&mut self, api: Url) { - self.control_plane_api = BuilderValue::Set(Some(api)) + pub fn control_plane_api(&mut self, api: Option) { + self.control_plane_api = BuilderValue::Set(api) } pub fn build(self) -> anyhow::Result { @@ -580,6 +580,27 @@ impl PageServerConf { self.workdir.join(TENANTS_SEGMENT_NAME) } + pub fn deletion_prefix(&self) -> PathBuf { + self.workdir.join("deletion") + } + + pub fn deletion_list_path(&self, sequence: u64) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. + const VERSION: u8 = 1; + + self.deletion_prefix() + .join(format!("{sequence:016x}-{VERSION:02x}.list")) + } + + pub fn deletion_header_path(&self) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. 
+ const VERSION: u8 = 1; + + self.deletion_prefix().join(format!("header-{VERSION:02x}")) + } + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenants_path().join(tenant_id.to_string()) } @@ -747,7 +768,14 @@ impl PageServerConf { }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?), + "control_plane_api" => { + let parsed = parse_toml_string(key, item)?; + if parsed.is_empty() { + builder.control_plane_api(None) + } else { + builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + } + }, _ => bail!("unrecognized pageserver option '{key}'"), } } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 192eb167894b..555f76e5239e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,7 +1,9 @@ use std::collections::HashMap; -use hyper::StatusCode; -use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use pageserver_api::control_api::{ + ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, +}; +use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; use utils::{ @@ -12,25 +14,34 @@ use utils::{ use crate::config::PageServerConf; -// Backoffs when control plane requests do not succeed: compromise between reducing load -// on control plane, and retrying frequently when we are blocked on a control plane -// response to make progress. -const BACKOFF_INCREMENT: f64 = 0.1; -const BACKOFF_MAX: f64 = 10.0; - /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub(crate) struct ControlPlaneClient { +pub struct ControlPlaneClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, cancel: CancellationToken, } +/// Represent operations which internally retry on all errors other than +/// cancellation token firing: the only way they can fail is ShuttingDown. +pub enum RetryForeverError { + ShuttingDown, +} + +#[async_trait::async_trait] +pub trait ControlPlaneGenerationsApi { + async fn re_attach(&self) -> Result, RetryForeverError>; + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError>; +} + impl ControlPlaneClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. 
- pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { let mut url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), None => return None, @@ -54,27 +65,62 @@ impl ControlPlaneClient { }) } - async fn try_re_attach( + async fn retry_http_forever( &self, - url: Url, - request: &ReAttachRequest, - ) -> anyhow::Result { - match self.http_client.post(url).json(request).send().await { - Err(e) => Err(anyhow::Error::from(e)), - Ok(r) => { - if r.status() == StatusCode::OK { - r.json::() - .await - .map_err(anyhow::Error::from) - } else { - Err(anyhow::anyhow!("Unexpected status {}", r.status())) - } + url: &url::Url, + request: R, + ) -> Result + where + R: Serialize, + T: DeserializeOwned, + { + #[derive(thiserror::Error, Debug)] + enum RemoteAttemptError { + #[error("shutdown")] + Shutdown, + #[error("remote: {0}")] + Remote(reqwest::Error), + } + + match backoff::retry( + || async { + let response = self + .http_client + .post(url.clone()) + .json(&request) + .send() + .await + .map_err(RemoteAttemptError::Remote)?; + + response + .error_for_status_ref() + .map_err(RemoteAttemptError::Remote)?; + response + .json::() + .await + .map_err(RemoteAttemptError::Remote) + }, + |_| false, + 3, + u32::MAX, + "calling control plane generation validation API", + backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + ) + .await + { + Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), + Err(RemoteAttemptError::Remote(_)) => { + panic!("We retry forever, this should never be reached"); } + Ok(r) => Ok(r), } } +} - /// Block until we get a successful response - pub(crate) async fn re_attach(&self) -> anyhow::Result> { +#[async_trait::async_trait] +impl ControlPlaneGenerationsApi for ControlPlaneClient { + /// Block until we get a successful response, or error out if we are shut down + async fn re_attach(&self) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -83,37 +129,47 @@ impl ControlPlaneClient { node_id: self.node_id, }; - let mut attempt = 0; - loop { - let result = self.try_re_attach(re_attach_path.clone(), &request).await; - match result { - Ok(res) => { - tracing::info!( - "Received re-attach response with {} tenants", - res.tenants.len() - ); - - return Ok(res - .tenants - .into_iter() - .map(|t| (t.id, Generation::new(t.generation))) - .collect::>()); - } - Err(e) => { - tracing::error!("Error re-attaching tenants, retrying: {e:#}"); - backoff::exponential_backoff( - attempt, - BACKOFF_INCREMENT, - BACKOFF_MAX, - &self.cancel, - ) - .await; - if self.cancel.is_cancelled() { - return Err(anyhow::anyhow!("Shutting down")); - } - attempt += 1; - } - } - } + let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + tracing::info!( + "Received re-attach response with {} tenants", + response.tenants.len() + ); + + Ok(response + .tenants + .into_iter() + .map(|t| (t.id, Generation::new(t.generation))) + .collect::>()) + } + + /// Block until we get a successful response, or error out if we are shut down + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError> { + let re_attach_path = self + .base_url + .join("validate") + .expect("Failed to build validate path"); + + let request = ValidateRequest { + tenants: tenants + .into_iter() + .map(|(id, gen)| ValidateRequestTenant { + id, + gen: gen + .into() + 
.expect("Generation should always be valid for a Tenant doing deletions"), + }) + .collect(), + }; + + let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; + + Ok(response + .tenants + .into_iter() + .map(|rt| (rt.id, rt.valid)) + .collect()) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs new file mode 100644 index 000000000000..4c0d399789a9 --- /dev/null +++ b/pageserver/src/deletion_queue.rs @@ -0,0 +1,1312 @@ +mod deleter; +mod list_writer; +mod validator; + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::virtual_file::VirtualFile; +use anyhow::Context; +use hex::FromHex; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use serde::Deserialize; +use serde::Serialize; +use serde_with::serde_as; +use thiserror::Error; +use tokio; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use tracing::{self, debug, error}; +use utils::crashsafe::path_with_suffix_extension; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::AtomicLsn; +use utils::lsn::Lsn; + +use self::deleter::Deleter; +use self::list_writer::DeletionOp; +use self::list_writer::ListWriter; +use self::list_writer::RecoverOp; +use self::validator::Validator; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; +use validator::ValidatorQueueMessage; + +use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; + +// TODO: adminstrative "panic button" config property to disable all deletions +// TODO: configurable for how long to wait before executing deletions + +/// We aggregate object deletions from many tenants in one place, for several reasons: +/// - Coalesce deletions into fewer DeleteObjects calls +/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes +/// to flush any outstanding deletions. +/// - Globally control throughput of deletions, as these are a low priority task: do +/// not compete with the same S3 clients/connections used for higher priority uploads. +/// - Enable gating deletions on validation of a tenant's generation number, to make +/// it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md) +/// +/// There are two kinds of deletion: deferred and immediate. A deferred deletion +/// may be intentionally delayed to protect passive readers of S3 data, and is +/// subject to a generation number validation step. An immediate deletion is +/// ready to execute immediately, and is only queued up so that it can be coalesced +/// with other deletions in flight. +/// +/// Deferred deletions pass through three steps: +/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into +/// DeletionLists, which are persisted to disk. +/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing +/// the keys in the list onward for actual deletion. Also validate remote_consistent_lsn +/// updates for running timelines. +/// - Deleter: accumulate object keys that the validator has validated, and execute them in +/// batches of 1000 keys via DeleteObjects. 
+/// +/// Non-deferred deletions, such as during timeline deletion, bypass the first +/// two stages and are passed straight into the Deleter. +/// +/// Internally, each stage is joined by a channel to the next. On disk, there is only +/// one queue (of DeletionLists), which is written by the frontend and consumed +/// by the backend. +#[derive(Clone)] +pub struct DeletionQueue { + client: DeletionQueueClient, + + // Parent cancellation token for the tokens passed into background workers + cancel: CancellationToken, +} + +/// Opaque wrapper around individual worker tasks, to avoid making the +/// worker objects themselves public +pub struct DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync, +{ + frontend: ListWriter, + backend: Validator, + executor: Deleter, +} + +impl DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync + 'static, +{ + pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> { + let jh_frontend = runtime.spawn(async move { + self.frontend + .background() + .instrument(tracing::info_span!(parent:None, "deletion frontend")) + .await + }); + let jh_backend = runtime.spawn(async move { + self.backend + .background() + .instrument(tracing::info_span!(parent:None, "deletion backend")) + .await + }); + let jh_executor = runtime.spawn(async move { + self.executor + .background() + .instrument(tracing::info_span!(parent:None, "deletion executor")) + .await + }); + + runtime.spawn({ + async move { + jh_frontend.await.expect("error joining frontend worker"); + jh_backend.await.expect("error joining backend worker"); + drop(jh_executor.await.expect("error joining executor worker")); + } + }) + } +} + +/// A FlushOp is just a oneshot channel, where we send the transmit side down +/// another channel, and the receive side will receive a message when the channel +/// we're flushing has reached the FlushOp we sent into it. +/// +/// The only extra behavior beyond the channel is that the notify() method does not +/// return an error when the receive side has been dropped, because in this use case +/// it is harmless (the code that initiated the flush no longer cares about the result). +#[derive(Debug)] +struct FlushOp { + tx: tokio::sync::oneshot::Sender<()>, +} + +impl FlushOp { + fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) { + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + (Self { tx }, rx) + } + + fn notify(self) { + if self.tx.send(()).is_err() { + // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush. + debug!("deletion queue flush from dropped client"); + }; + } +} + +#[derive(Clone, Debug)] +pub struct DeletionQueueClient { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + + lsn_table: Arc>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TenantDeletionList { + /// For each Timeline, a list of key fragments to append to the timeline remote path + /// when reconstructing a full key + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + timelines: HashMap>, + + /// The generation in which this deletion was emitted: note that this may not be the + /// same as the generation of any layers being deleted. 
The generation of the layer + /// has already been absorbed into the keys in `objects` + generation: Generation, +} + +impl TenantDeletionList { + pub(crate) fn len(&self) -> usize { + self.timelines.values().map(|v| v.len()).sum() + } +} + +/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string +fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error> +where + S: serde::Serializer, + V: Serialize, + I: AsRef<[u8]>, +{ + let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); + + transformed + .collect::<HashMap<String, V>>() + .serialize(serializer) +} + +/// For HashMaps using a FromHex key, where we would like to decode the key +fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error> +where + D: serde::de::Deserializer<'de>, + V: Deserialize<'de>, + I: FromHex + std::hash::Hash + Eq, +{ + let hex_map = HashMap::<String, V>::deserialize(deserializer)?; + hex_map + .into_iter() + .map(|(k, v)| { + I::from_hex(k) + .map(|k| (k, v)) + .map_err(|_| serde::de::Error::custom("Invalid hex ID")) + }) + .collect() +} + +/// Files ending with this suffix will be ignored and erased +/// during recovery at startup. +const TEMP_SUFFIX: &str = ".tmp"; + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionList { + /// Serialization version, for future use + version: u8, + + /// Used for constructing a unique key for each deletion list we write out. + sequence: u64, + + /// To avoid repeating tenant/timeline IDs in every key, we store keys in + /// nested HashMaps by TenantTimelineID. Each Tenant only appears once + /// with one unique generation ID: if someone tries to push a second generation + /// ID for the same tenant, we will start a new DeletionList. + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + tenants: HashMap<TenantId, TenantDeletionList>, + + /// Avoid having to walk `tenants` to calculate the number of keys in + /// the nested deletion lists + size: usize, + + /// Set to true when the list has undergone validation with the control + /// plane and the remaining contents of `tenants` are valid. A list may + /// also be implicitly marked valid by DeletionHeader.validated_sequence + /// advancing to >= DeletionList.sequence + #[serde(default)] + #[serde(skip_serializing_if = "std::ops::Not::not")] + validated: bool, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionHeader { + /// Serialization version, for future use + version: u8, + + /// The highest sequence number (inclusive) that has been validated. All deletion + /// lists on disk with a sequence <= this value are safe to execute.
+ validated_sequence: u64, +} + +impl DeletionHeader { + const VERSION_LATEST: u8 = 1; + + fn new(validated_sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + validated_sequence, + } + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + debug!("Saving deletion list header {:?}", self); + let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; + let header_path = conf.deletion_header_path(); + let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); + VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + .await + .map_err(Into::into) + } +} + +impl DeletionList { + const VERSION_LATEST: u8 = 1; + fn new(sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + sequence, + tenants: HashMap::new(), + size: 0, + validated: false, + } + } + + fn is_empty(&self) -> bool { + self.tenants.is_empty() + } + + fn len(&self) -> usize { + self.size + } + + /// Returns true if the push was accepted, false if the caller must start a new + /// deletion list. + fn push( + &mut self, + tenant: &TenantId, + timeline: &TimelineId, + generation: Generation, + objects: &mut Vec, + ) -> bool { + if objects.is_empty() { + // Avoid inserting an empty TimelineDeletionList: this preserves the property + // that if we have no keys, then self.objects is empty (used in Self::is_empty) + return true; + } + + let tenant_entry = self + .tenants + .entry(*tenant) + .or_insert_with(|| TenantDeletionList { + timelines: HashMap::new(), + generation, + }); + + if tenant_entry.generation != generation { + // Only one generation per tenant per list: signal to + // caller to start a new list. + return false; + } + + let timeline_entry = tenant_entry + .timelines + .entry(*timeline) + .or_insert_with(Vec::new); + + let timeline_remote_path = remote_timeline_path(tenant, timeline); + + self.size += objects.len(); + timeline_entry.extend(objects.drain(..).map(|p| { + p.strip_prefix(&timeline_remote_path) + .expect("Timeline paths always start with the timeline prefix") + .to_string_lossy() + .to_string() + })); + true + } + + fn into_remote_paths(self) -> Vec { + let mut result = Vec::new(); + for (tenant, tenant_deletions) in self.tenants.into_iter() { + for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() { + let timeline_remote_path = remote_timeline_path(&tenant, &timeline); + result.extend( + timeline_layers + .into_iter() + .map(|l| timeline_remote_path.join(&PathBuf::from(l))), + ); + } + } + + result + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + let path = conf.deletion_list_path(self.sequence); + let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); + + let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); + VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + .await + .map_err(Into::into) + } +} + +impl std::fmt::Display for DeletionList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletionList", + self.sequence, + self.tenants.len(), + self.size + ) + } +} + +struct PendingLsn { + projected: Lsn, + result_slot: Arc, +} + +struct TenantLsnState { + timelines: HashMap, + + // In what generation was the most recent update proposed? 
+ generation: Generation, +} + +#[derive(Default)] +struct VisibleLsnUpdates { + tenants: HashMap, +} + +impl VisibleLsnUpdates { + fn new() -> Self { + Self { + tenants: HashMap::new(), + } + } +} + +impl std::fmt::Debug for VisibleLsnUpdates { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len()) + } +} + +#[derive(Error, Debug)] +pub enum DeletionQueueError { + #[error("Deletion queue unavailable during shutdown")] + ShuttingDown, +} + +impl DeletionQueueClient { + pub(crate) fn broken() -> Self { + // Channels whose receivers are immediately dropped. + let (tx, _rx) = tokio::sync::mpsc::channel(1); + let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); + Self { + tx, + executor_tx, + lsn_table: Arc::default(), + } + } + + /// This is cancel-safe. If you drop the future before it completes, the message + /// is not pushed, although in the context of the deletion queue it doesn't matter: once + /// we decide to do a deletion the decision is always final. + async fn do_push( + &self, + queue: &tokio::sync::mpsc::Sender, + msg: T, + ) -> Result<(), DeletionQueueError> { + match queue.send(msg).await { + Ok(_) => Ok(()), + Err(e) => { + // This shouldn't happen, we should shut down all tenants before + // we shut down the global delete queue. If we encounter a bug like this, + // we may leak objects as deletions won't be processed. + error!("Deletion queue closed while pushing, shutting down? ({e})"); + Err(DeletionQueueError::ShuttingDown) + } + } + } + + pub(crate) async fn recover( + &self, + attached_tenants: HashMap, + ) -> Result<(), DeletionQueueError> { + self.do_push( + &self.tx, + ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }), + ) + .await + } + + /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside + /// world, it must validate its generation number before doing so. Rather than do this synchronously, + /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most + /// recently validated separately. + /// + /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates. The + /// backend will later wake up and notice that the tenant's generation requires validation. + pub(crate) async fn update_remote_consistent_lsn( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + lsn: Lsn, + result_slot: Arc, + ) { + let mut locked = self + .lsn_table + .write() + .expect("Lock should never be poisoned"); + + let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState { + timelines: HashMap::new(), + generation: current_generation, + }); + + if tenant_entry.generation != current_generation { + // Generation might have changed if we were detached and then re-attached: in this case, + // state from the previous generation cannot be trusted. + tenant_entry.timelines.clear(); + tenant_entry.generation = current_generation; + } + + tenant_entry.timelines.insert( + timeline_id, + PendingLsn { + projected: lsn, + result_slot, + }, + ); + } + + /// Submit a list of layers for deletion: this function will return before the deletion is + /// persistent, but it may be executed at any time after this function enters: do not push + /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer + /// references them). 
+ /// + /// The `current_generation` is the generation of this pageserver's current attachment. The + /// generations in `layers` are the generations in which those layers were written. + pub(crate) async fn push_layers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + layers: Vec<(LayerFileName, Generation)>, + ) -> Result<(), DeletionQueueError> { + if current_generation.is_none() { + debug!("Enqueuing deletions in legacy mode, skipping queue"); + let mut layer_paths = Vec::new(); + for (layer, generation) in layers { + layer_paths.push(remote_layer_path( + &tenant_id, + &timeline_id, + &layer, + generation, + )); + } + self.push_immediate(layer_paths).await?; + return self.flush_immediate().await; + } + + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(layers.len() as u64); + self.do_push( + &self.tx, + ListWriterQueueMessage::Delete(DeletionOp { + tenant_id, + timeline_id, + layers, + generation: current_generation, + objects: Vec::new(), + }), + ) + .await + } + + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + async fn do_flush<T>( + &self, + queue: &tokio::sync::mpsc::Sender<T>, + msg: T, + rx: tokio::sync::oneshot::Receiver<()>, + ) -> Result<(), DeletionQueueError> { + self.do_push(queue, msg).await?; + if rx.await.is_err() { + // This shouldn't happen if tenants are shut down before deletion queue. If we + // encounter a bug like this, then a flusher will incorrectly believe it has flushed + // when it hasn't, possibly leading to leaking objects. + error!("Deletion queue dropped flush op while client was still waiting"); + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList) + /// + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + pub async fn flush(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx) + .await + } + + // Wait until all previous deletions are executed + pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> { + debug!("flush_execute: flushing to deletion lists..."); + // Flush any buffered work to deletion lists + self.flush().await?; + + // Flush the backend into the executor of deletion lists + let (flush_op, rx) = FlushOp::new(); + debug!("flush_execute: flushing backend..."); + self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing backend..."); + + // Flush any immediate-mode deletions (the above backend flush will only flush + // the executor if deletions had flowed through the backend) + debug!("flush_execute: flushing execution..."); + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing execution..."); + Ok(()) + } + + /// This interface bypasses the persistent deletion queue, and any validation + /// that this pageserver is still eligible to execute the deletions. It is for + /// use in timeline deletions, where the control plane is telling us we may + /// delete everything in the timeline. + /// + /// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
+ pub(crate) async fn push_immediate( + &self, + objects: Vec, + ) -> Result<(), DeletionQueueError> { + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(objects.len() as u64); + self.executor_tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown) + } + + /// Companion to push_immediate. When this returns Ok, all prior objects sent + /// into push_immediate have been deleted from remote storage. + pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.executor_tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } +} + +impl DeletionQueue { + pub fn new_client(&self) -> DeletionQueueClient { + self.client.clone() + } + + /// Caller may use the returned object to construct clients with new_client. + /// Caller should tokio::spawn the background() members of the two worker objects returned: + /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. + /// + /// If remote_storage is None, then the returned workers will also be None. + pub fn new( + remote_storage: Option, + control_plane_client: Option, + conf: &'static PageServerConf, + ) -> (Self, Option>) + where + C: ControlPlaneGenerationsApi + Send + Sync, + { + // Deep channel: it consumes deletions from all timelines and we do not want to block them + let (tx, rx) = tokio::sync::mpsc::channel(16384); + + // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions + let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16); + + // Shallow channel: it carries lists of paths, and we expect the main queueing to + // happen in the backend (persistent), not in this queue. + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16); + + let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())); + + // The deletion queue has an independent cancellation token to + // the general pageserver shutdown token, because it stays alive a bit + // longer to flush after Tenants have all been torn down. + let cancel = CancellationToken::new(); + + let remote_storage = match remote_storage { + None => { + return ( + Self { + client: DeletionQueueClient { + tx, + executor_tx, + lsn_table: lsn_table.clone(), + }, + cancel, + }, + None, + ) + } + Some(r) => r, + }; + + ( + Self { + client: DeletionQueueClient { + tx, + executor_tx: executor_tx.clone(), + lsn_table: lsn_table.clone(), + }, + cancel: cancel.clone(), + }, + Some(DeletionQueueWorkers { + frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()), + backend: Validator::new( + conf, + backend_rx, + executor_tx, + control_plane_client, + lsn_table.clone(), + cancel.clone(), + ), + executor: Deleter::new(remote_storage, executor_rx, cancel.clone()), + }), + ) + } + + pub async fn shutdown(&mut self, timeout: Duration) { + self.cancel.cancel(); + + match tokio::time::timeout(timeout, self.client.flush()).await { + Ok(Ok(())) => { + tracing::info!("Deletion queue flushed successfully on shutdown") + } + Ok(Err(DeletionQueueError::ShuttingDown)) => { + // This is not harmful for correctness, but is unexpected: the deletion + // queue's workers should stay alive as long as there are any client handles instantiated. 
+ tracing::warn!("Deletion queue stopped prematurely"); + } + Err(_timeout) => { + tracing::warn!("Timed out flushing deletion queue on shutdown") + } + } + } +} + +#[cfg(test)] +mod test { + use hex_literal::hex; + use std::{ + io::ErrorKind, + path::{Path, PathBuf}, + time::Duration, + }; + use tracing::info; + + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; + use tokio::task::JoinHandle; + + use crate::{ + control_plane_client::RetryForeverError, + repository::Key, + tenant::{ + harness::TenantHarness, remote_timeline_client::remote_timeline_path, + storage_layer::DeltaFileName, + }, + }; + + use super::*; + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), + }); + + // When you need a second layer in a test. + pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), + }); + + struct TestSetup { + harness: TenantHarness, + remote_fs_dir: PathBuf, + storage: GenericRemoteStorage, + mock_control_plane: MockControlPlane, + deletion_queue: DeletionQueue, + worker_join: JoinHandle<()>, + } + + impl TestSetup { + /// Simulate a pageserver restart by destroying and recreating the deletion queue + async fn restart(&mut self) { + let (deletion_queue, workers) = DeletionQueue::new( + Some(self.storage.clone()), + Some(self.mock_control_plane.clone()), + self.harness.conf, + ); + + tracing::debug!("Spawning worker for new queue queue"); + let worker_join = workers + .unwrap() + .spawn_with(&tokio::runtime::Handle::current()); + + let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join); + let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue); + + tracing::debug!("Joining worker from previous queue"); + old_deletion_queue.cancel.cancel(); + old_worker_join + .await + .expect("Failed to join workers for previous deletion queue"); + } + + fn set_latest_generation(&self, gen: Generation) { + let tenant_id = self.harness.tenant_id; + self.mock_control_plane + .latest_generation + .lock() + .unwrap() + .insert(tenant_id, gen); + } + + /// Returns remote layer file name, suitable for use in assert_remote_files + fn write_remote_layer( + &self, + file_name: LayerFileName, + gen: Generation, + ) -> anyhow::Result { + let tenant_id = self.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); + std::fs::create_dir_all(&remote_timeline_path)?; + let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + + let content: Vec = format!("placeholder contents of {file_name}").into(); + + std::fs::write( + remote_timeline_path.join(remote_layer_file_name.clone()), + content, + )?; + + Ok(remote_layer_file_name) + } + } + + #[derive(Debug, Clone)] + struct MockControlPlane { + pub latest_generation: std::sync::Arc>>, + } + + impl MockControlPlane { + fn new() -> Self { + Self { + latest_generation: Arc::default(), + } + } + } + + #[async_trait::async_trait] + impl ControlPlaneGenerationsApi for MockControlPlane { + #[allow(clippy::diverging_sub_expression)] // False positive via 
async_trait + async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> { + unimplemented!() + } + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result<HashMap<TenantId, bool>, RetryForeverError> { + let mut result = HashMap::new(); + + let latest_generation = self.latest_generation.lock().unwrap(); + + for (tenant_id, generation) in tenants { + if let Some(latest) = latest_generation.get(&tenant_id) { + result.insert(tenant_id, *latest == generation); + } + } + + Ok(result) + } + } + + fn setup(test_name: &str) -> anyhow::Result<TestSetup> { + let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + + // We do not load() the harness: we only need its config and remote_storage + + // Set up a GenericRemoteStorage targeting a directory + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + .unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let mock_control_plane = MockControlPlane::new(); + + let (deletion_queue, worker) = DeletionQueue::new( + Some(storage.clone()), + Some(mock_control_plane.clone()), + harness.conf, + ); + + let worker = worker.unwrap(); + let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); + + Ok(TestSetup { + harness, + remote_fs_dir, + storage, + mock_control_plane, + deletion_queue, + worker_join, + }) + } + + // TODO: put this in a common location so that we can share with remote_timeline_client's tests + fn assert_remote_files(expected: &[&str], remote_path: &Path) { + let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect(); + expected.sort(); + + let mut found: Vec<String> = Vec::new(); + let dir = match std::fs::read_dir(remote_path) { + Ok(d) => d, + Err(e) => { + if e.kind() == ErrorKind::NotFound { + if expected.is_empty() { + // We are asserting prefix is empty: it is expected that the dir is missing + return; + } else { + assert_eq!(expected, Vec::<String>::new()); + unreachable!(); + } + } else { + panic!( + "Unexpected error listing {}: {e}", + remote_path.to_string_lossy() + ); + } + } + }; + + for entry in dir.flatten() { + let entry_name = entry.file_name(); + let fname = entry_name.to_str().unwrap(); + found.push(String::from(fname)); + } + found.sort(); + + assert_eq!(expected, found); + } + + fn assert_local_files(expected: &[&str], directory: &Path) { + let dir = match std::fs::read_dir(directory) { + Ok(d) => d, + Err(_) => { + assert_eq!(expected, &Vec::<String>::new()); + return; + } + }; + let mut found = Vec::new(); + for dentry in dir { + let dentry = dentry.unwrap(); + let file_name = dentry.file_name(); + let file_name_str = file_name.to_string_lossy(); + found.push(file_name_str.to_string()); + } + found.sort(); + assert_eq!(expected, found); + } + + #[tokio::test] + async fn deletion_queue_smoke() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); +
client.recover(HashMap::new()).await?; + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let tenant_id = ctx.harness.tenant_id; + + let content: Vec = "victim1 contents".into(); + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + // Exercise the distinction between the generation of the layers + // we delete, and the generation of the running Tenant. + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + let remote_layer_file_name_1 = + format!("{}{}", layer_file_name_1, layer_generation.get_suffix()); + + // Set mock control plane state to valid for our generation + ctx.set_latest_generation(now_generation); + + // Inject a victim file to remote storage + info!("Writing"); + std::fs::create_dir_all(&remote_timeline_path)?; + std::fs::write( + remote_timeline_path.join(remote_layer_file_name_1.clone()), + content, + )?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) + info!("Pushing"); + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_generation)].to_vec(), + ) + .await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + assert_local_files(&[], &deletion_prefix); + + // File should still be there after we write a deletion list (we haven't pushed enough to execute anything) + info!("Flushing"); + client.flush().await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + assert_local_files(&["0000000000000001-01.list"], &deletion_prefix); + + // File should go away when we execute + info!("Flush-executing"); + client.flush_execute().await?; + assert_remote_files(&[], &remote_timeline_path); + assert_local_files(&["header-01"], &deletion_prefix); + + // Flushing on an empty queue should succeed immediately, and not write any lists + info!("Flush-executing on empty"); + client.flush_execute().await?; + assert_local_files(&["header-01"], &deletion_prefix); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_validation() -> anyhow::Result<()> { + let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + // Generation that the control plane thinks is current + let latest_generation = Generation::new(0xdeadbeef); + // Generation that our DeletionQueue thinks the tenant is running with + let stale_generation = latest_generation.previous(); + // Generation that our example layer file was written with + let layer_generation = stale_generation.previous(); + + ctx.set_latest_generation(latest_generation); + + let tenant_id = ctx.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + + // Initial state: a remote layer exists + let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + 
stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a stale generation: it should have failed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a fresh generation: it should have passed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[], &remote_timeline_path); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_recovery() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + let tenant_id = ctx.harness.tenant_id; + + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + // Inject a deletion in the generation before generation_now: after restart, + // this deletion should _not_ get executed (only the immediately previous + // generation gets that treatment) + let remote_layer_file_name_historical = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // Inject a deletion in generation_now itself: after restart (which advances + // the generation), this deletion should get executed, because we execute + // deletions in the immediately previous generation on the same node. + let remote_layer_file_name_previous = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(), + ) + .await?; + + client.flush().await?; + assert_remote_files( + &[ + &remote_layer_file_name_historical, + &remote_layer_file_name_previous, + ], + &remote_timeline_path, + ); + + // Different generations for the same tenant will cause two separate + // deletion lists to be emitted. + assert_local_files( + &["0000000000000001-01.list", "0000000000000002-01.list"], + &deletion_prefix, + ); + + // Simulate a node restart: the latest generation advances + let now_generation = now_generation.next(); + ctx.set_latest_generation(now_generation); + + // Restart the deletion queue + drop(client); + ctx.restart().await; + let client = ctx.deletion_queue.new_client(); + client + .recover(HashMap::from([(tenant_id, now_generation)])) + .await?; + + info!("Flush-executing"); + client.flush_execute().await?; + // The deletion from the immediately prior generation was executed, the one from + // an older generation was not.
+ assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path); + Ok(()) + } +} + +/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence +/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. +#[cfg(test)] +pub(crate) mod mock { + use tracing::info; + + use crate::tenant::remote_timeline_client::remote_layer_path; + + use super::*; + use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }; + + pub struct ConsumerState { + rx: tokio::sync::mpsc::Receiver, + executor_rx: tokio::sync::mpsc::Receiver, + } + + impl ConsumerState { + async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize { + let mut executed = 0; + + info!("Executing all pending deletions"); + + // Transform all executor messages to generic frontend messages + while let Ok(msg) = self.executor_rx.try_recv() { + match msg { + DeleterMessage::Delete(objects) => { + for path in objects { + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + DeleterMessage::Flush(flush_op) => { + flush_op.notify(); + } + } + } + + while let Ok(msg) = self.rx.try_recv() { + match msg { + ListWriterQueueMessage::Delete(op) => { + let mut objects = op.objects; + for (layer, generation) in op.layers { + objects.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + + for path in objects { + info!("Executing deletion {path}"); + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + ListWriterQueueMessage::Flush(op) => { + op.notify(); + } + ListWriterQueueMessage::FlushExecute(op) => { + // We have already executed all prior deletions because mock does them inline + op.notify(); + } + ListWriterQueueMessage::Recover(_) => { + // no-op in mock + } + } + info!("All pending deletions have been executed"); + } + + executed + } + } + + pub struct MockDeletionQueue { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + executed: Arc, + remote_storage: Option, + consumer: std::sync::Mutex, + lsn_table: Arc>, + } + + impl MockDeletionQueue { + pub fn new(remote_storage: Option) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel(16384); + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384); + + let executed = Arc::new(AtomicUsize::new(0)); + + Self { + tx, + executor_tx, + executed, + remote_storage, + consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), + } + } + + pub fn get_executed(&self) -> usize { + self.executed.load(Ordering::Relaxed) + } + + #[allow(clippy::await_holding_lock)] + pub async fn pump(&self) { + if let Some(remote_storage) = &self.remote_storage { + // Permit holding mutex across await, because this is only ever + // called once at a time in tests. 
+ let mut locked = self.consumer.lock().unwrap(); + let count = locked.consume(remote_storage).await; + self.executed.fetch_add(count, Ordering::Relaxed); + } + } + + pub(crate) fn new_client(&self) -> DeletionQueueClient { + DeletionQueueClient { + tx: self.tx.clone(), + executor_tx: self.executor_tx.clone(), + lsn_table: self.lsn_table.clone(), + } + } + } +} diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs new file mode 100644 index 000000000000..5c6e7dc9d7b6 --- /dev/null +++ b/pageserver/src/deletion_queue/deleter.rs @@ -0,0 +1,156 @@ +//! The deleter is the final stage in the deletion queue. It accumulates remote +//! paths to delete, and periodically executes them in batches of up to 1000 +//! using the DeleteObjects request. +//! +//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller +//! number of full-sized DeleteObjects requests, rather than a larger number of +//! smaller requests. + +use remote_storage::GenericRemoteStorage; +use remote_storage::RemotePath; +use remote_storage::MAX_KEYS_PER_DELETE; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use tracing::info; +use tracing::warn; + +use crate::metrics; + +use super::DeletionQueueError; +use super::FlushOp; + +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +pub(super) enum DeleterMessage { + Delete(Vec), + Flush(FlushOp), +} + +/// Non-persistent deletion queue, for coalescing multiple object deletes into +/// larger DeleteObjects requests. +pub(super) struct Deleter { + // Accumulate up to 1000 keys for the next deletion operation + accumulator: Vec, + + rx: tokio::sync::mpsc::Receiver, + + cancel: CancellationToken, + remote_storage: GenericRemoteStorage, +} + +impl Deleter { + pub(super) fn new( + remote_storage: GenericRemoteStorage, + rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, + ) -> Self { + Self { + remote_storage, + rx, + cancel, + accumulator: Vec::new(), + } + } + + /// Wrap the remote `delete_objects` with a failpoint + async fn remote_delete(&self) -> Result<(), anyhow::Error> { + fail::fail_point!("deletion-queue-before-execute", |_| { + info!("Skipping execution, failpoint set"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["failpoint"]) + .inc(); + Err(anyhow::anyhow!("failpoint hit")) + }); + + self.remote_storage.delete_objects(&self.accumulator).await + } + + /// Block until everything in accumulator has been executed + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + match self.remote_delete().await { + Ok(()) => { + // Note: we assume that the remote storage layer returns Ok(()) if some + // or all of the deleted objects were already gone. 
+ metrics::DELETION_QUEUE + .keys_executed + .inc_by(self.accumulator.len() as u64); + info!( + "Executed deletion batch {}..{}", + self.accumulator + .first() + .expect("accumulator should be non-empty"), + self.accumulator + .last() + .expect("accumulator should be non-empty"), + ); + self.accumulator.clear(); + } + Err(e) => { + warn!("DeleteObjects request failed: {e:#}, will retry"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["execute"]) + .inc(); + } + }; + } + if self.cancel.is_cancelled() { + // Expose an error because we may not have actually flushed everything + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> { + self.accumulator.reserve(MAX_KEYS_PER_DELETE); + + loop { + if self.cancel.is_cancelled() { + return Err(DeletionQueueError::ShuttingDown); + } + + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + return Err(DeletionQueueError::ShuttingDown); + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending + self.flush().await?; + + continue; + } + }; + + match msg { + DeleterMessage::Delete(mut list) => { + while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE { + if self.accumulator.len() == MAX_KEYS_PER_DELETE { + self.flush().await?; + // If we have received this number of keys, proceed with attempting to execute + assert_eq!(self.accumulator.len(), 0); + } + + let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len(); + let take_count = std::cmp::min(available_slots, list.len()); + for path in list.drain(list.len() - take_count..) { + self.accumulator.push(path); + } + } + } + DeleterMessage::Flush(flush_op) => { + // If flush() errors, we drop the flush_op and the caller will get + // an error recv()'ing their oneshot channel. + self.flush().await?; + flush_op.notify(); + } + } + } + } +} diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs new file mode 100644 index 000000000000..618a59f8fef8 --- /dev/null +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -0,0 +1,487 @@ +//! The list writer is the first stage in the deletion queue. It accumulates +//! layers to delete, and periodically writes out these layers into a persistent +//! DeletionList. +//! +//! The purpose of writing DeletionLists is to decouple the decision to +//! delete an object from the validation required to execute it: even if +//! validation is not possible, e.g. due to a control plane outage, we can +//! still persist our intent to delete an object, in a way that would +//! survive a restart. +//! +//! DeletionLists are passed onwards to the Validator. 
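+//!
+//! End to end, a layer's key passes through: ListWriter (buffer, persist to a
+//! DeletionList on local disk) -> Validator (check the originating generation
+//! with the control plane) -> Deleter (batch into DeleteObjects requests); the
+//! on-disk list is erased once its keys have been executed.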
+ +use super::DeletionHeader; +use super::DeletionList; +use super::FlushOp; +use super::ValidatorQueueMessage; + +use std::collections::HashMap; +use std::fs::create_dir_all; +use std::time::Duration; + +use regex::Regex; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::debug; +use tracing::info; +use tracing::warn; +use utils::generation::Generation; +use utils::id::TenantId; +use utils::id::TimelineId; + +use crate::config::PageServerConf; +use crate::deletion_queue::TEMP_SUFFIX; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::storage_layer::LayerFileName; + +// The number of keys in a DeletionList before we will proactively persist it +// (without reaching a flush deadline). This aims to deliver objects of the order +// of magnitude 1MB when we are under heavy delete load. +const DELETION_LIST_TARGET_SIZE: usize = 16384; + +// Ordinarily, we only flush to DeletionList periodically, to bound the window during +// which we might leak objects from not flushing a DeletionList after +// the objects are already unlinked from timeline metadata. +const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000); + +// If someone is waiting for a flush to DeletionList, only delay a little to accumulate +// more objects before doing the flush. +const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); + +#[derive(Debug)] +pub(super) struct DeletionOp { + pub(super) tenant_id: TenantId, + pub(super) timeline_id: TimelineId, + // `layers` and `objects` are both just lists of objects. `layers` is used if you do not + // have a config object handy to project it to a remote key, and need the consuming worker + // to do it for you. + pub(super) layers: Vec<(LayerFileName, Generation)>, + pub(super) objects: Vec, + + /// The _current_ generation of the Tenant attachment in which we are enqueuing + /// this deletion. + pub(super) generation: Generation, +} + +#[derive(Debug)] +pub(super) struct RecoverOp { + pub(super) attached_tenants: HashMap, +} + +#[derive(Debug)] +pub(super) enum ListWriterQueueMessage { + Delete(DeletionOp), + // Wait until all prior deletions make it into a persistent DeletionList + Flush(FlushOp), + // Wait until all prior deletions have been executed (i.e. objects are actually deleted) + FlushExecute(FlushOp), + // Call once after re-attaching to control plane, to notify the deletion queue about + // latest attached generations & load any saved deletion lists from disk. + Recover(RecoverOp), +} + +pub(super) struct ListWriter { + conf: &'static PageServerConf, + + // Incoming frontend requests to delete some keys + rx: tokio::sync::mpsc::Receiver, + + // Outbound requests to the backend to execute deletion lists we have composed. + tx: tokio::sync::mpsc::Sender, + + // The list we are currently building, contains a buffer of keys to delete + // and our next sequence number + pending: DeletionList, + + // These FlushOps should notify the next time we flush + pending_flushes: Vec, + + // Worker loop is torn down when this fires. + cancel: CancellationToken, + + // Safety guard to do recovery exactly once + recovered: bool, +} + +impl ListWriter { + // Initially DeletionHeader.validated_sequence is zero. The place we start our + // sequence numbers must be higher than that. 
+ const BASE_SEQUENCE: u64 = 1; + + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + cancel: CancellationToken, + ) -> Self { + Self { + pending: DeletionList::new(Self::BASE_SEQUENCE), + conf, + rx, + tx, + pending_flushes: Vec::new(), + cancel, + recovered: false, + } + } + + /// Try to flush `list` to persistent storage + /// + /// This does not return errors, because on failure to flush we do not lose + /// any state: flushing will be retried implicitly on the next deadline + async fn flush(&mut self) { + if self.pending.is_empty() { + for f in self.pending_flushes.drain(..) { + f.notify(); + } + return; + } + + match self.pending.save(self.conf).await { + Ok(_) => { + info!(sequence = self.pending.sequence, "Stored deletion list"); + + for f in self.pending_flushes.drain(..) { + f.notify(); + } + + // Take the list we've accumulated, replace it with a fresh list for the next sequence + let next_list = DeletionList::new(self.pending.sequence + 1); + let list = std::mem::replace(&mut self.pending, next_list); + + if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await { + // This is allowed to fail: it will only happen if the backend worker is shut down, + // so we can just drop this on the floor. + info!("Deletion list dropped, this is normal during shutdown ({e:#})"); + } + } + Err(e) => { + metrics::DELETION_QUEUE.unexpected_errors.inc(); + warn!( + sequence = self.pending.sequence, + "Failed to write deletion list, will retry later ({e:#})" + ); + } + } + } + + /// Load the header, to learn the sequence number up to which deletions + /// have been validated. We will apply validated=true to DeletionLists + /// <= this sequence when loading them. + /// + /// It is not an error for the header to not exist: we return None, and + /// the caller should act as if validated_sequence is 0 + async fn load_validated_sequence(&self) -> Result, anyhow::Error> { + let header_path = self.conf.deletion_header_path(); + match tokio::fs::read(&header_path).await { + Ok(header_bytes) => { + match serde_json::from_slice::(&header_bytes) { + Ok(h) => Ok(Some(h.validated_sequence)), + Err(e) => { + warn!( + "Failed to deserialize deletion header, ignoring {}: {e:#}", + header_path.display() + ); + // This should never happen unless we make a mistake with our serialization. + // Ignoring a deletion header is not consequential for correctnes because all deletions + // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up. 
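+ // (The header only records how far validation had progressed; without it,
+ // old lists are simply reloaded as unvalidated and go through validation again.)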
+ metrics::DELETION_QUEUE.unexpected_errors.inc(); + Ok(None) + } + } + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + debug!( + "Deletion header {} not found, first start?", + header_path.display() + ); + Ok(None) + } else { + Err(anyhow::anyhow!(e)) + } + } + } + } + + async fn recover( + &mut self, + attached_tenants: HashMap, + ) -> Result<(), anyhow::Error> { + debug!( + "recovering with {} attached tenants", + attached_tenants.len() + ); + + // Load the header + let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0); + + self.pending.sequence = validated_sequence + 1; + + let deletion_directory = self.conf.deletion_prefix(); + let mut dir = match tokio::fs::read_dir(&deletion_directory).await { + Ok(d) => d, + Err(e) => { + warn!( + "Failed to open deletion list directory {}: {e:#}", + deletion_directory.display(), + ); + + // Give up: if we can't read the deletion list directory, we probably can't + // write lists into it later, so the queue won't work. + return Err(e.into()); + } + }; + + let list_name_pattern = + Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); + + let header_path = self.conf.deletion_header_path(); + let mut seqs: Vec = Vec::new(); + while let Some(dentry) = dir.next_entry().await? { + let file_name = dentry.file_name(); + let dentry_str = file_name.to_string_lossy(); + + if Some(file_name.as_os_str()) == header_path.file_name() { + // Don't try and parse the header's name like a list + continue; + } + + if dentry_str.ends_with(TEMP_SUFFIX) { + info!("Cleaning up temporary file {dentry_str}"); + let absolute_path = deletion_directory.join(dentry.file_name()); + if let Err(e) = tokio::fs::remove_file(&absolute_path).await { + // Non-fatal error: we will just leave the file behind but not + // try and load it. + warn!( + "Failed to clean up temporary file {}: {e:#}", + absolute_path.display() + ); + } + + continue; + } + + let file_name = dentry.file_name().to_owned(); + let basename = file_name.to_string_lossy(); + let seq_part = if let Some(m) = list_name_pattern.captures(&basename) { + m.name("sequence") + .expect("Non optional group should be present") + .as_str() + } else { + warn!("Unexpected key in deletion queue: {basename}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + }; + + let seq: u64 = match u64::from_str_radix(seq_part, 16) { + Ok(s) => s, + Err(e) => { + warn!("Malformed key '{basename}': {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + seqs.push(seq); + } + seqs.sort(); + + // Start our next deletion list from after the last location validated by + // previous process lifetime, or after the last location found (it is updated + // below after enumerating the deletion lists) + self.pending.sequence = validated_sequence + 1; + if let Some(max_list_seq) = seqs.last() { + self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1); + } + + for s in seqs { + let list_path = self.conf.deletion_list_path(s); + + let list_bytes = tokio::fs::read(&list_path).await?; + + let mut deletion_list = match serde_json::from_slice::(&list_bytes) { + Ok(l) => l, + Err(e) => { + // Drop the list on the floor: any objects it referenced will be left behind + // for scrubbing to clean up. This should never happen unless we have a serialization bug. 
+ warn!(sequence = s, "Failed to deserialize deletion list: {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + + if deletion_list.sequence <= validated_sequence { + // If the deletion list falls below valid_seq, we may assume that it was + // already validated the last time this pageserver ran. Otherwise, we still + // load it, as it may still contain content valid in this generation. + deletion_list.validated = true; + } else { + // Special case optimization: if a tenant is still attached, and no other + // generation was issued to another node in the interval while we restarted, + // then we may treat deletion lists from the previous generation as if they + // belong to our currently attached generation, and proceed to validate & execute. + for (tenant_id, tenant_list) in &mut deletion_list.tenants { + if let Some(attached_gen) = attached_tenants.get(tenant_id) { + if attached_gen.previous() == tenant_list.generation { + tenant_list.generation = *attached_gen; + } + } + } + } + + info!( + validated = deletion_list.validated, + sequence = deletion_list.sequence, + "Recovered deletion list" + ); + + // We will drop out of recovery if this fails: it indicates that we are shutting down + // or the backend has panicked + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(deletion_list.len() as u64); + self.tx + .send(ValidatorQueueMessage::Delete(deletion_list)) + .await?; + } + + info!(next_sequence = self.pending.sequence, "Replay complete"); + + Ok(()) + } + + /// This is the front-end ingest, where we bundle up deletion requests into DeletionList + /// and write them out, for later validation by the backend and execution by the executor. + pub(super) async fn background(&mut self) { + info!("Started deletion frontend worker"); + + // Synchronous, but we only do it once per process lifetime so it's tolerable + if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) { + tracing::error!( + "Failed to create deletion list directory {}, deletions will not be executed ({e})", + self.conf.deletion_prefix().display() + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + return; + } + + while !self.cancel.is_cancelled() { + let timeout = if self.pending_flushes.is_empty() { + FRONTEND_DEFAULT_TIMEOUT + } else { + FRONTEND_FLUSHING_TIMEOUT + }; + + let msg = match tokio::time::timeout(timeout, self.rx.recv()).await { + Ok(Some(msg)) => msg, + Ok(None) => { + // Queue sender destroyed, shutting down + break; + } + Err(_) => { + // Hit deadline, flush. + self.flush().await; + continue; + } + }; + + match msg { + ListWriterQueueMessage::Delete(op) => { + assert!( + self.recovered, + "Cannot process deletions before recovery. This is a bug." + ); + + debug!( + "Delete: ingesting {} layers, {} other objects", + op.layers.len(), + op.objects.len() + ); + + let mut layer_paths = Vec::new(); + for (layer, generation) in op.layers { + layer_paths.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + layer_paths.extend(op.objects); + + if !self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ) { + self.flush().await; + let retry_succeeded = self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ); + if !retry_succeeded { + // Unexpected: after we flush, we should have + // drained self.pending, so a conflict on + // generation numbers should be impossible. + tracing::error!( + "Failed to enqueue deletions, leaking objects. This is a bug." 
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ }
+ }
+ }
+ ListWriterQueueMessage::Flush(op) => {
+ if self.pending.is_empty() {
+ // Execute immediately
+ debug!("Flush: No pending objects, flushing immediately");
+ op.notify()
+ } else {
+ // Execute next time we flush
+ debug!("Flush: adding to pending flush list for next deadline flush");
+ self.pending_flushes.push(op);
+ }
+ }
+ ListWriterQueueMessage::FlushExecute(op) => {
+ debug!("FlushExecute: passing through to backend");
+ // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+ if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+ info!("Can't flush, shutting down ({e})");
+ // Caller will get error when their oneshot sender was dropped.
+ }
+ }
+ ListWriterQueueMessage::Recover(op) => {
+ if self.recovered {
+ tracing::error!(
+ "Deletion queue recovery called more than once. This is a bug."
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+ continue;
+ }
+
+ if let Err(e) = self.recover(op.attached_tenants).await {
+ // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+ // queue receiver has been dropped, or something is critically broken with
+ // the local filesystem holding deletion lists.
+ info!(
+ "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ return;
+ } else {
+ self.recovered = true;
+ }
+ }
+ }
+
+ if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+ self.flush().await;
+ }
+ }
+ info!("Deletion queue shut down.");
+ }
+}
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
new file mode 100644
index 000000000000..64603045d23b
--- /dev/null
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whether the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old WAL can be deleted.
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+// +// This also causes updates to remote_consistent_lsn to be validated, even +// if there were no deletions enqueued. +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +// If we have received this number of keys, proceed with attempting to execute +const AUTOFLUSH_KEY_COUNT: usize = 16384; + +#[derive(Debug)] +pub(super) enum ValidatorQueueMessage { + Delete(DeletionList), + Flush(FlushOp), +} +pub(super) struct Validator +where + C: ControlPlaneGenerationsApi, +{ + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + + // Client for calling into control plane API for validation of deletes + control_plane_client: Option, + + // DeletionLists which are waiting generation validation. Not safe to + // execute until [`validate`] has processed them. + pending_lists: Vec, + + // DeletionLists which have passed validation and are ready to execute. + validated_lists: Vec, + + // Sum of all the lengths of lists in pending_lists + pending_key_count: usize, + + // Lsn validation state: we read projected LSNs and write back visible LSNs + // after validation. This is the LSN equivalent of `pending_validation_lists`: + // it is drained in [`validate`] + lsn_table: Arc>, + + // If we failed to rewrite a deletion list due to local filesystem I/O failure, + // we must remember that and refuse to advance our persistent validated sequence + // number past the failure. + list_write_failed: Option, + + cancel: CancellationToken, +} + +impl Validator +where + C: ControlPlaneGenerationsApi, +{ + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + control_plane_client: Option, + lsn_table: Arc>, + cancel: CancellationToken, + ) -> Self { + Self { + conf, + rx, + tx, + control_plane_client, + lsn_table, + pending_lists: Vec::new(), + validated_lists: Vec::new(), + pending_key_count: 0, + list_write_failed: None, + cancel, + } + } + /// Process any outstanding validations of generations of pending LSN updates or pending + /// DeletionLists. + /// + /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists + /// go into the queue of ready-to-execute lists. + async fn validate(&mut self) -> Result<(), DeletionQueueError> { + let mut tenant_generations = HashMap::new(); + for list in &self.pending_lists { + for (tenant_id, tenant_list) in &list.tenants { + // Note: DeletionLists are in logical time order, so generation always + // goes up. By doing a simple insert() we will always end up with + // the latest generation seen for a tenant. 
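+ // e.g. if two pending lists mention tenant T at generations 3 and then 4, we
+ // validate against 4; the older list's entries for T then fail the equality
+ // check further down and are dropped (leaked for the scrubber to clean up).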
+ tenant_generations.insert(*tenant_id, tenant_list.generation); + } + } + + let pending_lsn_updates = { + let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned"); + std::mem::take(&mut *lsn_table) + }; + for (tenant_id, update) in &pending_lsn_updates.tenants { + let entry = tenant_generations + .entry(*tenant_id) + .or_insert(update.generation); + if update.generation > *entry { + *entry = update.generation; + } + } + + if tenant_generations.is_empty() { + // No work to do + return Ok(()); + } + + let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { + match control_plane_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); + } + } + } else { + // Control plane API disabled. In legacy mode we consider everything valid. + tenant_generations.keys().map(|k| (*k, true)).collect() + }; + + let mut validated_sequence: Option = None; + + // Apply the validation results to the pending LSN updates + for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants { + let validated_generation = tenant_generations + .get(&tenant_id) + .expect("Map was built from the same keys we're reading"); + + let valid = tenants_valid + .get(&tenant_id) + .copied() + // If the tenant was missing from the validation response, it has been deleted. + // The Timeline that requested the LSN update is probably already torn down, + // or will be torn down soon. In this case, drop the update by setting valid=false. + .unwrap_or(false); + + if valid && *validated_generation == tenant_lsn_state.generation { + for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines { + pending_lsn.result_slot.store(pending_lsn.projected); + } + } else { + // If we failed validation, then do not apply any of the projected updates + warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); + } + } + + // Apply the validation results to the pending deletion lists + for list in &mut self.pending_lists { + // Filter the list based on whether the server responded valid: true. + // If a tenant is omitted in the response, it has been deleted, and we should + // proceed with deletion. + let mut mutated = false; + list.tenants.retain(|tenant_id, tenant| { + let validated_generation = tenant_generations + .get(tenant_id) + .expect("Map was built from the same keys we're reading"); + + // If the tenant was missing from the validation response, it has been deleted. + // This means that a deletion is valid, but also redundant since the tenant's + // objects should have already been deleted. Treat it as invalid to drop the + // redundant deletion. + let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false); + + // A list is valid if it comes from the current _or previous_ generation. + // - The previous generation case is permitted due to how we store deletion lists locally: + // if we see the immediately previous generation in a locally stored deletion list, + // it proves that this node's disk was used for both current & previous generations, + // and therefore no other node was involved in between: the two generations may be + // logically treated as the same. 
+ // - In that previous generation case, we rewrote it to the current generation + // in recover(), so the comparison here is simply an equality. + + let this_list_valid = valid + && (tenant.generation == *validated_generation); + + if !this_list_valid { + warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); + mutated = true; + } + this_list_valid + }); + list.validated = true; + + if mutated { + // Save the deletion list if we had to make changes due to stale generations. The + // saved list is valid for execution. + if let Err(e) = list.save(self.conf).await { + // Highly unexpected. Could happen if e.g. disk full. + // If we didn't save the trimmed list, it is _not_ valid to execute. + warn!("Failed to save modified deletion list {list}: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + + // Rather than have a complex retry process, just drop it and leak the objects, + // scrubber will clean up eventually. + list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution. + + // We must remember this failure, to prevent later writing out a header that + // would imply the unwritable list was valid on disk. + if self.list_write_failed.is_none() { + self.list_write_failed = Some(list.sequence); + } + } + } + + validated_sequence = Some(list.sequence); + } + + if let Some(validated_sequence) = validated_sequence { + if let Some(list_write_failed) = self.list_write_failed { + // Rare error case: we failed to write out a deletion list to excise invalid + // entries, so we cannot advance the header's valid sequence number past that point. + // + // In this state we will continue to validate, execute and delete deletion lists, + // we just cannot update the header. It should be noticed and fixed by a human due to + // the nonzero value of our unexpected_errors metric. + warn!( + sequence_number = list_write_failed, + "Cannot write header because writing a deletion list failed earlier", + ); + } else { + // Write the queue header to record how far validation progressed. This avoids having + // to rewrite each DeletionList to set validated=true in it. + let header = DeletionHeader::new(validated_sequence); + + // Drop result because the validated_sequence is an optimization. If we fail to save it, + // then restart, we will drop some deletion lists, creating work for scrubber. + // The save() function logs a warning on error. + if let Err(e) = header.save(self.conf).await { + warn!("Failed to write deletion queue header: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + } + } + } + + // Transfer the validated lists to the validated queue, for eventual execution + self.validated_lists.append(&mut self.pending_lists); + + Ok(()) + } + + async fn cleanup_lists(&mut self, list_paths: Vec) { + for list_path in list_paths { + debug!("Removing deletion list {}", list_path.display()); + + if let Err(e) = tokio::fs::remove_file(&list_path).await { + // Unexpected: we should have permissions and nothing else should + // be touching these files. We will leave the file behind. Subsequent + // pageservers will try and load it again: hopefully whatever storage + // issue (probably permissions) has been fixed by then. 
+ tracing::error!("Failed to delete {}: {e:#}", list_path.display()); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + break; + } + } + } + + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + tracing::debug!("Flushing with {} pending lists", self.pending_lists.len()); + + // Issue any required generation validation calls to the control plane + self.validate().await?; + + // After successful validation, nothing is pending: any lists that + // made it through validation will be in validated_lists. + assert!(self.pending_lists.is_empty()); + self.pending_key_count = 0; + + tracing::debug!( + "Validation complete, have {} validated lists", + self.validated_lists.len() + ); + + // Return quickly if we have no validated lists to execute. This avoids flushing the + // executor when an idle backend hits its autoflush interval + if self.validated_lists.is_empty() { + return Ok(()); + } + + // Drain `validated_lists` into the executor + let mut executing_lists = Vec::new(); + for list in self.validated_lists.drain(..) { + let list_path = self.conf.deletion_list_path(list.sequence); + let objects = list.into_remote_paths(); + self.tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + executing_lists.push(list_path); + } + + self.flush_executor().await?; + + // Erase the deletion lists whose keys have all be deleted from remote storage + self.cleanup_lists(executing_lists).await; + + Ok(()) + } + + async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> { + // Flush the executor, so that all the keys referenced by these deletion lists + // are actually removed from remote storage. This is a precondition to deleting + // the deletion lists themselves. + let (flush_op, rx) = FlushOp::new(); + self.tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } + + pub(super) async fn background(&mut self) { + tracing::info!("Started deletion backend worker"); + + while !self.cancel.is_cancelled() { + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + break; + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending. + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + + continue; + } + }; + + match msg { + ValidatorQueueMessage::Delete(list) => { + if list.validated { + // A pre-validated list may only be seen during recovery, if we are recovering + // a DeletionList whose on-disk state has validated=true + self.validated_lists.push(list) + } else { + self.pending_key_count += list.len(); + self.pending_lists.push(list); + } + + if self.pending_key_count > AUTOFLUSH_KEY_COUNT { + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + } + } + ValidatorQueueMessage::Flush(op) => { + match self.flush().await { + Ok(()) => { + op.notify(); + } + Err(DeletionQueueError::ShuttingDown) => { + // If we fail due to shutting down, we will just drop `op` to propagate that status. 
+ } + } + } + } + } + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4988641d6a73..f5c1224f01e8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1093,6 +1093,9 @@ components: remote_consistent_lsn: type: string format: hex + remote_consistent_lsn_visible: + type: string + format: hex ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a8e914ba08d5..e61a9dcf3fad 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use anyhow::{anyhow, Context, Result}; +use futures::TryFutureExt; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; @@ -24,6 +25,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; @@ -34,7 +36,7 @@ use crate::tenant::mgr::{ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::Timeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ @@ -61,6 +63,7 @@ pub struct State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, } impl State { @@ -70,6 +73,7 @@ impl State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -82,8 +86,17 @@ impl State { remote_storage, broker_client, disk_usage_eviction_state, + deletion_queue_client, }) } + + fn tenant_resources(&self) -> TenantSharedResources { + TenantSharedResources { + broker_client: self.broker_client.clone(), + remote_storage: self.remote_storage.clone(), + deletion_queue_client: self.deletion_queue_client.clone(), + } + } } #[inline(always)] @@ -283,7 +296,12 @@ async fn build_timeline_info_common( }; let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); - let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let remote_consistent_lsn_projected = timeline + .get_remote_consistent_lsn_projected() + .unwrap_or(Lsn(0)); + let remote_consistent_lsn_visible = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); let walreceiver_status = timeline.walreceiver_status(); @@ -293,7 +311,8 @@ async fn build_timeline_info_common( ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - remote_consistent_lsn, + remote_consistent_lsn: remote_consistent_lsn_projected, + remote_consistent_lsn_visible, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), @@ -492,24 +511,23 @@ async fn tenant_attach_handler( let generation = 
get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if let Some(remote_storage) = &state.remote_storage { - mgr::attach_tenant( - state.conf, - tenant_id, - generation, - tenant_conf, - state.broker_client.clone(), - remote_storage.clone(), - &ctx, - ) - .instrument(info_span!("tenant_attach", %tenant_id)) - .await?; - } else { + if state.remote_storage.is_none() { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" ))); } + mgr::attach_tenant( + state.conf, + tenant_id, + generation, + tenant_conf, + state.tenant_resources(), + &ctx, + ) + .instrument(info_span!("tenant_attach", %tenant_id)) + .await?; + json_response(StatusCode::ACCEPTED, ()) } @@ -570,6 +588,7 @@ async fn tenant_load_handler( generation, state.broker_client.clone(), state.remote_storage.clone(), + state.deletion_queue_client.clone(), &ctx, ) .instrument(info_span!("load", %tenant_id)) @@ -911,8 +930,7 @@ async fn tenant_create_handler( tenant_conf, target_tenant_id, generation, - state.broker_client.clone(), - state.remote_storage.clone(), + state.tenant_resources(), &ctx, ) .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) @@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn deletion_queue_flush( + r: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&r); + + if state.remote_storage.is_none() { + // Nothing to do if remote storage is disabled. + return json_response(StatusCode::OK, ()); + } + + let execute = parse_query_param(&r, "execute")?.unwrap_or(false); + + let flush = async { + if execute { + state.deletion_queue_client.flush_execute().await + } else { + state.deletion_queue_client.flush().await + } + } + // DeletionQueueError's only case is shutting down. + .map_err(|_| ApiError::ShuttingDown); + + tokio::select! { + res = flush => { + res.map(|()| json_response(StatusCode::OK, ()))? + } + _ = cancel.cancelled() => { + Err(ApiError::ShuttingDown) + } + } +} + async fn active_timeline_of_active_tenant( tenant_id: TenantId, timeline_id: TimelineId, @@ -1463,6 +1514,9 @@ pub fn make_router( .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) + .put("/v1/deletion_queue/flush", |r| { + api_handler(r, deletion_queue_flush) + }) .put("/v1/tenant/:tenant_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3049ad6a4e8d..e370e063ba1d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,8 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -mod control_plane_client; +pub mod control_plane_client; +pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; @@ -27,6 +28,7 @@ pub mod failpoint_support; use std::path::Path; use crate::task_mgr::TaskKind; +use deletion_queue::DeletionQueue; use tracing::info; /// Current storage format version @@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; -#[tracing::instrument] -pub async fn shutdown_pageserver(exit_code: i32) { +#[tracing::instrument(skip_all, fields(%exit_code))] +pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { use std::time::Duration; // Shut down the libpq endpoint task. 
This prevents new connections from // being accepted. @@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) { ) .await; + // Best effort to persist any outstanding deletions, to avoid leaking objects + if let Some(mut deletion_queue) = deletion_queue { + deletion_queue.shutdown(Duration::from_secs(5)).await; + } + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 98dee095a313..b085176f189e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -887,6 +887,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy .expect("failed to define a metric") }); +pub(crate) struct DeletionQueueMetrics { + pub(crate) keys_submitted: IntCounter, + pub(crate) keys_dropped: IntCounter, + pub(crate) keys_executed: IntCounter, + pub(crate) dropped_lsn_updates: IntCounter, + pub(crate) unexpected_errors: IntCounter, + pub(crate) remote_errors: IntCounterVec, +} +pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { + DeletionQueueMetrics{ + + keys_submitted: register_int_counter!( + "pageserver_deletion_queue_submitted_total", + "Number of objects submitted for deletion" + ) + .expect("failed to define a metric"), + + keys_dropped: register_int_counter!( + "pageserver_deletion_queue_dropped_total", + "Number of object deletions dropped due to stale generation." + ) + .expect("failed to define a metric"), + + keys_executed: register_int_counter!( + "pageserver_deletion_queue_executed_total", + "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed." + ) + .expect("failed to define a metric"), + + dropped_lsn_updates: register_int_counter!( + "pageserver_deletion_queue_dropped_lsn_updates_total", + "Updates to remote_consistent_lsn dropped due to stale generation number." + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_deletion_queue_unexpected_errors_total", + "Number of unexpected condiions that may stall the queue: any value above zero is unexpected." 
+ ) + .expect("failed to define a metric"), + remote_errors: register_int_counter_vec!( + "pageserver_deletion_queue_remote_errors_total", + "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects", + &["op_kind"], + ) + .expect("failed to define a metric") +} +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -1675,6 +1723,9 @@ pub fn preinitialize_metrics() { Lazy::force(c); }); + // Deletion queue stats + Lazy::force(&DELETION_QUEUE); + // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] .into_iter() diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 047fa761c36c..7a94c3449dba 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,7 @@ impl Key { | self.field6 as i128 } - pub fn from_i128(x: i128) -> Self { + pub const fn from_i128(x: i128) -> Self { Key { field1: ((x >> 120) & 0xf) as u8, field2: ((x >> 104) & 0xFFFF) as u32, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 650bc119b624..017322ffb29b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -456,7 +456,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(1).await; + shutdown_pageserver(None, 1).await; } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1c92c618fa6c..47bfd4a8efe8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT_ACTIVATION; @@ -117,7 +118,7 @@ mod span; pub mod metadata; mod par_fsync; -mod remote_timeline_client; +pub mod remote_timeline_client; pub mod storage_layer; pub mod config; @@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: Option, + pub deletion_queue_client: DeletionQueueClient, } /// @@ -197,6 +199,9 @@ pub struct Tenant { // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, + // Access to global deletion queue for when this tenant wants to schedule a deletion + deletion_queue_client: DeletionQueueClient, + /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. 
cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -523,15 +528,20 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, + resources: TenantSharedResources, tenants: &'static tokio::sync::RwLock, - remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { // TODO dedup with spawn_load let tenant_conf = Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?; + let TenantSharedResources { + broker_client, + remote_storage, + deletion_queue_client, + } = resources; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( TenantState::Attaching, @@ -540,7 +550,8 @@ impl Tenant { wal_redo_manager, tenant_id, generation, - Some(remote_storage.clone()), + remote_storage.clone(), + deletion_queue_client, )); // Do all the hard work in the background @@ -571,7 +582,7 @@ impl Tenant { let pending_deletion = { match DeleteTenantFlow::should_resume_deletion( conf, - Some(&remote_storage), + remote_storage.as_ref(), &tenant_clone, ) .await @@ -660,6 +671,7 @@ impl Tenant { for timeline_id in remote_timeline_ids { let client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -726,6 +738,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client: Some(remote_client), + deletion_queue_client: self.deletion_queue_client.clone(), }, ctx, ) @@ -750,6 +763,7 @@ impl Tenant { timeline_id, &index_part.metadata, Some(remote_timeline_client), + self.deletion_queue_client.clone(), None, ) .await @@ -851,6 +865,7 @@ impl Tenant { tenant_id, Generation::broken(), None, + DeletionQueueClient::broken(), )) } @@ -895,6 +910,7 @@ impl Tenant { tenant_id, generation, remote_storage.clone(), + resources.deletion_queue_client.clone(), ); let tenant = Arc::new(tenant); @@ -1302,6 +1318,7 @@ impl Tenant { timeline_id, &local_metadata, Some(remote_client), + self.deletion_queue_client.clone(), init_order, ) .await @@ -1351,6 +1368,7 @@ impl Tenant { timeline_id, &local_metadata, None, + self.deletion_queue_client.clone(), init_order, ) .await @@ -2242,6 +2260,9 @@ impl Tenant { Ok(timeline) } + // Allow too_many_arguments because a constructor's argument list naturally grows with the + // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. 
+ #[allow(clippy::too_many_arguments)] fn new( state: TenantState, conf: &'static PageServerConf, @@ -2250,6 +2271,7 @@ impl Tenant { tenant_id: TenantId, generation: Generation, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2317,6 +2339,7 @@ impl Tenant { gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, + deletion_queue_client, state, cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), @@ -2856,6 +2879,7 @@ impl Tenant { let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { let remote_client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -2866,7 +2890,10 @@ impl Tenant { None }; - TimelineResources { remote_client } + TimelineResources { + remote_client, + deletion_queue_client: self.deletion_queue_client.clone(), + } } /// Creates intermediate timeline structure and its files. @@ -3322,6 +3349,7 @@ pub mod harness { use utils::logging; use utils::lsn::Lsn; + use crate::deletion_queue::mock::MockDeletionQueue; use crate::{ config::PageServerConf, repository::Key, @@ -3383,6 +3411,7 @@ pub mod harness { pub generation: Generation, pub remote_storage: GenericRemoteStorage, pub remote_fs_dir: PathBuf, + pub deletion_queue: MockDeletionQueue, } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); @@ -3431,6 +3460,7 @@ pub mod harness { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { conf, @@ -3439,6 +3469,7 @@ pub mod harness { generation: Generation::new(0xdeadbeef), remote_storage, remote_fs_dir, + deletion_queue, }) } @@ -3463,6 +3494,7 @@ pub mod harness { self.tenant_id, self.generation, Some(self.remote_storage.clone()), + self.deletion_queue.new_client(), )); tenant .load(None, ctx) @@ -4193,7 +4225,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::create("test_bulk_insert")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4240,7 +4273,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::create("test_random_updates")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 74faee111509..6f3863dd4b2a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,7 +20,10 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::ControlPlaneClient; +use crate::control_plane_client::{ + ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, +}; +use crate::deletion_queue::DeletionQueueClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; @@ -116,7 +119,23 @@ pub async fn 
init_tenant_mgr( // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { - Some(client.re_attach().await?) + let result = match client.re_attach().await { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + anyhow::bail!("Shut down while waiting for control plane re-attach response") + } + }; + + // The deletion queue needs to know about the startup attachment state to decide which (if any) stored + // deletion list entries may still be valid. We provide that by pushing a recovery operation into + // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions + // are processed, even though we don't block on recovery completing here. + resources + .deletion_queue_client + .recover(result.clone()) + .await?; + + Some(result) } else { info!("Control plane API not configured, tenant generations are disabled"); None @@ -285,29 +304,21 @@ pub(crate) fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); - if let Some(remote_storage) = resources.remote_storage { - match Tenant::spawn_attach( + if resources.remote_storage.is_none() { + warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); + Tenant::create_broken_tenant( conf, tenant_id, - generation, - resources.broker_client, - tenants, - remote_storage, - ctx, - ) { + "attaching mark file present but no remote storage configured".to_string(), + ) + } else { + match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) { Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) } } - } else { - warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant( - conf, - tenant_id, - "attaching mark file present but no remote storage configured".to_string(), - ) } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -438,8 +449,7 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -450,13 +460,9 @@ pub async fn create_tenant( // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 - let tenant_resources = TenantSharedResources { - broker_client, - remote_storage, - }; let created_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, - generation, tenant_resources, None, &TENANTS, ctx)?; + generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -622,6 +628,7 @@ pub async fn load_tenant( generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -635,6 +642,7 @@ pub async fn load_tenant( let resources = TenantSharedResources { broker_client, remote_storage, + deletion_queue_client }; let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) .with_context(|| { @@ -702,8 +710,7 @@ pub async fn attach_tenant( tenant_id: TenantId, generation: Generation, tenant_conf: TenantConfOpt, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: GenericRemoteStorage, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -718,10 +725,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let resources = TenantSharedResources { - broker_client, - remote_storage: Some(remote_storage), - }; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 6f42b54ac2ae..4e495d9bb2ab 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -116,8 +116,12 @@ //! # Completion //! //! Once an operation has completed, we update -//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates -//! to safekeepers that they can delete the WAL up to that LSN. +//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! and submit a request through the DeletionQueue to update +//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has +//! validated that our generation is not stale. It is this visible value +//! that is advertized to safekeepers as a signal that that they can +//! delete the WAL up to that LSN. //! //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait //! for all pending operations to complete. It does not prevent more @@ -200,7 +204,6 @@ //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! 
[`Timeline::load_layer_map`]: super::Timeline::load_layer_map -mod delete; mod download; pub mod index; mod upload; @@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -324,6 +328,8 @@ pub struct RemoteTimelineClient { metrics: Arc, storage_impl: GenericRemoteStorage, + + deletion_queue_client: DeletionQueueClient, } impl RemoteTimelineClient { @@ -335,6 +341,7 @@ impl RemoteTimelineClient { /// pub fn new( remote_storage: GenericRemoteStorage, + deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, @@ -352,6 +359,7 @@ impl RemoteTimelineClient { timeline_id, generation, storage_impl: remote_storage, + deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), } @@ -413,13 +421,24 @@ impl RemoteTimelineClient { Ok(()) } - pub fn last_uploaded_consistent_lsn(&self) -> Option { - match &*self.upload_queue.lock().unwrap() { + pub fn remote_consistent_lsn_projected(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, - UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn), - UploadQueue::Stopped(q) => { - Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn) - } + UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), + UploadQueue::Stopped(q) => q + .upload_queue_for_deletion + .get_last_remote_consistent_lsn_projected(), + } + } + + pub fn remote_consistent_lsn_visible(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { + UploadQueue::Uninitialized => None, + UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), + UploadQueue::Stopped(q) => Some( + q.upload_queue_for_deletion + .get_last_remote_consistent_lsn_visible(), + ), } } @@ -643,7 +662,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: Vec, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -663,10 +682,10 @@ impl RemoteTimelineClient { // Decorate our list of names with each name's generation, dropping // makes that are unexpectedly missing from our metadata. 
let with_generations: Vec<_> = names - .iter() + .into_iter() .filter_map(|name| { // Remove from latest_files, learning the file's remote generation in the process - let meta = upload_queue.latest_files.remove(name); + let meta = upload_queue.latest_files.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -688,19 +707,17 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue, metadata); } - // schedule the actual deletions - for (name, generation) in with_generations { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: false, - generation, - }); - self.calls_unfinished_metric_begin(&op); - upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {name}"); + for (name, gen) in &with_generations { + info!("scheduling deletion of layer {}{}", name, gen.get_suffix()); } + // schedule the actual deletions + let op = UploadOp::Delete(Delete { + layers: with_generations, + }); + self.calls_unfinished_metric_begin(&op); + upload_queue.queued_operations.push_back(op); + // Launch the tasks immediately, if possible self.launch_queued_tasks(upload_queue); }; @@ -833,9 +850,7 @@ impl RemoteTimelineClient { pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_and_timeline_id(); - let (mut receiver, deletions_queued) = { - let mut deletions_queued = 0; - + let layers: Vec = { let mut locked = self.upload_queue.lock().unwrap(); let stopped = locked.stopped_mut()?; @@ -847,42 +862,30 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .queued_operations - .reserve(stopped.upload_queue_for_deletion.latest_files.len()); - - // schedule the actual deletions - for (name, meta) in &stopped.upload_queue_for_deletion.latest_files { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: true, - generation: meta.generation, - }); - - self.calls_unfinished_metric_begin(&op); - stopped - .upload_queue_for_deletion - .queued_operations - .push_back(op); - - info!("scheduled layer file deletion {name}"); - deletions_queued += 1; - } - - self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion); - - ( - self.schedule_barrier(&mut stopped.upload_queue_for_deletion), - deletions_queued, - ) + .latest_files + .drain() + .map(|(file_name, meta)| { + remote_layer_path( + &self.tenant_id, + &self.timeline_id, + &file_name, + meta.generation, + ) + }) + .collect() }; - receiver.changed().await.context("upload queue shut down")?; + let layer_deletion_count = layers.len(); + self.deletion_queue_client.push_immediate(layers).await?; // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id); + // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // taking the burden of listing all the layers that we already know we should delete. 
+ self.deletion_queue_client.flush_immediate().await?; + let remaining = backoff::retry( || async { self.storage_impl @@ -910,17 +913,9 @@ impl RemoteTimelineClient { }) .collect(); + let not_referenced_count = remaining.len(); if !remaining.is_empty() { - backoff::retry( - || async { self.storage_impl.delete_objects(&remaining).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_objects", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), - ) - .await - .context("delete_objects")?; + self.deletion_queue_client.push_immediate(remaining).await?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { @@ -931,18 +926,14 @@ let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); - debug!("deleting index part"); + debug!("enqueuing index part deletion"); + self.deletion_queue_client + .push_immediate([index_file_path].to_vec()) + .await?; - backoff::retry( - || async { self.storage_impl.delete(&index_file_path).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_index", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("delete_index")?; + // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait + // for a flush to a persistent deletion list so that we may be sure deletion will occur. + self.deletion_queue_client.flush_immediate().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( ))? }); - info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); + info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json"); Ok(()) } @@ -1140,21 +1131,16 @@ impl RemoteTimelineClient { } res } - UploadOp::Delete(delete) => { - let path = &self - .conf - .timeline_path(&self.tenant_id, &self.timeline_id) - .join(delete.layer_file_name.file_name()); - delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - delete.file_kind, - RemoteOpKind::Delete, - Arc::clone(&self.metrics), - ) - .await - } + UploadOp::Delete(delete) => self + .deletion_queue_client + .push_layers( + self.tenant_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)), UploadOp::Barrier(_) => { // unreachable. Barrier operations are handled synchronously in // launch_queued_tasks @@ -1210,18 +1196,12 @@ impl RemoteTimelineClient { } // The task has completed successfully. Remove it from the in-progress list. - { + let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(stopped) => { - // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion) - // then stop() took care of it so we just return. - // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
- match &task.op { - UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion), - _ => None - } + UploadQueue::Stopped(_stopped) => { + None }, UploadQueue::Initialized(qi) => { Some(qi) } }; @@ -1236,23 +1216,51 @@ upload_queue.inprogress_tasks.remove(&task.task_id); - match task.op { + let lsn_update = match task.op { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads -= 1; + None } UploadOp::UploadMetadata(_, lsn) => { upload_queue.num_inprogress_metadata_uploads -= 1; - upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check? + // XXX monotonicity check? + + upload_queue.projected_remote_consistent_lsn = Some(lsn); + if self.generation.is_none() { + // Legacy mode: skip validating generation + upload_queue.visible_remote_consistent_lsn.store(lsn); + None + } else { + Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } } UploadOp::Delete(_) => { upload_queue.num_inprogress_deletions -= 1; + None } UploadOp::Barrier(_) => unreachable!(), }; // Launch any queued tasks that were unblocked by this one. self.launch_queued_tasks(upload_queue); + lsn_update + }; + + if let Some((lsn, slot)) = lsn_update { + // Updates to the remote_consistent_lsn we advertise to safekeepers + // are all routed through the DeletionQueue, to enforce important + // data safety guarantees (see docs/rfcs/025-generation-numbers.md) + self.deletion_queue_client + .update_remote_consistent_lsn( + self.tenant_id, + self.timeline_id, + self.generation, + lsn, + slot, + ) + .await; } + self.calls_unfinished_metric_end(&task.op); } @@ -1278,8 +1286,8 @@ impl RemoteTimelineClient { reason: "metadata uploads are tiny", }, ), - UploadOp::Delete(delete) => ( - delete.file_kind, + UploadOp::Delete(_delete) => ( + RemoteOpFileKind::Layer, RemoteOpKind::Delete, DontTrackSize { reason: "should we track deletes?
positive or negative sign?", @@ -1341,7 +1349,10 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), - last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn, + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: initialized + .visible_remote_consistent_lsn + .clone(), num_inprogress_layer_uploads: 0, num_inprogress_metadata_uploads: 0, num_inprogress_deletions: 0, @@ -1405,13 +1416,13 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, layer_file_name: &LayerFileName, - layer_meta: &LayerFileMetadata, + generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", layer_file_name.file_name(), - layer_meta.generation.get_suffix() + generation.get_suffix() ); RemotePath::from_string(&path).expect("Failed to construct path") @@ -1554,7 +1565,6 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { - // Use a current-thread runtime in the test let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = harness.load().await; @@ -1580,6 +1590,7 @@ mod tests { timeline_id: TIMELINE_ID, generation, storage_impl: self.harness.remote_storage.clone(), + deletion_queue_client: self.harness.deletion_queue.new_client(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &self.harness.tenant_id, @@ -1749,7 +1760,7 @@ mod tests { ) .unwrap(); client - .schedule_layer_file_deletion(&[layer_file_name_1.clone()]) + .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec()) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -1775,6 +1786,7 @@ mod tests { // Finish them client.wait_completion().await.unwrap(); + harness.deletion_queue.pump().await; assert_remote_files( &[ diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs deleted file mode 100644 index 7324559223d6..000000000000 --- a/pageserver/src/tenant/remote_timeline_client/delete.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! Helper functions to delete files from remote storage with a RemoteStorage -use anyhow::Context; -use std::path::Path; -use tracing::debug; - -use remote_storage::GenericRemoteStorage; - -use crate::{ - config::PageServerConf, - tenant::{remote_timeline_client::remote_path, Generation}, -}; - -pub(super) async fn delete_layer<'a>( - conf: &'static PageServerConf, - storage: &'a GenericRemoteStorage, - local_layer_path: &'a Path, - generation: Generation, -) -> anyhow::Result<()> { - fail::fail_point!("before-delete-layer", |_| { - anyhow::bail!("failpoint before-delete-layer") - }); - debug!("Deleting layer from remote storage: {local_layer_path:?}",); - - let path_to_delete = remote_path(conf, local_layer_path, generation)?; - - // We don't want to print an error if the delete failed if the file has - // already been deleted. Thankfully, in this situation S3 already - // does not yield an error. While OS-provided local file system APIs do yield - // errors, we avoid them in the `LocalFs` wrapper. 
- storage - .delete(&path_to_delete) - .await - .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}")) -} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 986321552906..5c173c613ff8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>( .timeline_path(&tenant_id, &timeline_id) .join(layer_file_name.file_name()); - let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata); + let remote_path = remote_layer_path( + &tenant_id, + &timeline_id, + layer_file_name, + layer_metadata.generation, + ); // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 78ac1338db37..4fa5039d7946 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, }; +use crate::deletion_queue::DeletionQueueClient; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ @@ -143,6 +144,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: Option, + pub deletion_queue_client: DeletionQueueClient, } pub struct Timeline { @@ -521,9 +523,23 @@ impl Timeline { self.disk_consistent_lsn.load() } - pub fn get_remote_consistent_lsn(&self) -> Option { + /// remote_consistent_lsn from the perspective of the tenant's current generation, + /// not validated with control plane yet. + /// See [`Self::get_remote_consistent_lsn_visible`]. + pub fn get_remote_consistent_lsn_projected(&self) -> Option { if let Some(remote_client) = &self.remote_client { - remote_client.last_uploaded_consistent_lsn() + remote_client.remote_consistent_lsn_projected() + } else { + None + } + } + + /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, + /// i.e. a value of remote_consistent_lsn_projected which has undergone + /// generation validation in the deletion queue. + pub fn get_remote_consistent_lsn_visible(&self) -> Option { + if let Some(remote_client) = &self.remote_client { + remote_client.remote_consistent_lsn_visible() } else { None } @@ -1820,7 +1836,7 @@ impl Timeline { for (layer, m) in needs_upload { rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?; } - rtc.schedule_layer_file_deletion(&needs_cleanup)?; + rtc.schedule_layer_file_deletion(needs_cleanup)?; rtc.schedule_index_upload_for_file_changes()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. 
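A recurring detail in the hunks above and below is that remote layer keys are now generation-aware: `remote_layer_path` takes a `Generation` and appends its suffix to the layer file name, and `download_layer_file` passes `layer_metadata.generation` through so the right object is fetched. The sketch below shows what that key shape looks like; the exact suffix format (empty for legacy objects, `-` plus eight hex digits otherwise) is inferred from `get_suffix` usage and the `parse_generation_suffix` helper in the new test file further down, so treat it as an assumption rather than the canonical implementation:

```rust
// Sketch of generation-aware remote layer keys. `generation_suffix` and
// `remote_layer_key` are hypothetical helpers for illustration only.
fn generation_suffix(generation: Option<u32>) -> String {
    match generation {
        None => String::new(),          // legacy object: no suffix
        Some(n) => format!("-{n:08x}"), // e.g. "-00000005"
    }
}

fn remote_layer_key(
    tenant_id: &str,
    timeline_id: &str,
    layer_file_name: &str,
    generation: Option<u32>,
) -> String {
    format!(
        "tenants/{tenant_id}/timelines/{timeline_id}/{layer_file_name}{}",
        generation_suffix(generation)
    )
}

fn main() {
    let legacy = remote_layer_key("tenant", "timeline", "layerfile", None);
    let suffixed = remote_layer_key("tenant", "timeline", "layerfile", Some(5));
    assert_eq!(legacy, "tenants/tenant/timelines/timeline/layerfile");
    assert_eq!(suffixed, "tenants/tenant/timelines/timeline/layerfile-00000005");
}
```

The suffix lets the same layer name exist under several generations side by side, which is what allows a newly attached pageserver to upload its own copies without clobbering objects a stale attachment might still reference.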
@@ -3875,7 +3891,7 @@ impl Timeline { // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } Ok(()) @@ -4210,7 +4226,7 @@ impl Timeline { } if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } apply.flush(); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 18588cf0fd48..7d55388f44b7 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,6 +14,7 @@ use utils::{ use crate::{ config::PageServerConf, + deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, @@ -407,6 +408,7 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: Option, + deletion_queue_client: DeletionQueueClient, init_order: Option<&InitializationOrder>, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. @@ -416,7 +418,10 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { remote_client }, + TimelineResources { + remote_client, + deletion_queue_client, + }, init_order, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7d1e9b4a39e3..0831b9cedaa5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = - timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let timeline_remote_consistent_lsn = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. let last_received_lsn = last_lsn; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 28822335b098..08b1cb8866e8 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,5 +1,3 @@ -use crate::metrics::RemoteOpFileKind; - use super::storage_layer::LayerFileName; use super::Generation; use crate::tenant::metadata::TimelineMetadata; @@ -11,6 +9,7 @@ use std::fmt::Debug; use chrono::NaiveDateTime; use std::sync::Arc; use tracing::info; +use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; @@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized { /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. /// Safekeeper can rely on it to make decisions for WAL storage. 
- pub(crate) last_uploaded_consistent_lsn: Lsn, + /// + /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// the control plane (unless a timeline's generation is None, in which case + /// we skip validation) + pub(crate) projected_remote_consistent_lsn: Option, + pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress pub(crate) num_inprogress_layer_uploads: usize, @@ -81,6 +85,14 @@ impl UploadQueueInitialized { pub(super) fn no_pending_work(&self) -> bool { self.inprogress_tasks.is_empty() && self.queued_operations.is_empty() } + + pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn { + self.visible_remote_consistent_lsn.load() + } + + pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { + self.projected_remote_consistent_lsn + } } #[derive(Clone, Copy)] @@ -114,9 +126,8 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. - last_uploaded_consistent_lsn: Lsn(0), + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -158,7 +169,10 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), - last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(), + projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), + visible_remote_consistent_lsn: Arc::new( + index_part.metadata.disk_consistent_lsn().into(), + ), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -201,12 +215,11 @@ pub(crate) struct UploadTask { pub(crate) op: UploadOp, } +/// A deletion of some layers within the lifetime of a timeline. This is not used +/// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) file_kind: RemoteOpFileKind, - pub(crate) layer_file_name: LayerFileName, - pub(crate) scheduled_from_timeline_delete: bool, - pub(crate) generation: Generation, + pub(crate) layers: Vec<(LayerFileName, Generation)>, } #[derive(Debug)] @@ -217,7 +230,7 @@ pub(crate) enum UploadOp { /// Upload the metadata file UploadMetadata(IndexPart, Lsn), - /// Delete a layer file + /// Delete layer files Delete(Delete), /// Barrier.
When the barrier operation is reached, @@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp { UploadOp::UploadMetadata(_, lsn) => { write!(f, "UploadMetadata(lsn: {})", lsn) } - UploadOp::Delete(delete) => write!( - f, - "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})", - delete.layer_file_name.file_name(), - delete.scheduled_from_timeline_delete, - delete.generation - ), + UploadOp::Delete(delete) => { + write!(f, "Delete({} layers)", delete.layers.len(),) + } UploadOp::Barrier(_) => write!(f, "Barrier"), } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0667403ba31e..38d0aeb96026 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1481,6 +1481,16 @@ def stop(self, immediate: bool = False) -> "NeonAttachmentService": self.running = False return self + def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int: + response = requests.post( + f"{self.env.control_plane_api}/attach_hook", + json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id}, + ) + response.raise_for_status() + gen = response.json()["gen"] + assert isinstance(gen, int) + return gen + def __enter__(self) -> "NeonAttachmentService": return self @@ -1689,12 +1699,7 @@ def tenant_attach( to call into the pageserver HTTP client. """ if self.env.attachment_service is not None: - response = requests.post( - f"{self.env.control_plane_api}/attach_hook", - json={"tenant_id": str(tenant_id), "pageserver_id": self.id}, - ) - response.raise_for_status() - generation = response.json()["gen"] + generation = self.env.attachment_service.attach_hook(tenant_id, self.id) else: generation = None diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 9373073abf2c..9fdcd22bc25c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -620,3 +620,8 @@ def post_tracing_event(self, level: str, message: str): }, ) self.verbose_error(res) + + def deletion_queue_flush(self, execute: bool = False): + self.put( + f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" + ).raise_for_status() diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 2e5d75a0fcd0..70c2a06a0745 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional def list_prefix( - neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None + neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. @@ -287,7 +287,7 @@ def list_prefix( # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. response = remote.client.list_objects_v2( - Delimiter="/", + Delimiter=delimiter, Bucket=remote.bucket_name, Prefix=prefix, ) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py new file mode 100644 index 000000000000..81d38ac93442 --- /dev/null +++ b/test_runner/regress/test_pageserver_generations.py @@ -0,0 +1,352 @@ +""" + +Tests in this module exercise the pageserver's behavior around generation numbers, +as defined in docs/rfcs/025-generation-numbers.md. 
Briefly, the behaviors we require +of the pageserver are: +- Do not start a tenant without a generation number if control_plane_api is set +- Remote objects must be suffixed with generation +- Deletions may only be executed after validating generation +- Updates to remote_consistent_lsn may only be made visible after validating generation +""" + + +import re +import time +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import list_prefix +from fixtures.remote_storage import ( + RemoteStorageKind, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import print_gc_result, wait_until + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. 
+ """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 20000) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + +def get_metric_or_0(ps_http, metric: str) -> int: + v = ps_http.get_metric_value(metric) + return 0 if v is None else int(v) + + +def get_deletion_queue_executed(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total") + + +def get_deletion_queue_submitted(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total") + + +def get_deletion_queue_dropped(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total") + + +def get_deletion_queue_unexpected_errors(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total") + + +def get_deletion_queue_dropped_lsn_updates(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total") + + +def get_deletion_queue_depth(ps_http) -> int: + """ + Queue depth if at least one deletion has been submitted, else None + """ + submitted = get_deletion_queue_submitted(ps_http) + executed = get_deletion_queue_executed(ps_http) + dropped = get_deletion_queue_dropped(ps_http) + depth = submitted - executed - dropped + log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})") + + assert depth >= 0 + return int(depth) + + +def assert_deletion_queue(ps_http, size_fn) -> None: + v = get_deletion_queue_depth(ps_http) + assert v is not None + assert size_fn(v) is True + + +def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): + """ + Validate behavior when a pageserver is run without generation support enabled, + then started again after activating it: + - Before upgrade, no objects should have generation suffixes + - After upgrade, the bucket should contain a mixture. + - In both cases, postgres I/O should work. 
+ """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + + env = neon_env_builder.init_configs() + env.broker.try_start() + for sk in env.safekeepers: + sk.start() + assert env.attachment_service is not None + env.attachment_service.start() + + env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + + env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline + ) + generate_uploads_and_deletions(env) + + def parse_generation_suffix(key): + m = re.match(".+-([0-9a-zA-Z]{8})$", key) + if m is None: + return None + else: + log.info(f"match: {m}") + log.info(f"group: {m.group(1)}") + return int(m.group(1), 16) + + pre_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in pre_upgrade_keys: + assert parse_generation_suffix(key) is None + + env.pageserver.stop() + + # Starting without the override that disabled control_plane_api + env.pageserver.start() + + generate_uploads_and_deletions(env, init=False) + + legacy_objects: list[str] = [] + suffixed_objects = [] + post_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in post_upgrade_keys: + log.info(f"post-upgrade key: {key}") + if parse_generation_suffix(key) is not None: + suffixed_objects.append(key) + else: + legacy_objects.append(key) + + # Bucket now contains a mixture of suffixed and non-suffixed objects + assert len(suffixed_objects) > 0 + assert len(legacy_objects) > 0 + + assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 + + +def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + some_other_pageserver = 1234 + ps_http = env.pageserver.http_client() + + generate_uploads_and_deletions(env) + + # Flush: pending deletions should all complete + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush(execute=True) + assert_deletion_queue(ps_http, lambda n: n == 0) + assert get_deletion_queue_dropped(ps_http) == 0 + + # Our visible remote_consistent_lsn should match projected + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Now advance the generation in the control plane: subsequent validations + # from the running pageserver will fail. No more deletions should happen. 
+ env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + generate_uploads_and_deletions(env, init=False) + + assert_deletion_queue(ps_http, lambda n: n > 0) + queue_depth_before = get_deletion_queue_depth(ps_http) + executed_before = get_deletion_queue_executed(ps_http) + ps_http.deletion_queue_flush(execute=True) + + # Queue drains to zero because we dropped deletions + assert_deletion_queue(ps_http, lambda n: n == 0) + # The executed counter has not incremented + assert get_deletion_queue_executed(ps_http) == executed_before + # The dropped counter has incremented to consume all of the deletions that were previously enqueued + assert get_deletion_queue_dropped(ps_http) == queue_depth_before + + # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot + # because generation validation fails. + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0 + + # TODO: list bucket and confirm all objects have a generation suffix. + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + + +@pytest.mark.parametrize("keep_attachment", [True, False]) +def test_deletion_queue_recovery( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool +): + """ + :param keep_attachment: If true, we re-attach after restart. Else, we act as if some other + node took the attachment while we were restarting. + """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + ps_http = env.pageserver.http_client() + + # Prevent deletion lists from being executed, to build up some backlog of deletions + ps_http.configure_failpoints( + [ + ("deletion-queue-before-execute", "return"), + ] + ) + + generate_uploads_and_deletions(env) + + # There should be entries in the deletion queue + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush() + before_restart_depth = get_deletion_queue_depth(ps_http) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") + env.pageserver.stop(immediate=True) + + if not keep_attachment: + some_other_pageserver = 101010 + assert env.attachment_service is not None + env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + + env.pageserver.start() + + def assert_deletions_submitted(n: int): + assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n + + # After restart, issue a flush to kick the deletion frontend to do recovery. + # It should recover all the operations we submitted before the restart. + ps_http.deletion_queue_flush(execute=False) + wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + + # The queue should drain through completely if we flush it + ps_http.deletion_queue_flush(execute=True) + wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + + if keep_attachment: + # If we kept the attachment, then our pre-restart deletions should have executed + # successfully + assert get_deletion_queue_executed(ps_http) == before_restart_depth + else: + # If we lost the attachment, we should have dropped our pre-restart deletions. 
+ assert get_deletion_queue_dropped(ps_http) == before_restart_depth + env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + # Restart again + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # No deletion lists should be recovered: this demonstrates that deletion lists + # were cleaned up after being executed or dropped in the previous process lifetime. + time.sleep(1) + assert_deletion_queue(ps_http, lambda n: n == 0) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c6ddb54ee6c1..9d0d42a4ef63 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -43,6 +43,12 @@ def test_tenant_delete_smoke( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) # lucky race with stopping from flushing a layer we fail to schedule any uploads env.pageserver.allowed_errors.append( @@ -195,6 +201,14 @@ def test_delete_tenant_exercise_crash_safety_failpoints( ] ) + if simulate_failures: + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) + ps_http = env.pageserver.http_client() timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) @@ -383,6 +397,7 @@ def test_tenant_delete_is_resumed_on_attach( assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): + ps_http.deletion_queue_flush(execute=True) assert_prefix_empty( neon_env_builder, prefix="/".join( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0e4df21d83ca..839df69240de 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -807,6 +807,8 @@ def test_delete_orphaned_objects( reason = timeline_info["state"]["Broken"]["reason"] assert reason.endswith(f"failpoint: {failpoint}"), reason + ps_http.deletion_queue_flush(execute=True) + for orphan in orphans: assert not orphan.exists() assert env.pageserver.log_contains(