diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index e879646b637b..d4bca59c7b35 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap if attach_req.pageserver_id.is_some() { tenant_state.generation += 1; } + tenant_state.pageserver = attach_req.pageserver_id; let generation = tenant_state.generation; locked.save().await.map_err(ApiError::InternalServerError)?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index f354296be223..68620787bbc2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -363,8 +363,15 @@ pub struct TimelineInfo { pub latest_gc_cutoff_lsn: Lsn, #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, + + /// The LSN that we have successfully uploaded to remote storage #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, + + /// The LSN that we are advertising to safekeepers + #[serde_as(as = "DisplayFromStr")] + pub remote_consistent_lsn_visible: Lsn, + pub current_logical_size: Option<u64>, // is None when timeline is Unloaded /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 1ddd156a087e..a92b87632bda 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -20,6 +20,7 @@ use std::{ use anyhow::{bail, Context}; +use serde::{Deserialize, Serialize}; use tokio::io; use toml_edit::Item; use tracing::info; @@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None; +/// As defined in S3 docs +pub const MAX_KEYS_PER_DELETE: usize = 1000; + const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; /// Path on the remote storage, relative to some inner prefix. @@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct RemotePath(PathBuf); +impl Serialize for RemotePath { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + serializer.collect_str(self) + } +} + +impl<'de> Deserialize<'de> for RemotePath { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + let str = String::deserialize(deserializer)?; + Ok(Self(PathBuf::from(&str))) + } +} + impl std::fmt::Display for RemotePath { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0.display()) @@ -88,6 +111,10 @@ impl RemotePath { pub fn extension(&self) -> Option<&str> { self.0.extension()?.to_str() } + + pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> { + self.0.strip_prefix(&p.0) + } } /// Storage (potentially remote) API to manage its state.
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 9262f1e88f15..acab9539042d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -33,11 +33,10 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000; - pub(super) mod metrics; use self::metrics::{AttemptOutcome, RequestKind}; @@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket { delete_objects.push(obj_id); } - for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) { + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { let started_at = start_measuring_requests(kind); let resp = self diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 163c8c0467f7..88d50905c644 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -89,6 +89,22 @@ impl Generation { Self::Broken => panic!("Attempted to use a broken generation"), } } + + pub fn next(&self) -> Generation { + match self { + Self::Valid(n) => Self::Valid(*n + 1), + Self::None => Self::Valid(1), + Self::Broken => panic!("Attempted to use a broken generation"), + } + } + + pub fn into(self) -> Option { + if let Self::Valid(v) = self { + Some(v) + } else { + None + } + } } impl Serialize for Generation { diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 527e486fd0df..dd54cd6ecd6c 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -24,6 +24,9 @@ pub enum ApiError { #[error("Precondition failed: {0}")] PreconditionFailed(Box), + #[error("Shutting down")] + ShuttingDown, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -52,6 +55,10 @@ impl ApiError { self.to_string(), StatusCode::PRECONDITION_FAILED, ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b6a2117f9cfc..90c7c11194bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::control_plane_client::ControlPlaneClient; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; @@ -20,6 +21,7 @@ use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, context::{DownloadBehavior, RequestContext}, + deletion_queue::DeletionQueue, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, @@ -346,9 +348,22 @@ fn start_pageserver( } }; + // Top-level cancellation token for the process + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // Set up deletion queue + 
let (deletion_queue, deletion_workers) = DeletionQueue::new( + remote_storage.clone(), + ControlPlaneClient::new(conf, &shutdown_pageserver), + conf, + ); + if let Some(deletion_workers) = deletion_workers { + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); + } + // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. startup_checkpoint("initial", "Starting loading tenants"); @@ -379,13 +394,13 @@ fn start_pageserver( }; // Scan the local 'tenants/' directory and start loading the tenants - let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); - + let deletion_queue_client = deletion_queue.new_client(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, TenantSharedResources { broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), + deletion_queue_client, }, order, shutdown_pageserver.clone(), @@ -481,9 +496,10 @@ fn start_pageserver( http::routes::State::new( conf, http_auth.clone(), - remote_storage, + remote_storage.clone(), broker_client.clone(), disk_usage_eviction_state, + deletion_queue.new_client(), ) .context("Failed to initialize router state")?, ); @@ -611,7 +627,12 @@ fn start_pageserver( // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); + let bg_remote_storage = remote_storage.clone(); + let bg_deletion_queue = deletion_queue.clone(); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + bg_remote_storage.map(|_| bg_deletion_queue), + 0, + )); unreachable!() } }) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8ee7f28c1175..ed767b764e23 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -475,8 +475,8 @@ impl PageServerConfigBuilder { self.background_task_maximum_delay = BuilderValue::Set(delay); } - pub fn control_plane_api(&mut self, api: Url) { - self.control_plane_api = BuilderValue::Set(Some(api)) + pub fn control_plane_api(&mut self, api: Option) { + self.control_plane_api = BuilderValue::Set(api) } pub fn build(self) -> anyhow::Result { @@ -580,6 +580,27 @@ impl PageServerConf { self.workdir.join(TENANTS_SEGMENT_NAME) } + pub fn deletion_prefix(&self) -> PathBuf { + self.workdir.join("deletion") + } + + pub fn deletion_list_path(&self, sequence: u64) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. + const VERSION: u8 = 1; + + self.deletion_prefix() + .join(format!("{sequence:016x}-{VERSION:02x}.list")) + } + + pub fn deletion_header_path(&self) -> PathBuf { + // Encode a version in the filename, so that if we ever switch away from JSON we can + // increment this. 
+ const VERSION: u8 = 1; + + self.deletion_prefix().join(format!("header-{VERSION:02x}")) + } + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenants_path().join(tenant_id.to_string()) } @@ -747,7 +768,14 @@ impl PageServerConf { }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?), + "control_plane_api" => { + let parsed = parse_toml_string(key, item)?; + if parsed.is_empty() { + builder.control_plane_api(None) + } else { + builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + } + }, _ => bail!("unrecognized pageserver option '{key}'"), } } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 192eb167894b..555f76e5239e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,7 +1,9 @@ use std::collections::HashMap; -use hyper::StatusCode; -use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use pageserver_api::control_api::{ + ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, +}; +use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; use utils::{ @@ -12,25 +14,34 @@ use utils::{ use crate::config::PageServerConf; -// Backoffs when control plane requests do not succeed: compromise between reducing load -// on control plane, and retrying frequently when we are blocked on a control plane -// response to make progress. -const BACKOFF_INCREMENT: f64 = 0.1; -const BACKOFF_MAX: f64 = 10.0; - /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) -pub(crate) struct ControlPlaneClient { +pub struct ControlPlaneClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, cancel: CancellationToken, } +/// Represent operations which internally retry on all errors other than +/// cancellation token firing: the only way they can fail is ShuttingDown. +pub enum RetryForeverError { + ShuttingDown, +} + +#[async_trait::async_trait] +pub trait ControlPlaneGenerationsApi { + async fn re_attach(&self) -> Result, RetryForeverError>; + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError>; +} + impl ControlPlaneClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. 
- pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { let mut url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), None => return None, @@ -54,27 +65,62 @@ impl ControlPlaneClient { }) } - async fn try_re_attach( + async fn retry_http_forever( &self, - url: Url, - request: &ReAttachRequest, - ) -> anyhow::Result { - match self.http_client.post(url).json(request).send().await { - Err(e) => Err(anyhow::Error::from(e)), - Ok(r) => { - if r.status() == StatusCode::OK { - r.json::() - .await - .map_err(anyhow::Error::from) - } else { - Err(anyhow::anyhow!("Unexpected status {}", r.status())) - } + url: &url::Url, + request: R, + ) -> Result + where + R: Serialize, + T: DeserializeOwned, + { + #[derive(thiserror::Error, Debug)] + enum RemoteAttemptError { + #[error("shutdown")] + Shutdown, + #[error("remote: {0}")] + Remote(reqwest::Error), + } + + match backoff::retry( + || async { + let response = self + .http_client + .post(url.clone()) + .json(&request) + .send() + .await + .map_err(RemoteAttemptError::Remote)?; + + response + .error_for_status_ref() + .map_err(RemoteAttemptError::Remote)?; + response + .json::() + .await + .map_err(RemoteAttemptError::Remote) + }, + |_| false, + 3, + u32::MAX, + "calling control plane generation validation API", + backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + ) + .await + { + Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), + Err(RemoteAttemptError::Remote(_)) => { + panic!("We retry forever, this should never be reached"); } + Ok(r) => Ok(r), } } +} - /// Block until we get a successful response - pub(crate) async fn re_attach(&self) -> anyhow::Result> { +#[async_trait::async_trait] +impl ControlPlaneGenerationsApi for ControlPlaneClient { + /// Block until we get a successful response, or error out if we are shut down + async fn re_attach(&self) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -83,37 +129,47 @@ impl ControlPlaneClient { node_id: self.node_id, }; - let mut attempt = 0; - loop { - let result = self.try_re_attach(re_attach_path.clone(), &request).await; - match result { - Ok(res) => { - tracing::info!( - "Received re-attach response with {} tenants", - res.tenants.len() - ); - - return Ok(res - .tenants - .into_iter() - .map(|t| (t.id, Generation::new(t.generation))) - .collect::>()); - } - Err(e) => { - tracing::error!("Error re-attaching tenants, retrying: {e:#}"); - backoff::exponential_backoff( - attempt, - BACKOFF_INCREMENT, - BACKOFF_MAX, - &self.cancel, - ) - .await; - if self.cancel.is_cancelled() { - return Err(anyhow::anyhow!("Shutting down")); - } - attempt += 1; - } - } - } + let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; + tracing::info!( + "Received re-attach response with {} tenants", + response.tenants.len() + ); + + Ok(response + .tenants + .into_iter() + .map(|t| (t.id, Generation::new(t.generation))) + .collect::>()) + } + + /// Block until we get a successful response, or error out if we are shut down + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result, RetryForeverError> { + let re_attach_path = self + .base_url + .join("validate") + .expect("Failed to build validate path"); + + let request = ValidateRequest { + tenants: tenants + .into_iter() + .map(|(id, gen)| ValidateRequestTenant { + id, + gen: gen + .into() + 
.expect("Generation should always be valid for a Tenant doing deletions"), + }) + .collect(), + }; + + let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; + + Ok(response + .tenants + .into_iter() + .map(|rt| (rt.id, rt.valid)) + .collect()) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs new file mode 100644 index 000000000000..4c0d399789a9 --- /dev/null +++ b/pageserver/src/deletion_queue.rs @@ -0,0 +1,1312 @@ +mod deleter; +mod list_writer; +mod validator; + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Duration; + +use crate::control_plane_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::virtual_file::VirtualFile; +use anyhow::Context; +use hex::FromHex; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use serde::Deserialize; +use serde::Serialize; +use serde_with::serde_as; +use thiserror::Error; +use tokio; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use tracing::{self, debug, error}; +use utils::crashsafe::path_with_suffix_extension; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::AtomicLsn; +use utils::lsn::Lsn; + +use self::deleter::Deleter; +use self::list_writer::DeletionOp; +use self::list_writer::ListWriter; +use self::list_writer::RecoverOp; +use self::validator::Validator; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; +use validator::ValidatorQueueMessage; + +use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; + +// TODO: adminstrative "panic button" config property to disable all deletions +// TODO: configurable for how long to wait before executing deletions + +/// We aggregate object deletions from many tenants in one place, for several reasons: +/// - Coalesce deletions into fewer DeleteObjects calls +/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes +/// to flush any outstanding deletions. +/// - Globally control throughput of deletions, as these are a low priority task: do +/// not compete with the same S3 clients/connections used for higher priority uploads. +/// - Enable gating deletions on validation of a tenant's generation number, to make +/// it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md) +/// +/// There are two kinds of deletion: deferred and immediate. A deferred deletion +/// may be intentionally delayed to protect passive readers of S3 data, and is +/// subject to a generation number validation step. An immediate deletion is +/// ready to execute immediately, and is only queued up so that it can be coalesced +/// with other deletions in flight. +/// +/// Deferred deletions pass through three steps: +/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into +/// DeletionLists, which are persisted to disk. +/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing +/// the keys in the list onward for actual deletion. Also validate remote_consistent_lsn +/// updates for running timelines. +/// - Deleter: accumulate object keys that the validator has validated, and execute them in +/// batches of 1000 keys via DeleteObjects. 
+/// +/// Non-deferred deletions, such as during timeline deletion, bypass the first +/// two stages and are passed straight into the Deleter. +/// +/// Internally, each stage is joined by a channel to the next. On disk, there is only +/// one queue (of DeletionLists), which is written by the frontend and consumed +/// by the backend. +#[derive(Clone)] +pub struct DeletionQueue { + client: DeletionQueueClient, + + // Parent cancellation token for the tokens passed into background workers + cancel: CancellationToken, +} + +/// Opaque wrapper around individual worker tasks, to avoid making the +/// worker objects themselves public +pub struct DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync, +{ + frontend: ListWriter, + backend: Validator, + executor: Deleter, +} + +impl DeletionQueueWorkers +where + C: ControlPlaneGenerationsApi + Send + Sync + 'static, +{ + pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> { + let jh_frontend = runtime.spawn(async move { + self.frontend + .background() + .instrument(tracing::info_span!(parent:None, "deletion frontend")) + .await + }); + let jh_backend = runtime.spawn(async move { + self.backend + .background() + .instrument(tracing::info_span!(parent:None, "deletion backend")) + .await + }); + let jh_executor = runtime.spawn(async move { + self.executor + .background() + .instrument(tracing::info_span!(parent:None, "deletion executor")) + .await + }); + + runtime.spawn({ + async move { + jh_frontend.await.expect("error joining frontend worker"); + jh_backend.await.expect("error joining backend worker"); + drop(jh_executor.await.expect("error joining executor worker")); + } + }) + } +} + +/// A FlushOp is just a oneshot channel, where we send the transmit side down +/// another channel, and the receive side will receive a message when the channel +/// we're flushing has reached the FlushOp we sent into it. +/// +/// The only extra behavior beyond the channel is that the notify() method does not +/// return an error when the receive side has been dropped, because in this use case +/// it is harmless (the code that initiated the flush no longer cares about the result). +#[derive(Debug)] +struct FlushOp { + tx: tokio::sync::oneshot::Sender<()>, +} + +impl FlushOp { + fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) { + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + (Self { tx }, rx) + } + + fn notify(self) { + if self.tx.send(()).is_err() { + // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush. + debug!("deletion queue flush from dropped client"); + }; + } +} + +#[derive(Clone, Debug)] +pub struct DeletionQueueClient { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + + lsn_table: Arc>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TenantDeletionList { + /// For each Timeline, a list of key fragments to append to the timeline remote path + /// when reconstructing a full key + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + timelines: HashMap>, + + /// The generation in which this deletion was emitted: note that this may not be the + /// same as the generation of any layers being deleted. 
The generation of the layer + /// has already been absorbed into the keys in `objects` + generation: Generation, +} + +impl TenantDeletionList { + pub(crate) fn len(&self) -> usize { + self.timelines.values().map(|v| v.len()).sum() + } +} + +/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string +fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error> +where + S: serde::Serializer, + V: Serialize, + I: AsRef<[u8]>, +{ + let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); + + transformed + .collect::<HashMap<String, V>>() + .serialize(serializer) +} + +/// For HashMaps using a FromHex key, where we would like to decode the key +fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error> +where + D: serde::de::Deserializer<'de>, + V: Deserialize<'de>, + I: FromHex + std::hash::Hash + Eq, +{ + let hex_map = HashMap::<String, V>::deserialize(deserializer)?; + hex_map + .into_iter() + .map(|(k, v)| { + I::from_hex(k) + .map(|k| (k, v)) + .map_err(|_| serde::de::Error::custom("Invalid hex ID")) + }) + .collect() +} + +/// Files ending with this suffix will be ignored and erased +/// during recovery at startup. +const TEMP_SUFFIX: &str = ".tmp"; + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionList { + /// Serialization version, for future use + version: u8, + + /// Used for constructing a unique key for each deletion list we write out. + sequence: u64, + + /// To avoid repeating tenant/timeline IDs in every key, we store keys in + /// nested HashMaps by TenantTimelineID. Each Tenant only appears once + /// with one unique generation ID: if someone tries to push a second generation + /// ID for the same tenant, we will start a new DeletionList. + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + tenants: HashMap<TenantId, TenantDeletionList>, + + /// Avoid having to walk `tenants` to calculate the number of keys in + /// the nested deletion lists + size: usize, + + /// Set to true when the list has undergone validation with the control + /// plane and the remaining contents of `tenants` are valid. A list may + /// also be implicitly marked valid by DeletionHeader.validated_sequence + /// advancing to >= DeletionList.sequence + #[serde(default)] + #[serde(skip_serializing_if = "std::ops::Not::not")] + validated: bool, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize)] +struct DeletionHeader { + /// Serialization version, for future use + version: u8, + + /// The highest sequence number (inclusive) that has been validated. All deletion + /// lists on disk with a sequence <= this value are safe to execute.
+ validated_sequence: u64, +} + +impl DeletionHeader { + const VERSION_LATEST: u8 = 1; + + fn new(validated_sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + validated_sequence, + } + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + debug!("Saving deletion list header {:?}", self); + let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; + let header_path = conf.deletion_header_path(); + let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); + VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + .await + .map_err(Into::into) + } +} + +impl DeletionList { + const VERSION_LATEST: u8 = 1; + fn new(sequence: u64) -> Self { + Self { + version: Self::VERSION_LATEST, + sequence, + tenants: HashMap::new(), + size: 0, + validated: false, + } + } + + fn is_empty(&self) -> bool { + self.tenants.is_empty() + } + + fn len(&self) -> usize { + self.size + } + + /// Returns true if the push was accepted, false if the caller must start a new + /// deletion list. + fn push( + &mut self, + tenant: &TenantId, + timeline: &TimelineId, + generation: Generation, + objects: &mut Vec, + ) -> bool { + if objects.is_empty() { + // Avoid inserting an empty TimelineDeletionList: this preserves the property + // that if we have no keys, then self.objects is empty (used in Self::is_empty) + return true; + } + + let tenant_entry = self + .tenants + .entry(*tenant) + .or_insert_with(|| TenantDeletionList { + timelines: HashMap::new(), + generation, + }); + + if tenant_entry.generation != generation { + // Only one generation per tenant per list: signal to + // caller to start a new list. + return false; + } + + let timeline_entry = tenant_entry + .timelines + .entry(*timeline) + .or_insert_with(Vec::new); + + let timeline_remote_path = remote_timeline_path(tenant, timeline); + + self.size += objects.len(); + timeline_entry.extend(objects.drain(..).map(|p| { + p.strip_prefix(&timeline_remote_path) + .expect("Timeline paths always start with the timeline prefix") + .to_string_lossy() + .to_string() + })); + true + } + + fn into_remote_paths(self) -> Vec { + let mut result = Vec::new(); + for (tenant, tenant_deletions) in self.tenants.into_iter() { + for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() { + let timeline_remote_path = remote_timeline_path(&tenant, &timeline); + result.extend( + timeline_layers + .into_iter() + .map(|l| timeline_remote_path.join(&PathBuf::from(l))), + ); + } + } + + result + } + + async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> { + let path = conf.deletion_list_path(self.sequence); + let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); + + let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); + VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + .await + .map_err(Into::into) + } +} + +impl std::fmt::Display for DeletionList { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletionList", + self.sequence, + self.tenants.len(), + self.size + ) + } +} + +struct PendingLsn { + projected: Lsn, + result_slot: Arc, +} + +struct TenantLsnState { + timelines: HashMap, + + // In what generation was the most recent update proposed? 
+ generation: Generation, +} + +#[derive(Default)] +struct VisibleLsnUpdates { + tenants: HashMap, +} + +impl VisibleLsnUpdates { + fn new() -> Self { + Self { + tenants: HashMap::new(), + } + } +} + +impl std::fmt::Debug for VisibleLsnUpdates { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len()) + } +} + +#[derive(Error, Debug)] +pub enum DeletionQueueError { + #[error("Deletion queue unavailable during shutdown")] + ShuttingDown, +} + +impl DeletionQueueClient { + pub(crate) fn broken() -> Self { + // Channels whose receivers are immediately dropped. + let (tx, _rx) = tokio::sync::mpsc::channel(1); + let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1); + Self { + tx, + executor_tx, + lsn_table: Arc::default(), + } + } + + /// This is cancel-safe. If you drop the future before it completes, the message + /// is not pushed, although in the context of the deletion queue it doesn't matter: once + /// we decide to do a deletion the decision is always final. + async fn do_push( + &self, + queue: &tokio::sync::mpsc::Sender, + msg: T, + ) -> Result<(), DeletionQueueError> { + match queue.send(msg).await { + Ok(_) => Ok(()), + Err(e) => { + // This shouldn't happen, we should shut down all tenants before + // we shut down the global delete queue. If we encounter a bug like this, + // we may leak objects as deletions won't be processed. + error!("Deletion queue closed while pushing, shutting down? ({e})"); + Err(DeletionQueueError::ShuttingDown) + } + } + } + + pub(crate) async fn recover( + &self, + attached_tenants: HashMap, + ) -> Result<(), DeletionQueueError> { + self.do_push( + &self.tx, + ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }), + ) + .await + } + + /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside + /// world, it must validate its generation number before doing so. Rather than do this synchronously, + /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most + /// recently validated separately. + /// + /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates. The + /// backend will later wake up and notice that the tenant's generation requires validation. + pub(crate) async fn update_remote_consistent_lsn( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + lsn: Lsn, + result_slot: Arc, + ) { + let mut locked = self + .lsn_table + .write() + .expect("Lock should never be poisoned"); + + let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState { + timelines: HashMap::new(), + generation: current_generation, + }); + + if tenant_entry.generation != current_generation { + // Generation might have changed if we were detached and then re-attached: in this case, + // state from the previous generation cannot be trusted. + tenant_entry.timelines.clear(); + tenant_entry.generation = current_generation; + } + + tenant_entry.timelines.insert( + timeline_id, + PendingLsn { + projected: lsn, + result_slot, + }, + ); + } + + /// Submit a list of layers for deletion: this function will return before the deletion is + /// persistent, but it may be executed at any time after this function enters: do not push + /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer + /// references them). 
+ /// + /// The `current_generation` is the generation of this pageserver's current attachment. The + /// generations in `layers` are the generations in which those layers were written. + pub(crate) async fn push_layers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + current_generation: Generation, + layers: Vec<(LayerFileName, Generation)>, + ) -> Result<(), DeletionQueueError> { + if current_generation.is_none() { + debug!("Enqueuing deletions in legacy mode, skipping queue"); + let mut layer_paths = Vec::new(); + for (layer, generation) in layers { + layer_paths.push(remote_layer_path( + &tenant_id, + &timeline_id, + &layer, + generation, + )); + } + self.push_immediate(layer_paths).await?; + return self.flush_immediate().await; + } + + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(layers.len() as u64); + self.do_push( + &self.tx, + ListWriterQueueMessage::Delete(DeletionOp { + tenant_id, + timeline_id, + layers, + generation: current_generation, + objects: Vec::new(), + }), + ) + .await + } + + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + async fn do_flush<T>( + &self, + queue: &tokio::sync::mpsc::Sender<T>, + msg: T, + rx: tokio::sync::oneshot::Receiver<()>, + ) -> Result<(), DeletionQueueError> { + self.do_push(queue, msg).await?; + if rx.await.is_err() { + // This shouldn't happen if tenants are shut down before deletion queue. If we + // encounter a bug like this, then a flusher will incorrectly believe it has flushed + // when it hasn't, possibly leading to leaking objects. + error!("Deletion queue dropped flush op while client was still waiting"); + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList) + /// + /// This is cancel-safe. If you drop the future the flush may still happen in the background. + pub async fn flush(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx) + .await + } + + // Wait until all previous deletions are executed + pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> { + debug!("flush_execute: flushing to deletion lists..."); + // Flush any buffered work to deletion lists + self.flush().await?; + + // Flush the backend into the executor of deletion lists + let (flush_op, rx) = FlushOp::new(); + debug!("flush_execute: flushing backend..."); + self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing backend..."); + + // Flush any immediate-mode deletions (the above backend flush will only flush + // the executor if deletions had flowed through the backend) + debug!("flush_execute: flushing execution..."); + let (flush_op, rx) = FlushOp::new(); + self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx) + .await?; + debug!("flush_execute: finished flushing execution..."); + Ok(()) + } + + /// This interface bypasses the persistent deletion queue, and any validation + /// that this pageserver is still eligible to execute the deletions. It is for + /// use in timeline deletions, where the control plane is telling us we may + /// delete everything in the timeline. + /// + /// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
+ pub(crate) async fn push_immediate( + &self, + objects: Vec, + ) -> Result<(), DeletionQueueError> { + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(objects.len() as u64); + self.executor_tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown) + } + + /// Companion to push_immediate. When this returns Ok, all prior objects sent + /// into push_immediate have been deleted from remote storage. + pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> { + let (flush_op, rx) = FlushOp::new(); + self.executor_tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } +} + +impl DeletionQueue { + pub fn new_client(&self) -> DeletionQueueClient { + self.client.clone() + } + + /// Caller may use the returned object to construct clients with new_client. + /// Caller should tokio::spawn the background() members of the two worker objects returned: + /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. + /// + /// If remote_storage is None, then the returned workers will also be None. + pub fn new( + remote_storage: Option, + control_plane_client: Option, + conf: &'static PageServerConf, + ) -> (Self, Option>) + where + C: ControlPlaneGenerationsApi + Send + Sync, + { + // Deep channel: it consumes deletions from all timelines and we do not want to block them + let (tx, rx) = tokio::sync::mpsc::channel(16384); + + // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions + let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16); + + // Shallow channel: it carries lists of paths, and we expect the main queueing to + // happen in the backend (persistent), not in this queue. + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16); + + let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())); + + // The deletion queue has an independent cancellation token to + // the general pageserver shutdown token, because it stays alive a bit + // longer to flush after Tenants have all been torn down. + let cancel = CancellationToken::new(); + + let remote_storage = match remote_storage { + None => { + return ( + Self { + client: DeletionQueueClient { + tx, + executor_tx, + lsn_table: lsn_table.clone(), + }, + cancel, + }, + None, + ) + } + Some(r) => r, + }; + + ( + Self { + client: DeletionQueueClient { + tx, + executor_tx: executor_tx.clone(), + lsn_table: lsn_table.clone(), + }, + cancel: cancel.clone(), + }, + Some(DeletionQueueWorkers { + frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()), + backend: Validator::new( + conf, + backend_rx, + executor_tx, + control_plane_client, + lsn_table.clone(), + cancel.clone(), + ), + executor: Deleter::new(remote_storage, executor_rx, cancel.clone()), + }), + ) + } + + pub async fn shutdown(&mut self, timeout: Duration) { + self.cancel.cancel(); + + match tokio::time::timeout(timeout, self.client.flush()).await { + Ok(Ok(())) => { + tracing::info!("Deletion queue flushed successfully on shutdown") + } + Ok(Err(DeletionQueueError::ShuttingDown)) => { + // This is not harmful for correctness, but is unexpected: the deletion + // queue's workers should stay alive as long as there are any client handles instantiated. 
+ tracing::warn!("Deletion queue stopped prematurely"); + } + Err(_timeout) => { + tracing::warn!("Timed out flushing deletion queue on shutdown") + } + } + } +} + +#[cfg(test)] +mod test { + use hex_literal::hex; + use std::{ + io::ErrorKind, + path::{Path, PathBuf}, + time::Duration, + }; + use tracing::info; + + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; + use tokio::task::JoinHandle; + + use crate::{ + control_plane_client::RetryForeverError, + repository::Key, + tenant::{ + harness::TenantHarness, remote_timeline_client::remote_timeline_path, + storage_layer::DeltaFileName, + }, + }; + + use super::*; + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), + }); + + // When you need a second layer in a test. + pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), + lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), + }); + + struct TestSetup { + harness: TenantHarness, + remote_fs_dir: PathBuf, + storage: GenericRemoteStorage, + mock_control_plane: MockControlPlane, + deletion_queue: DeletionQueue, + worker_join: JoinHandle<()>, + } + + impl TestSetup { + /// Simulate a pageserver restart by destroying and recreating the deletion queue + async fn restart(&mut self) { + let (deletion_queue, workers) = DeletionQueue::new( + Some(self.storage.clone()), + Some(self.mock_control_plane.clone()), + self.harness.conf, + ); + + tracing::debug!("Spawning worker for new queue queue"); + let worker_join = workers + .unwrap() + .spawn_with(&tokio::runtime::Handle::current()); + + let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join); + let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue); + + tracing::debug!("Joining worker from previous queue"); + old_deletion_queue.cancel.cancel(); + old_worker_join + .await + .expect("Failed to join workers for previous deletion queue"); + } + + fn set_latest_generation(&self, gen: Generation) { + let tenant_id = self.harness.tenant_id; + self.mock_control_plane + .latest_generation + .lock() + .unwrap() + .insert(tenant_id, gen); + } + + /// Returns remote layer file name, suitable for use in assert_remote_files + fn write_remote_layer( + &self, + file_name: LayerFileName, + gen: Generation, + ) -> anyhow::Result { + let tenant_id = self.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); + std::fs::create_dir_all(&remote_timeline_path)?; + let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + + let content: Vec = format!("placeholder contents of {file_name}").into(); + + std::fs::write( + remote_timeline_path.join(remote_layer_file_name.clone()), + content, + )?; + + Ok(remote_layer_file_name) + } + } + + #[derive(Debug, Clone)] + struct MockControlPlane { + pub latest_generation: std::sync::Arc>>, + } + + impl MockControlPlane { + fn new() -> Self { + Self { + latest_generation: Arc::default(), + } + } + } + + #[async_trait::async_trait] + impl ControlPlaneGenerationsApi for MockControlPlane { + #[allow(clippy::diverging_sub_expression)] // False positive via 
async_trait + async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> { + unimplemented!() + } + async fn validate( + &self, + tenants: Vec<(TenantId, Generation)>, + ) -> Result<HashMap<TenantId, bool>, RetryForeverError> { + let mut result = HashMap::new(); + + let latest_generation = self.latest_generation.lock().unwrap(); + + for (tenant_id, generation) in tenants { + if let Some(latest) = latest_generation.get(&tenant_id) { + result.insert(tenant_id, *latest == generation); + } + } + + Ok(result) + } + } + + fn setup(test_name: &str) -> anyhow::Result<TestSetup> { + let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}"))); + let harness = TenantHarness::create(test_name)?; + + // We do not load() the harness: we only need its config and remote_storage + + // Set up a GenericRemoteStorage targeting a directory + let remote_fs_dir = harness.conf.workdir.join("remote_fs"); + std::fs::create_dir_all(remote_fs_dir)?; + let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; + let storage_config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + ) + .unwrap(), + max_sync_errors: std::num::NonZeroU32::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + ) + .unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); + + let mock_control_plane = MockControlPlane::new(); + + let (deletion_queue, worker) = DeletionQueue::new( + Some(storage.clone()), + Some(mock_control_plane.clone()), + harness.conf, + ); + + let worker = worker.unwrap(); + let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); + + Ok(TestSetup { + harness, + remote_fs_dir, + storage, + mock_control_plane, + deletion_queue, + worker_join, + }) + } + + // TODO: put this in a common location so that we can share with remote_timeline_client's tests + fn assert_remote_files(expected: &[&str], remote_path: &Path) { + let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect(); + expected.sort(); + + let mut found: Vec<String> = Vec::new(); + let dir = match std::fs::read_dir(remote_path) { + Ok(d) => d, + Err(e) => { + if e.kind() == ErrorKind::NotFound { + if expected.is_empty() { + // We are asserting prefix is empty: it is expected that the dir is missing + return; + } else { + assert_eq!(expected, Vec::<String>::new()); + unreachable!(); + } + } else { + panic!( + "Unexpected error listing {}: {e}", + remote_path.to_string_lossy() + ); + } + } + }; + + for entry in dir.flatten() { + let entry_name = entry.file_name(); + let fname = entry_name.to_str().unwrap(); + found.push(String::from(fname)); + } + found.sort(); + + assert_eq!(expected, found); + } + + fn assert_local_files(expected: &[&str], directory: &Path) { + let dir = match std::fs::read_dir(directory) { + Ok(d) => d, + Err(_) => { + assert_eq!(expected, &Vec::<String>::new()); + return; + } + }; + let mut found = Vec::new(); + for dentry in dir { + let dentry = dentry.unwrap(); + let file_name = dentry.file_name(); + let file_name_str = file_name.to_string_lossy(); + found.push(file_name_str.to_string()); + } + found.sort(); + assert_eq!(expected, found); + } + + #[tokio::test] + async fn deletion_queue_smoke() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let ctx = setup("deletion_queue_smoke").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); +
client.recover(HashMap::new()).await?; + + let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let tenant_id = ctx.harness.tenant_id; + + let content: Vec = "victim1 contents".into(); + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + // Exercise the distinction between the generation of the layers + // we delete, and the generation of the running Tenant. + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + let remote_layer_file_name_1 = + format!("{}{}", layer_file_name_1, layer_generation.get_suffix()); + + // Set mock control plane state to valid for our generation + ctx.set_latest_generation(now_generation); + + // Inject a victim file to remote storage + info!("Writing"); + std::fs::create_dir_all(&remote_timeline_path)?; + std::fs::write( + remote_timeline_path.join(remote_layer_file_name_1.clone()), + content, + )?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) + info!("Pushing"); + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_generation)].to_vec(), + ) + .await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + + assert_local_files(&[], &deletion_prefix); + + // File should still be there after we write a deletion list (we haven't pushed enough to execute anything) + info!("Flushing"); + client.flush().await?; + assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); + assert_local_files(&["0000000000000001-01.list"], &deletion_prefix); + + // File should go away when we execute + info!("Flush-executing"); + client.flush_execute().await?; + assert_remote_files(&[], &remote_timeline_path); + assert_local_files(&["header-01"], &deletion_prefix); + + // Flushing on an empty queue should succeed immediately, and not write any lists + info!("Flush-executing on empty"); + client.flush_execute().await?; + assert_local_files(&["header-01"], &deletion_prefix); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_validation() -> anyhow::Result<()> { + let ctx = setup("deletion_queue_validation").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + // Generation that the control plane thinks is current + let latest_generation = Generation::new(0xdeadbeef); + // Generation that our DeletionQueue thinks the tenant is running with + let stale_generation = latest_generation.previous(); + // Generation that our example layer file was written with + let layer_generation = stale_generation.previous(); + + ctx.set_latest_generation(latest_generation); + + let tenant_id = ctx.harness.tenant_id; + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + + // Initial state: a remote layer exists + let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + 
stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a stale generation: it should have failed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[&remote_layer_name], &remote_timeline_path); + + tracing::debug!("Pushing..."); + client + .push_layers( + tenant_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // We enqueued the operation in a fresh generation: it should have passed validation + tracing::debug!("Flushing..."); + tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??; + assert_remote_files(&[], &remote_timeline_path); + + Ok(()) + } + + #[tokio::test] + async fn deletion_queue_recovery() -> anyhow::Result<()> { + // Basic test that the deletion queue processes the deletions we pass into it + let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup"); + let client = ctx.deletion_queue.new_client(); + client.recover(HashMap::new()).await?; + + let tenant_id = ctx.harness.tenant_id; + + let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); + let deletion_prefix = ctx.harness.conf.deletion_prefix(); + + let layer_generation = Generation::new(0xdeadbeef); + let now_generation = Generation::new(0xfeedbeef); + + // Inject a deletion in the generation before generation_now: after restart, + // this deletion should _not_ get executed (only the immediately previous + // generation gets that treatment) + let remote_layer_file_name_historical = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + ) + .await?; + + // Inject a deletion in generation_now itself: after restart (which advances + // the generation), this deletion should get executed, because we execute + // deletions in the immediately previous generation on the same node. + let remote_layer_file_name_previous = + ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; + client + .push_layers( + tenant_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(), + ) + .await?; + + client.flush().await?; + assert_remote_files( + &[ + &remote_layer_file_name_historical, + &remote_layer_file_name_previous, + ], + &remote_timeline_path, + ); + + // Different generations for the same tenant will cause two separate + // deletion lists to be emitted. + assert_local_files( + &["0000000000000001-01.list", "0000000000000002-01.list"], + &deletion_prefix, + ); + + // Simulate a node restart: the latest generation advances + let now_generation = now_generation.next(); + ctx.set_latest_generation(now_generation); + + // Restart the deletion queue + drop(client); + ctx.restart().await; + let client = ctx.deletion_queue.new_client(); + client + .recover(HashMap::from([(tenant_id, now_generation)])) + .await?; + + info!("Flush-executing"); + client.flush_execute().await?; + // The deletion from the immediately prior generation was executed, the one from + // an older generation was not.
+ assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path); + Ok(()) + } +} + +/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence +/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. +#[cfg(test)] +pub(crate) mod mock { + use tracing::info; + + use crate::tenant::remote_timeline_client::remote_layer_path; + + use super::*; + use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }; + + pub struct ConsumerState { + rx: tokio::sync::mpsc::Receiver, + executor_rx: tokio::sync::mpsc::Receiver, + } + + impl ConsumerState { + async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize { + let mut executed = 0; + + info!("Executing all pending deletions"); + + // Transform all executor messages to generic frontend messages + while let Ok(msg) = self.executor_rx.try_recv() { + match msg { + DeleterMessage::Delete(objects) => { + for path in objects { + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + DeleterMessage::Flush(flush_op) => { + flush_op.notify(); + } + } + } + + while let Ok(msg) = self.rx.try_recv() { + match msg { + ListWriterQueueMessage::Delete(op) => { + let mut objects = op.objects; + for (layer, generation) in op.layers { + objects.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + + for path in objects { + info!("Executing deletion {path}"); + match remote_storage.delete(&path).await { + Ok(_) => { + debug!("Deleted {path}"); + } + Err(e) => { + error!("Failed to delete {path}, leaking object! ({e})"); + } + } + executed += 1; + } + } + ListWriterQueueMessage::Flush(op) => { + op.notify(); + } + ListWriterQueueMessage::FlushExecute(op) => { + // We have already executed all prior deletions because mock does them inline + op.notify(); + } + ListWriterQueueMessage::Recover(_) => { + // no-op in mock + } + } + info!("All pending deletions have been executed"); + } + + executed + } + } + + pub struct MockDeletionQueue { + tx: tokio::sync::mpsc::Sender, + executor_tx: tokio::sync::mpsc::Sender, + executed: Arc, + remote_storage: Option, + consumer: std::sync::Mutex, + lsn_table: Arc>, + } + + impl MockDeletionQueue { + pub fn new(remote_storage: Option) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel(16384); + let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384); + + let executed = Arc::new(AtomicUsize::new(0)); + + Self { + tx, + executor_tx, + executed, + remote_storage, + consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), + } + } + + pub fn get_executed(&self) -> usize { + self.executed.load(Ordering::Relaxed) + } + + #[allow(clippy::await_holding_lock)] + pub async fn pump(&self) { + if let Some(remote_storage) = &self.remote_storage { + // Permit holding mutex across await, because this is only ever + // called once at a time in tests. 
+ let mut locked = self.consumer.lock().unwrap(); + let count = locked.consume(remote_storage).await; + self.executed.fetch_add(count, Ordering::Relaxed); + } + } + + pub(crate) fn new_client(&self) -> DeletionQueueClient { + DeletionQueueClient { + tx: self.tx.clone(), + executor_tx: self.executor_tx.clone(), + lsn_table: self.lsn_table.clone(), + } + } + } +} diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs new file mode 100644 index 000000000000..5c6e7dc9d7b6 --- /dev/null +++ b/pageserver/src/deletion_queue/deleter.rs @@ -0,0 +1,156 @@ +//! The deleter is the final stage in the deletion queue. It accumulates remote +//! paths to delete, and periodically executes them in batches of up to 1000 +//! using the DeleteObjects request. +//! +//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller +//! number of full-sized DeleteObjects requests, rather than a larger number of +//! smaller requests. + +use remote_storage::GenericRemoteStorage; +use remote_storage::RemotePath; +use remote_storage::MAX_KEYS_PER_DELETE; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use tracing::info; +use tracing::warn; + +use crate::metrics; + +use super::DeletionQueueError; +use super::FlushOp; + +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +pub(super) enum DeleterMessage { + Delete(Vec), + Flush(FlushOp), +} + +/// Non-persistent deletion queue, for coalescing multiple object deletes into +/// larger DeleteObjects requests. +pub(super) struct Deleter { + // Accumulate up to 1000 keys for the next deletion operation + accumulator: Vec, + + rx: tokio::sync::mpsc::Receiver, + + cancel: CancellationToken, + remote_storage: GenericRemoteStorage, +} + +impl Deleter { + pub(super) fn new( + remote_storage: GenericRemoteStorage, + rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, + ) -> Self { + Self { + remote_storage, + rx, + cancel, + accumulator: Vec::new(), + } + } + + /// Wrap the remote `delete_objects` with a failpoint + async fn remote_delete(&self) -> Result<(), anyhow::Error> { + fail::fail_point!("deletion-queue-before-execute", |_| { + info!("Skipping execution, failpoint set"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["failpoint"]) + .inc(); + Err(anyhow::anyhow!("failpoint hit")) + }); + + self.remote_storage.delete_objects(&self.accumulator).await + } + + /// Block until everything in accumulator has been executed + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + while !self.accumulator.is_empty() && !self.cancel.is_cancelled() { + match self.remote_delete().await { + Ok(()) => { + // Note: we assume that the remote storage layer returns Ok(()) if some + // or all of the deleted objects were already gone. 
+ metrics::DELETION_QUEUE + .keys_executed + .inc_by(self.accumulator.len() as u64); + info!( + "Executed deletion batch {}..{}", + self.accumulator + .first() + .expect("accumulator should be non-empty"), + self.accumulator + .last() + .expect("accumulator should be non-empty"), + ); + self.accumulator.clear(); + } + Err(e) => { + warn!("DeleteObjects request failed: {e:#}, will retry"); + metrics::DELETION_QUEUE + .remote_errors + .with_label_values(&["execute"]) + .inc(); + } + }; + } + if self.cancel.is_cancelled() { + // Expose an error because we may not have actually flushed everything + Err(DeletionQueueError::ShuttingDown) + } else { + Ok(()) + } + } + + pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> { + self.accumulator.reserve(MAX_KEYS_PER_DELETE); + + loop { + if self.cancel.is_cancelled() { + return Err(DeletionQueueError::ShuttingDown); + } + + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + return Err(DeletionQueueError::ShuttingDown); + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending + self.flush().await?; + + continue; + } + }; + + match msg { + DeleterMessage::Delete(mut list) => { + while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE { + if self.accumulator.len() == MAX_KEYS_PER_DELETE { + self.flush().await?; + // If we have received this number of keys, proceed with attempting to execute + assert_eq!(self.accumulator.len(), 0); + } + + let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len(); + let take_count = std::cmp::min(available_slots, list.len()); + for path in list.drain(list.len() - take_count..) { + self.accumulator.push(path); + } + } + } + DeleterMessage::Flush(flush_op) => { + // If flush() errors, we drop the flush_op and the caller will get + // an error recv()'ing their oneshot channel. + self.flush().await?; + flush_op.notify(); + } + } + } + } +} diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs new file mode 100644 index 000000000000..618a59f8fef8 --- /dev/null +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -0,0 +1,487 @@ +//! The list writer is the first stage in the deletion queue. It accumulates +//! layers to delete, and periodically writes out these layers into a persistent +//! DeletionList. +//! +//! The purpose of writing DeletionLists is to decouple the decision to +//! delete an object from the validation required to execute it: even if +//! validation is not possible, e.g. due to a control plane outage, we can +//! still persist our intent to delete an object, in a way that would +//! survive a restart. +//! +//! DeletionLists are passed onwards to the Validator. 
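+//!
+//! End to end, a layer's key passes through: ListWriter (buffer, persist to a
+//! DeletionList on local disk) -> Validator (check the originating generation
+//! with the control plane) -> Deleter (batch into DeleteObjects requests); the
+//! on-disk list is erased once its keys have been executed.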
+ +use super::DeletionHeader; +use super::DeletionList; +use super::FlushOp; +use super::ValidatorQueueMessage; + +use std::collections::HashMap; +use std::fs::create_dir_all; +use std::time::Duration; + +use regex::Regex; +use remote_storage::RemotePath; +use tokio_util::sync::CancellationToken; +use tracing::debug; +use tracing::info; +use tracing::warn; +use utils::generation::Generation; +use utils::id::TenantId; +use utils::id::TimelineId; + +use crate::config::PageServerConf; +use crate::deletion_queue::TEMP_SUFFIX; +use crate::metrics; +use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::storage_layer::LayerFileName; + +// The number of keys in a DeletionList before we will proactively persist it +// (without reaching a flush deadline). This aims to deliver objects of the order +// of magnitude 1MB when we are under heavy delete load. +const DELETION_LIST_TARGET_SIZE: usize = 16384; + +// Ordinarily, we only flush to DeletionList periodically, to bound the window during +// which we might leak objects from not flushing a DeletionList after +// the objects are already unlinked from timeline metadata. +const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000); + +// If someone is waiting for a flush to DeletionList, only delay a little to accumulate +// more objects before doing the flush. +const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); + +#[derive(Debug)] +pub(super) struct DeletionOp { + pub(super) tenant_id: TenantId, + pub(super) timeline_id: TimelineId, + // `layers` and `objects` are both just lists of objects. `layers` is used if you do not + // have a config object handy to project it to a remote key, and need the consuming worker + // to do it for you. + pub(super) layers: Vec<(LayerFileName, Generation)>, + pub(super) objects: Vec, + + /// The _current_ generation of the Tenant attachment in which we are enqueuing + /// this deletion. + pub(super) generation: Generation, +} + +#[derive(Debug)] +pub(super) struct RecoverOp { + pub(super) attached_tenants: HashMap, +} + +#[derive(Debug)] +pub(super) enum ListWriterQueueMessage { + Delete(DeletionOp), + // Wait until all prior deletions make it into a persistent DeletionList + Flush(FlushOp), + // Wait until all prior deletions have been executed (i.e. objects are actually deleted) + FlushExecute(FlushOp), + // Call once after re-attaching to control plane, to notify the deletion queue about + // latest attached generations & load any saved deletion lists from disk. + Recover(RecoverOp), +} + +pub(super) struct ListWriter { + conf: &'static PageServerConf, + + // Incoming frontend requests to delete some keys + rx: tokio::sync::mpsc::Receiver, + + // Outbound requests to the backend to execute deletion lists we have composed. + tx: tokio::sync::mpsc::Sender, + + // The list we are currently building, contains a buffer of keys to delete + // and our next sequence number + pending: DeletionList, + + // These FlushOps should notify the next time we flush + pending_flushes: Vec, + + // Worker loop is torn down when this fires. + cancel: CancellationToken, + + // Safety guard to do recovery exactly once + recovered: bool, +} + +impl ListWriter { + // Initially DeletionHeader.validated_sequence is zero. The place we start our + // sequence numbers must be higher than that. 
+ const BASE_SEQUENCE: u64 = 1; + + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + cancel: CancellationToken, + ) -> Self { + Self { + pending: DeletionList::new(Self::BASE_SEQUENCE), + conf, + rx, + tx, + pending_flushes: Vec::new(), + cancel, + recovered: false, + } + } + + /// Try to flush `list` to persistent storage + /// + /// This does not return errors, because on failure to flush we do not lose + /// any state: flushing will be retried implicitly on the next deadline + async fn flush(&mut self) { + if self.pending.is_empty() { + for f in self.pending_flushes.drain(..) { + f.notify(); + } + return; + } + + match self.pending.save(self.conf).await { + Ok(_) => { + info!(sequence = self.pending.sequence, "Stored deletion list"); + + for f in self.pending_flushes.drain(..) { + f.notify(); + } + + // Take the list we've accumulated, replace it with a fresh list for the next sequence + let next_list = DeletionList::new(self.pending.sequence + 1); + let list = std::mem::replace(&mut self.pending, next_list); + + if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await { + // This is allowed to fail: it will only happen if the backend worker is shut down, + // so we can just drop this on the floor. + info!("Deletion list dropped, this is normal during shutdown ({e:#})"); + } + } + Err(e) => { + metrics::DELETION_QUEUE.unexpected_errors.inc(); + warn!( + sequence = self.pending.sequence, + "Failed to write deletion list, will retry later ({e:#})" + ); + } + } + } + + /// Load the header, to learn the sequence number up to which deletions + /// have been validated. We will apply validated=true to DeletionLists + /// <= this sequence when loading them. + /// + /// It is not an error for the header to not exist: we return None, and + /// the caller should act as if validated_sequence is 0 + async fn load_validated_sequence(&self) -> Result, anyhow::Error> { + let header_path = self.conf.deletion_header_path(); + match tokio::fs::read(&header_path).await { + Ok(header_bytes) => { + match serde_json::from_slice::(&header_bytes) { + Ok(h) => Ok(Some(h.validated_sequence)), + Err(e) => { + warn!( + "Failed to deserialize deletion header, ignoring {}: {e:#}", + header_path.display() + ); + // This should never happen unless we make a mistake with our serialization. + // Ignoring a deletion header is not consequential for correctnes because all deletions + // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up. 
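+ // (The header only records how far validation had progressed; without it,
+ // old lists are simply reloaded as unvalidated and go through validation again.)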
+ metrics::DELETION_QUEUE.unexpected_errors.inc(); + Ok(None) + } + } + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + debug!( + "Deletion header {} not found, first start?", + header_path.display() + ); + Ok(None) + } else { + Err(anyhow::anyhow!(e)) + } + } + } + } + + async fn recover( + &mut self, + attached_tenants: HashMap, + ) -> Result<(), anyhow::Error> { + debug!( + "recovering with {} attached tenants", + attached_tenants.len() + ); + + // Load the header + let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0); + + self.pending.sequence = validated_sequence + 1; + + let deletion_directory = self.conf.deletion_prefix(); + let mut dir = match tokio::fs::read_dir(&deletion_directory).await { + Ok(d) => d, + Err(e) => { + warn!( + "Failed to open deletion list directory {}: {e:#}", + deletion_directory.display(), + ); + + // Give up: if we can't read the deletion list directory, we probably can't + // write lists into it later, so the queue won't work. + return Err(e.into()); + } + }; + + let list_name_pattern = + Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); + + let header_path = self.conf.deletion_header_path(); + let mut seqs: Vec = Vec::new(); + while let Some(dentry) = dir.next_entry().await? { + let file_name = dentry.file_name(); + let dentry_str = file_name.to_string_lossy(); + + if Some(file_name.as_os_str()) == header_path.file_name() { + // Don't try and parse the header's name like a list + continue; + } + + if dentry_str.ends_with(TEMP_SUFFIX) { + info!("Cleaning up temporary file {dentry_str}"); + let absolute_path = deletion_directory.join(dentry.file_name()); + if let Err(e) = tokio::fs::remove_file(&absolute_path).await { + // Non-fatal error: we will just leave the file behind but not + // try and load it. + warn!( + "Failed to clean up temporary file {}: {e:#}", + absolute_path.display() + ); + } + + continue; + } + + let file_name = dentry.file_name().to_owned(); + let basename = file_name.to_string_lossy(); + let seq_part = if let Some(m) = list_name_pattern.captures(&basename) { + m.name("sequence") + .expect("Non optional group should be present") + .as_str() + } else { + warn!("Unexpected key in deletion queue: {basename}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + }; + + let seq: u64 = match u64::from_str_radix(seq_part, 16) { + Ok(s) => s, + Err(e) => { + warn!("Malformed key '{basename}': {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + seqs.push(seq); + } + seqs.sort(); + + // Start our next deletion list from after the last location validated by + // previous process lifetime, or after the last location found (it is updated + // below after enumerating the deletion lists) + self.pending.sequence = validated_sequence + 1; + if let Some(max_list_seq) = seqs.last() { + self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1); + } + + for s in seqs { + let list_path = self.conf.deletion_list_path(s); + + let list_bytes = tokio::fs::read(&list_path).await?; + + let mut deletion_list = match serde_json::from_slice::(&list_bytes) { + Ok(l) => l, + Err(e) => { + // Drop the list on the floor: any objects it referenced will be left behind + // for scrubbing to clean up. This should never happen unless we have a serialization bug. 
+ warn!(sequence = s, "Failed to deserialize deletion list: {e}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + continue; + } + }; + + if deletion_list.sequence <= validated_sequence { + // If the deletion list falls below valid_seq, we may assume that it was + // already validated the last time this pageserver ran. Otherwise, we still + // load it, as it may still contain content valid in this generation. + deletion_list.validated = true; + } else { + // Special case optimization: if a tenant is still attached, and no other + // generation was issued to another node in the interval while we restarted, + // then we may treat deletion lists from the previous generation as if they + // belong to our currently attached generation, and proceed to validate & execute. + for (tenant_id, tenant_list) in &mut deletion_list.tenants { + if let Some(attached_gen) = attached_tenants.get(tenant_id) { + if attached_gen.previous() == tenant_list.generation { + tenant_list.generation = *attached_gen; + } + } + } + } + + info!( + validated = deletion_list.validated, + sequence = deletion_list.sequence, + "Recovered deletion list" + ); + + // We will drop out of recovery if this fails: it indicates that we are shutting down + // or the backend has panicked + metrics::DELETION_QUEUE + .keys_submitted + .inc_by(deletion_list.len() as u64); + self.tx + .send(ValidatorQueueMessage::Delete(deletion_list)) + .await?; + } + + info!(next_sequence = self.pending.sequence, "Replay complete"); + + Ok(()) + } + + /// This is the front-end ingest, where we bundle up deletion requests into DeletionList + /// and write them out, for later validation by the backend and execution by the executor. + pub(super) async fn background(&mut self) { + info!("Started deletion frontend worker"); + + // Synchronous, but we only do it once per process lifetime so it's tolerable + if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) { + tracing::error!( + "Failed to create deletion list directory {}, deletions will not be executed ({e})", + self.conf.deletion_prefix().display() + ); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + return; + } + + while !self.cancel.is_cancelled() { + let timeout = if self.pending_flushes.is_empty() { + FRONTEND_DEFAULT_TIMEOUT + } else { + FRONTEND_FLUSHING_TIMEOUT + }; + + let msg = match tokio::time::timeout(timeout, self.rx.recv()).await { + Ok(Some(msg)) => msg, + Ok(None) => { + // Queue sender destroyed, shutting down + break; + } + Err(_) => { + // Hit deadline, flush. + self.flush().await; + continue; + } + }; + + match msg { + ListWriterQueueMessage::Delete(op) => { + assert!( + self.recovered, + "Cannot process deletions before recovery. This is a bug." + ); + + debug!( + "Delete: ingesting {} layers, {} other objects", + op.layers.len(), + op.objects.len() + ); + + let mut layer_paths = Vec::new(); + for (layer, generation) in op.layers { + layer_paths.push(remote_layer_path( + &op.tenant_id, + &op.timeline_id, + &layer, + generation, + )); + } + layer_paths.extend(op.objects); + + if !self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ) { + self.flush().await; + let retry_succeeded = self.pending.push( + &op.tenant_id, + &op.timeline_id, + op.generation, + &mut layer_paths, + ); + if !retry_succeeded { + // Unexpected: after we flush, we should have + // drained self.pending, so a conflict on + // generation numbers should be impossible. + tracing::error!( + "Failed to enqueue deletions, leaking objects. This is a bug." 
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ }
+ }
+ }
+ ListWriterQueueMessage::Flush(op) => {
+ if self.pending.is_empty() {
+ // Execute immediately
+ debug!("Flush: No pending objects, flushing immediately");
+ op.notify()
+ } else {
+ // Execute next time we flush
+ debug!("Flush: adding to pending flush list for next deadline flush");
+ self.pending_flushes.push(op);
+ }
+ }
+ ListWriterQueueMessage::FlushExecute(op) => {
+ debug!("FlushExecute: passing through to backend");
+ // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+ if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+ info!("Can't flush, shutting down ({e})");
+ // Caller will get error when their oneshot sender was dropped.
+ }
+ }
+ ListWriterQueueMessage::Recover(op) => {
+ if self.recovered {
+ tracing::error!(
+ "Deletion queue recovery called more than once. This is a bug."
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+ continue;
+ }
+
+ if let Err(e) = self.recover(op.attached_tenants).await {
+ // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+ // queue receiver has been dropped, or something is critically broken with
+ // the local filesystem holding deletion lists.
+ info!(
+ "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+ );
+ metrics::DELETION_QUEUE.unexpected_errors.inc();
+ return;
+ } else {
+ self.recovered = true;
+ }
+ }
+ }
+
+ if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+ self.flush().await;
+ }
+ }
+ info!("Deletion queue shut down.");
+ }
+}
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
new file mode 100644
index 000000000000..64603045d23b
--- /dev/null
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whether the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current. See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old WAL can be deleted.
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+// +// This also causes updates to remote_consistent_lsn to be validated, even +// if there were no deletions enqueued. +const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); + +// If we have received this number of keys, proceed with attempting to execute +const AUTOFLUSH_KEY_COUNT: usize = 16384; + +#[derive(Debug)] +pub(super) enum ValidatorQueueMessage { + Delete(DeletionList), + Flush(FlushOp), +} +pub(super) struct Validator +where + C: ControlPlaneGenerationsApi, +{ + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + + // Client for calling into control plane API for validation of deletes + control_plane_client: Option, + + // DeletionLists which are waiting generation validation. Not safe to + // execute until [`validate`] has processed them. + pending_lists: Vec, + + // DeletionLists which have passed validation and are ready to execute. + validated_lists: Vec, + + // Sum of all the lengths of lists in pending_lists + pending_key_count: usize, + + // Lsn validation state: we read projected LSNs and write back visible LSNs + // after validation. This is the LSN equivalent of `pending_validation_lists`: + // it is drained in [`validate`] + lsn_table: Arc>, + + // If we failed to rewrite a deletion list due to local filesystem I/O failure, + // we must remember that and refuse to advance our persistent validated sequence + // number past the failure. + list_write_failed: Option, + + cancel: CancellationToken, +} + +impl Validator +where + C: ControlPlaneGenerationsApi, +{ + pub(super) fn new( + conf: &'static PageServerConf, + rx: tokio::sync::mpsc::Receiver, + tx: tokio::sync::mpsc::Sender, + control_plane_client: Option, + lsn_table: Arc>, + cancel: CancellationToken, + ) -> Self { + Self { + conf, + rx, + tx, + control_plane_client, + lsn_table, + pending_lists: Vec::new(), + validated_lists: Vec::new(), + pending_key_count: 0, + list_write_failed: None, + cancel, + } + } + /// Process any outstanding validations of generations of pending LSN updates or pending + /// DeletionLists. + /// + /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists + /// go into the queue of ready-to-execute lists. + async fn validate(&mut self) -> Result<(), DeletionQueueError> { + let mut tenant_generations = HashMap::new(); + for list in &self.pending_lists { + for (tenant_id, tenant_list) in &list.tenants { + // Note: DeletionLists are in logical time order, so generation always + // goes up. By doing a simple insert() we will always end up with + // the latest generation seen for a tenant. 
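+ // e.g. if two pending lists mention tenant T at generations 3 and then 4, we
+ // validate against 4; the older list's entries for T then fail the equality
+ // check further down and are dropped (leaked for the scrubber to clean up).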
+ tenant_generations.insert(*tenant_id, tenant_list.generation); + } + } + + let pending_lsn_updates = { + let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned"); + std::mem::take(&mut *lsn_table) + }; + for (tenant_id, update) in &pending_lsn_updates.tenants { + let entry = tenant_generations + .entry(*tenant_id) + .or_insert(update.generation); + if update.generation > *entry { + *entry = update.generation; + } + } + + if tenant_generations.is_empty() { + // No work to do + return Ok(()); + } + + let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client { + match control_plane_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); + } + } + } else { + // Control plane API disabled. In legacy mode we consider everything valid. + tenant_generations.keys().map(|k| (*k, true)).collect() + }; + + let mut validated_sequence: Option = None; + + // Apply the validation results to the pending LSN updates + for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants { + let validated_generation = tenant_generations + .get(&tenant_id) + .expect("Map was built from the same keys we're reading"); + + let valid = tenants_valid + .get(&tenant_id) + .copied() + // If the tenant was missing from the validation response, it has been deleted. + // The Timeline that requested the LSN update is probably already torn down, + // or will be torn down soon. In this case, drop the update by setting valid=false. + .unwrap_or(false); + + if valid && *validated_generation == tenant_lsn_state.generation { + for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines { + pending_lsn.result_slot.store(pending_lsn.projected); + } + } else { + // If we failed validation, then do not apply any of the projected updates + warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); + } + } + + // Apply the validation results to the pending deletion lists + for list in &mut self.pending_lists { + // Filter the list based on whether the server responded valid: true. + // If a tenant is omitted in the response, it has been deleted, and we should + // proceed with deletion. + let mut mutated = false; + list.tenants.retain(|tenant_id, tenant| { + let validated_generation = tenant_generations + .get(tenant_id) + .expect("Map was built from the same keys we're reading"); + + // If the tenant was missing from the validation response, it has been deleted. + // This means that a deletion is valid, but also redundant since the tenant's + // objects should have already been deleted. Treat it as invalid to drop the + // redundant deletion. + let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false); + + // A list is valid if it comes from the current _or previous_ generation. + // - The previous generation case is permitted due to how we store deletion lists locally: + // if we see the immediately previous generation in a locally stored deletion list, + // it proves that this node's disk was used for both current & previous generations, + // and therefore no other node was involved in between: the two generations may be + // logically treated as the same. 
+ // - In that previous generation case, we rewrote it to the current generation + // in recover(), so the comparison here is simply an equality. + + let this_list_valid = valid + && (tenant.generation == *validated_generation); + + if !this_list_valid { + warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); + mutated = true; + } + this_list_valid + }); + list.validated = true; + + if mutated { + // Save the deletion list if we had to make changes due to stale generations. The + // saved list is valid for execution. + if let Err(e) = list.save(self.conf).await { + // Highly unexpected. Could happen if e.g. disk full. + // If we didn't save the trimmed list, it is _not_ valid to execute. + warn!("Failed to save modified deletion list {list}: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + + // Rather than have a complex retry process, just drop it and leak the objects, + // scrubber will clean up eventually. + list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution. + + // We must remember this failure, to prevent later writing out a header that + // would imply the unwritable list was valid on disk. + if self.list_write_failed.is_none() { + self.list_write_failed = Some(list.sequence); + } + } + } + + validated_sequence = Some(list.sequence); + } + + if let Some(validated_sequence) = validated_sequence { + if let Some(list_write_failed) = self.list_write_failed { + // Rare error case: we failed to write out a deletion list to excise invalid + // entries, so we cannot advance the header's valid sequence number past that point. + // + // In this state we will continue to validate, execute and delete deletion lists, + // we just cannot update the header. It should be noticed and fixed by a human due to + // the nonzero value of our unexpected_errors metric. + warn!( + sequence_number = list_write_failed, + "Cannot write header because writing a deletion list failed earlier", + ); + } else { + // Write the queue header to record how far validation progressed. This avoids having + // to rewrite each DeletionList to set validated=true in it. + let header = DeletionHeader::new(validated_sequence); + + // Drop result because the validated_sequence is an optimization. If we fail to save it, + // then restart, we will drop some deletion lists, creating work for scrubber. + // The save() function logs a warning on error. + if let Err(e) = header.save(self.conf).await { + warn!("Failed to write deletion queue header: {e:#}"); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + } + } + } + + // Transfer the validated lists to the validated queue, for eventual execution + self.validated_lists.append(&mut self.pending_lists); + + Ok(()) + } + + async fn cleanup_lists(&mut self, list_paths: Vec) { + for list_path in list_paths { + debug!("Removing deletion list {}", list_path.display()); + + if let Err(e) = tokio::fs::remove_file(&list_path).await { + // Unexpected: we should have permissions and nothing else should + // be touching these files. We will leave the file behind. Subsequent + // pageservers will try and load it again: hopefully whatever storage + // issue (probably permissions) has been fixed by then. 
+ tracing::error!("Failed to delete {}: {e:#}", list_path.display()); + metrics::DELETION_QUEUE.unexpected_errors.inc(); + break; + } + } + } + + async fn flush(&mut self) -> Result<(), DeletionQueueError> { + tracing::debug!("Flushing with {} pending lists", self.pending_lists.len()); + + // Issue any required generation validation calls to the control plane + self.validate().await?; + + // After successful validation, nothing is pending: any lists that + // made it through validation will be in validated_lists. + assert!(self.pending_lists.is_empty()); + self.pending_key_count = 0; + + tracing::debug!( + "Validation complete, have {} validated lists", + self.validated_lists.len() + ); + + // Return quickly if we have no validated lists to execute. This avoids flushing the + // executor when an idle backend hits its autoflush interval + if self.validated_lists.is_empty() { + return Ok(()); + } + + // Drain `validated_lists` into the executor + let mut executing_lists = Vec::new(); + for list in self.validated_lists.drain(..) { + let list_path = self.conf.deletion_list_path(list.sequence); + let objects = list.into_remote_paths(); + self.tx + .send(DeleterMessage::Delete(objects)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + executing_lists.push(list_path); + } + + self.flush_executor().await?; + + // Erase the deletion lists whose keys have all be deleted from remote storage + self.cleanup_lists(executing_lists).await; + + Ok(()) + } + + async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> { + // Flush the executor, so that all the keys referenced by these deletion lists + // are actually removed from remote storage. This is a precondition to deleting + // the deletion lists themselves. + let (flush_op, rx) = FlushOp::new(); + self.tx + .send(DeleterMessage::Flush(flush_op)) + .await + .map_err(|_| DeletionQueueError::ShuttingDown)?; + + rx.await.map_err(|_| DeletionQueueError::ShuttingDown) + } + + pub(super) async fn background(&mut self) { + tracing::info!("Started deletion backend worker"); + + while !self.cancel.is_cancelled() { + let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await { + Ok(Some(m)) => m, + Ok(None) => { + // All queue senders closed + info!("Shutting down"); + break; + } + Err(_) => { + // Timeout, we hit deadline to execute whatever we have in hand. These functions will + // return immediately if no work is pending. + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + + continue; + } + }; + + match msg { + ValidatorQueueMessage::Delete(list) => { + if list.validated { + // A pre-validated list may only be seen during recovery, if we are recovering + // a DeletionList whose on-disk state has validated=true + self.validated_lists.push(list) + } else { + self.pending_key_count += list.len(); + self.pending_lists.push(list); + } + + if self.pending_key_count > AUTOFLUSH_KEY_COUNT { + match self.flush().await { + Ok(()) => {} + Err(DeletionQueueError::ShuttingDown) => { + // If we are shutting down, then auto-flush can safely be skipped + } + } + } + } + ValidatorQueueMessage::Flush(op) => { + match self.flush().await { + Ok(()) => { + op.notify(); + } + Err(DeletionQueueError::ShuttingDown) => { + // If we fail due to shutting down, we will just drop `op` to propagate that status. 
+ } + } + } + } + } + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 4988641d6a73..f5c1224f01e8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1093,6 +1093,9 @@ components: remote_consistent_lsn: type: string format: hex + remote_consistent_lsn_visible: + type: string + format: hex ancestor_timeline_id: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a8e914ba08d5..e61a9dcf3fad 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use anyhow::{anyhow, Context, Result}; +use futures::TryFutureExt; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; @@ -24,6 +25,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; @@ -34,7 +36,7 @@ use crate::tenant::mgr::{ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::timeline::Timeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ @@ -61,6 +63,7 @@ pub struct State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, } impl State { @@ -70,6 +73,7 @@ impl State { remote_storage: Option, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, + deletion_queue_client: DeletionQueueClient, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -82,8 +86,17 @@ impl State { remote_storage, broker_client, disk_usage_eviction_state, + deletion_queue_client, }) } + + fn tenant_resources(&self) -> TenantSharedResources { + TenantSharedResources { + broker_client: self.broker_client.clone(), + remote_storage: self.remote_storage.clone(), + deletion_queue_client: self.deletion_queue_client.clone(), + } + } } #[inline(always)] @@ -283,7 +296,12 @@ async fn build_timeline_info_common( }; let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); - let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let remote_consistent_lsn_projected = timeline + .get_remote_consistent_lsn_projected() + .unwrap_or(Lsn(0)); + let remote_consistent_lsn_visible = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); let walreceiver_status = timeline.walreceiver_status(); @@ -293,7 +311,8 @@ async fn build_timeline_info_common( ancestor_timeline_id, ancestor_lsn, disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - remote_consistent_lsn, + remote_consistent_lsn: remote_consistent_lsn_projected, + remote_consistent_lsn_visible, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), @@ -492,24 +511,23 @@ async fn tenant_attach_handler( let generation = 
get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if let Some(remote_storage) = &state.remote_storage { - mgr::attach_tenant( - state.conf, - tenant_id, - generation, - tenant_conf, - state.broker_client.clone(), - remote_storage.clone(), - &ctx, - ) - .instrument(info_span!("tenant_attach", %tenant_id)) - .await?; - } else { + if state.remote_storage.is_none() { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" ))); } + mgr::attach_tenant( + state.conf, + tenant_id, + generation, + tenant_conf, + state.tenant_resources(), + &ctx, + ) + .instrument(info_span!("tenant_attach", %tenant_id)) + .await?; + json_response(StatusCode::ACCEPTED, ()) } @@ -570,6 +588,7 @@ async fn tenant_load_handler( generation, state.broker_client.clone(), state.remote_storage.clone(), + state.deletion_queue_client.clone(), &ctx, ) .instrument(info_span!("load", %tenant_id)) @@ -911,8 +930,7 @@ async fn tenant_create_handler( tenant_conf, target_tenant_id, generation, - state.broker_client.clone(), - state.remote_storage.clone(), + state.tenant_resources(), &ctx, ) .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) @@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn deletion_queue_flush( + r: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&r); + + if state.remote_storage.is_none() { + // Nothing to do if remote storage is disabled. + return json_response(StatusCode::OK, ()); + } + + let execute = parse_query_param(&r, "execute")?.unwrap_or(false); + + let flush = async { + if execute { + state.deletion_queue_client.flush_execute().await + } else { + state.deletion_queue_client.flush().await + } + } + // DeletionQueueError's only case is shutting down. + .map_err(|_| ApiError::ShuttingDown); + + tokio::select! { + res = flush => { + res.map(|()| json_response(StatusCode::OK, ()))? + } + _ = cancel.cancelled() => { + Err(ApiError::ShuttingDown) + } + } +} + async fn active_timeline_of_active_tenant( tenant_id: TenantId, timeline_id: TimelineId, @@ -1463,6 +1514,9 @@ pub fn make_router( .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) + .put("/v1/deletion_queue/flush", |r| { + api_handler(r, deletion_queue_flush) + }) .put("/v1/tenant/:tenant_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3049ad6a4e8d..e370e063ba1d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,7 +3,8 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; -mod control_plane_client; +pub mod control_plane_client; +pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; @@ -27,6 +28,7 @@ pub mod failpoint_support; use std::path::Path; use crate::task_mgr::TaskKind; +use deletion_queue::DeletionQueue; use tracing::info; /// Current storage format version @@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; -#[tracing::instrument] -pub async fn shutdown_pageserver(exit_code: i32) { +#[tracing::instrument(skip_all, fields(%exit_code))] +pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { use std::time::Duration; // Shut down the libpq endpoint task. 
This prevents new connections from // being accepted. @@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) { ) .await; + // Best effort to persist any outstanding deletions, to avoid leaking objects + if let Some(mut deletion_queue) = deletion_queue { + deletion_queue.shutdown(Duration::from_secs(5)).await; + } + // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 98dee095a313..b085176f189e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -887,6 +887,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy .expect("failed to define a metric") }); +pub(crate) struct DeletionQueueMetrics { + pub(crate) keys_submitted: IntCounter, + pub(crate) keys_dropped: IntCounter, + pub(crate) keys_executed: IntCounter, + pub(crate) dropped_lsn_updates: IntCounter, + pub(crate) unexpected_errors: IntCounter, + pub(crate) remote_errors: IntCounterVec, +} +pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { + DeletionQueueMetrics{ + + keys_submitted: register_int_counter!( + "pageserver_deletion_queue_submitted_total", + "Number of objects submitted for deletion" + ) + .expect("failed to define a metric"), + + keys_dropped: register_int_counter!( + "pageserver_deletion_queue_dropped_total", + "Number of object deletions dropped due to stale generation." + ) + .expect("failed to define a metric"), + + keys_executed: register_int_counter!( + "pageserver_deletion_queue_executed_total", + "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed." + ) + .expect("failed to define a metric"), + + dropped_lsn_updates: register_int_counter!( + "pageserver_deletion_queue_dropped_lsn_updates_total", + "Updates to remote_consistent_lsn dropped due to stale generation number." + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_deletion_queue_unexpected_errors_total", + "Number of unexpected condiions that may stall the queue: any value above zero is unexpected." 
+ ) + .expect("failed to define a metric"), + remote_errors: register_int_counter_vec!( + "pageserver_deletion_queue_remote_errors_total", + "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects", + &["op_kind"], + ) + .expect("failed to define a metric") +} +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -1675,6 +1723,9 @@ pub fn preinitialize_metrics() { Lazy::force(c); }); + // Deletion queue stats + Lazy::force(&DELETION_QUEUE); + // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] .into_iter() diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 047fa761c36c..7a94c3449dba 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,7 +37,7 @@ impl Key { | self.field6 as i128 } - pub fn from_i128(x: i128) -> Self { + pub const fn from_i128(x: i128) -> Self { Key { field1: ((x >> 120) & 0xf) as u8, field2: ((x >> 104) & 0xFFFF) as u32, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 650bc119b624..017322ffb29b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -456,7 +456,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(1).await; + shutdown_pageserver(None, 1).await; } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1c92c618fa6c..47bfd4a8efe8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; +use crate::deletion_queue::DeletionQueueClient; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT_ACTIVATION; @@ -117,7 +118,7 @@ mod span; pub mod metadata; mod par_fsync; -mod remote_timeline_client; +pub mod remote_timeline_client; pub mod storage_layer; pub mod config; @@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: Option, + pub deletion_queue_client: DeletionQueueClient, } /// @@ -197,6 +199,9 @@ pub struct Tenant { // provides access to timeline data sitting in the remote storage pub(crate) remote_storage: Option, + // Access to global deletion queue for when this tenant wants to schedule a deletion + deletion_queue_client: DeletionQueueClient, + /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. 
cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -523,15 +528,20 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, + resources: TenantSharedResources, tenants: &'static tokio::sync::RwLock, - remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { // TODO dedup with spawn_load let tenant_conf = Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?; + let TenantSharedResources { + broker_client, + remote_storage, + deletion_queue_client, + } = resources; + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); let tenant = Arc::new(Tenant::new( TenantState::Attaching, @@ -540,7 +550,8 @@ impl Tenant { wal_redo_manager, tenant_id, generation, - Some(remote_storage.clone()), + remote_storage.clone(), + deletion_queue_client, )); // Do all the hard work in the background @@ -571,7 +582,7 @@ impl Tenant { let pending_deletion = { match DeleteTenantFlow::should_resume_deletion( conf, - Some(&remote_storage), + remote_storage.as_ref(), &tenant_clone, ) .await @@ -660,6 +671,7 @@ impl Tenant { for timeline_id in remote_timeline_ids { let client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -726,6 +738,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client: Some(remote_client), + deletion_queue_client: self.deletion_queue_client.clone(), }, ctx, ) @@ -750,6 +763,7 @@ impl Tenant { timeline_id, &index_part.metadata, Some(remote_timeline_client), + self.deletion_queue_client.clone(), None, ) .await @@ -851,6 +865,7 @@ impl Tenant { tenant_id, Generation::broken(), None, + DeletionQueueClient::broken(), )) } @@ -895,6 +910,7 @@ impl Tenant { tenant_id, generation, remote_storage.clone(), + resources.deletion_queue_client.clone(), ); let tenant = Arc::new(tenant); @@ -1302,6 +1318,7 @@ impl Tenant { timeline_id, &local_metadata, Some(remote_client), + self.deletion_queue_client.clone(), init_order, ) .await @@ -1351,6 +1368,7 @@ impl Tenant { timeline_id, &local_metadata, None, + self.deletion_queue_client.clone(), init_order, ) .await @@ -2242,6 +2260,9 @@ impl Tenant { Ok(timeline) } + // Allow too_many_arguments because a constructor's argument list naturally grows with the + // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. 
+ #[allow(clippy::too_many_arguments)] fn new( state: TenantState, conf: &'static PageServerConf, @@ -2250,6 +2271,7 @@ impl Tenant { tenant_id: TenantId, generation: Generation, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); @@ -2317,6 +2339,7 @@ impl Tenant { gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, + deletion_queue_client, state, cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), @@ -2856,6 +2879,7 @@ impl Tenant { let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { let remote_client = RemoteTimelineClient::new( remote_storage.clone(), + self.deletion_queue_client.clone(), self.conf, self.tenant_id, timeline_id, @@ -2866,7 +2890,10 @@ impl Tenant { None }; - TimelineResources { remote_client } + TimelineResources { + remote_client, + deletion_queue_client: self.deletion_queue_client.clone(), + } } /// Creates intermediate timeline structure and its files. @@ -3322,6 +3349,7 @@ pub mod harness { use utils::logging; use utils::lsn::Lsn; + use crate::deletion_queue::mock::MockDeletionQueue; use crate::{ config::PageServerConf, repository::Key, @@ -3383,6 +3411,7 @@ pub mod harness { pub generation: Generation, pub remote_storage: GenericRemoteStorage, pub remote_fs_dir: PathBuf, + pub deletion_queue: MockDeletionQueue, } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); @@ -3431,6 +3460,7 @@ pub mod harness { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); Ok(Self { conf, @@ -3439,6 +3469,7 @@ pub mod harness { generation: Generation::new(0xdeadbeef), remote_storage, remote_fs_dir, + deletion_queue, }) } @@ -3463,6 +3494,7 @@ pub mod harness { self.tenant_id, self.generation, Some(self.remote_storage.clone()), + self.deletion_queue.new_client(), )); tenant .load(None, ctx) @@ -4193,7 +4225,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::create("test_bulk_insert")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -4240,7 +4273,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::create("test_random_updates")?; + let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 74faee111509..6f3863dd4b2a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,7 +20,10 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::control_plane_client::ControlPlaneClient; +use crate::control_plane_client::{ + ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, +}; +use crate::deletion_queue::DeletionQueueClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; @@ -116,7 +119,23 @@ pub async fn 
init_tenant_mgr( // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { - Some(client.re_attach().await?) + let result = match client.re_attach().await { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + anyhow::bail!("Shut down while waiting for control plane re-attach response") + } + }; + + // The deletion queue needs to know about the startup attachment state to decide which (if any) stored + // deletion list entries may still be valid. We provide that by pushing a recovery operation into + // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions + // are processed, even though we don't block on recovery completing here. + resources + .deletion_queue_client + .recover(result.clone()) + .await?; + + Some(result) } else { info!("Control plane API not configured, tenant generations are disabled"); None @@ -285,29 +304,21 @@ pub(crate) fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); - if let Some(remote_storage) = resources.remote_storage { - match Tenant::spawn_attach( + if resources.remote_storage.is_none() { + warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); + Tenant::create_broken_tenant( conf, tenant_id, - generation, - resources.broker_client, - tenants, - remote_storage, - ctx, - ) { + "attaching mark file present but no remote storage configured".to_string(), + ) + } else { + match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) { Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) } } - } else { - warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant( - conf, - tenant_id, - "attaching mark file present but no remote storage configured".to_string(), - ) } } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); @@ -438,8 +449,7 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, generation: Generation, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -450,13 +460,9 @@ pub async fn create_tenant( // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 - let tenant_resources = TenantSharedResources { - broker_client, - remote_storage, - }; let created_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, - generation, tenant_resources, None, &TENANTS, ctx)?; + generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -622,6 +628,7 @@ pub async fn load_tenant( generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -635,6 +642,7 @@ pub async fn load_tenant( let resources = TenantSharedResources { broker_client, remote_storage, + deletion_queue_client }; let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) .with_context(|| { @@ -702,8 +710,7 @@ pub async fn attach_tenant( tenant_id: TenantId, generation: Generation, tenant_conf: TenantConfOpt, - broker_client: storage_broker::BrokerClientChannel, - remote_storage: GenericRemoteStorage, + resources: TenantSharedResources, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { tenant_map_insert(tenant_id, || async { @@ -718,10 +725,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let resources = TenantSharedResources { - broker_client, - remote_storage: Some(remote_storage), - }; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 6f42b54ac2ae..4e495d9bb2ab 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -116,8 +116,12 @@ //! # Completion //! //! Once an operation has completed, we update -//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates -//! to safekeepers that they can delete the WAL up to that LSN. +//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! and submit a request through the DeletionQueue to update +//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has +//! validated that our generation is not stale. It is this visible value +//! that is advertized to safekeepers as a signal that that they can +//! delete the WAL up to that LSN. //! //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait //! for all pending operations to complete. It does not prevent more @@ -200,7 +204,6 @@ //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! 
[`Timeline::load_layer_map`]: super::Timeline::load_layer_map -mod delete; mod download; pub mod index; mod upload; @@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -324,6 +328,8 @@ pub struct RemoteTimelineClient { metrics: Arc, storage_impl: GenericRemoteStorage, + + deletion_queue_client: DeletionQueueClient, } impl RemoteTimelineClient { @@ -335,6 +341,7 @@ impl RemoteTimelineClient { /// pub fn new( remote_storage: GenericRemoteStorage, + deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, @@ -352,6 +359,7 @@ impl RemoteTimelineClient { timeline_id, generation, storage_impl: remote_storage, + deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), } @@ -413,13 +421,24 @@ impl RemoteTimelineClient { Ok(()) } - pub fn last_uploaded_consistent_lsn(&self) -> Option { - match &*self.upload_queue.lock().unwrap() { + pub fn remote_consistent_lsn_projected(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, - UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn), - UploadQueue::Stopped(q) => { - Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn) - } + UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), + UploadQueue::Stopped(q) => q + .upload_queue_for_deletion + .get_last_remote_consistent_lsn_projected(), + } + } + + pub fn remote_consistent_lsn_visible(&self) -> Option { + match &mut *self.upload_queue.lock().unwrap() { + UploadQueue::Uninitialized => None, + UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), + UploadQueue::Stopped(q) => Some( + q.upload_queue_for_deletion + .get_last_remote_consistent_lsn_visible(), + ), } } @@ -643,7 +662,7 @@ impl RemoteTimelineClient { /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: Vec, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -663,10 +682,10 @@ impl RemoteTimelineClient { // Decorate our list of names with each name's generation, dropping // makes that are unexpectedly missing from our metadata. 
let with_generations: Vec<_> = names - .iter() + .into_iter() .filter_map(|name| { // Remove from latest_files, learning the file's remote generation in the process - let meta = upload_queue.latest_files.remove(name); + let meta = upload_queue.latest_files.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -688,19 +707,17 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue, metadata); } - // schedule the actual deletions - for (name, generation) in with_generations { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: false, - generation, - }); - self.calls_unfinished_metric_begin(&op); - upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {name}"); + for (name, gen) in &with_generations { + info!("scheduling deletion of layer {}{}", name, gen.get_suffix()); } + // schedule the actual deletions + let op = UploadOp::Delete(Delete { + layers: with_generations, + }); + self.calls_unfinished_metric_begin(&op); + upload_queue.queued_operations.push_back(op); + // Launch the tasks immediately, if possible self.launch_queued_tasks(upload_queue); }; @@ -833,9 +850,7 @@ impl RemoteTimelineClient { pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_and_timeline_id(); - let (mut receiver, deletions_queued) = { - let mut deletions_queued = 0; - + let layers: Vec = { let mut locked = self.upload_queue.lock().unwrap(); let stopped = locked.stopped_mut()?; @@ -847,42 +862,30 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .queued_operations - .reserve(stopped.upload_queue_for_deletion.latest_files.len()); - - // schedule the actual deletions - for (name, meta) in &stopped.upload_queue_for_deletion.latest_files { - let op = UploadOp::Delete(Delete { - file_kind: RemoteOpFileKind::Layer, - layer_file_name: name.clone(), - scheduled_from_timeline_delete: true, - generation: meta.generation, - }); - - self.calls_unfinished_metric_begin(&op); - stopped - .upload_queue_for_deletion - .queued_operations - .push_back(op); - - info!("scheduled layer file deletion {name}"); - deletions_queued += 1; - } - - self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion); - - ( - self.schedule_barrier(&mut stopped.upload_queue_for_deletion), - deletions_queued, - ) + .latest_files + .drain() + .map(|(file_name, meta)| { + remote_layer_path( + &self.tenant_id, + &self.timeline_id, + &file_name, + meta.generation, + ) + }) + .collect() }; - receiver.changed().await.context("upload queue shut down")?; + let layer_deletion_count = layers.len(); + self.deletion_queue_client.push_immediate(layers).await?; // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id); + // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // taking the burden of listing all the layers that we already know we should delete. 
+ self.deletion_queue_client.flush_immediate().await?; + let remaining = backoff::retry( || async { self.storage_impl @@ -910,17 +913,9 @@ impl RemoteTimelineClient { }) .collect(); + let not_referenced_count = remaining.len(); if !remaining.is_empty() { - backoff::retry( - || async { self.storage_impl.delete_objects(&remaining).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_objects", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), - ) - .await - .context("delete_objects")?; + self.deletion_queue_client.push_immediate(remaining).await?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { @@ -931,18 +926,14 @@ let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); - debug!("deleting index part"); + debug!("enqueuing index part deletion"); + self.deletion_queue_client + .push_immediate([index_file_path].to_vec()) + .await?; - backoff::retry( - || async { self.storage_impl.delete(&index_file_path).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "delete_index", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("delete_index")?; + // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait + // for a flush to a persistent deletion list so that we may be sure deletion will occur. + self.deletion_queue_client.flush_immediate().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( ))? }); - info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); + info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json"); Ok(()) } @@ -1140,21 +1131,16 @@ impl RemoteTimelineClient { } res } - UploadOp::Delete(delete) => { - let path = &self - .conf - .timeline_path(&self.tenant_id, &self.timeline_id) - .join(delete.layer_file_name.file_name()); - delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - delete.file_kind, - RemoteOpKind::Delete, - Arc::clone(&self.metrics), - ) - .await - } + UploadOp::Delete(delete) => self + .deletion_queue_client + .push_layers( + self.tenant_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)), UploadOp::Barrier(_) => { // unreachable. Barrier operations are handled synchronously in // launch_queued_tasks @@ -1210,18 +1196,12 @@ impl RemoteTimelineClient { } // The task has completed successfully. Remove it from the in-progress list. - { + let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(stopped) => { - // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion) - // then stop() took care of it so we just return. - // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
- match &task.op { - UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion), - _ => None - } + UploadQueue::Stopped(_stopped) => { + None }, UploadQueue::Initialized(qi) => { Some(qi) } }; @@ -1236,23 +1216,51 @@ upload_queue.inprogress_tasks.remove(&task.task_id); - match task.op { + let lsn_update = match task.op { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads -= 1; + None } UploadOp::UploadMetadata(_, lsn) => { upload_queue.num_inprogress_metadata_uploads -= 1; - upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check? + // XXX monotonicity check? + + upload_queue.projected_remote_consistent_lsn = Some(lsn); + if self.generation.is_none() { + // Legacy mode: skip validating generation + upload_queue.visible_remote_consistent_lsn.store(lsn); + None + } else { + Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } } UploadOp::Delete(_) => { upload_queue.num_inprogress_deletions -= 1; + None } UploadOp::Barrier(_) => unreachable!(), }; // Launch any queued tasks that were unblocked by this one. self.launch_queued_tasks(upload_queue); + lsn_update + }; + + if let Some((lsn, slot)) = lsn_update { + // Updates to the remote_consistent_lsn we advertise to safekeepers + // are all routed through the DeletionQueue, to enforce important + // data safety guarantees (see docs/rfcs/025-generation-numbers.md) + self.deletion_queue_client + .update_remote_consistent_lsn( + self.tenant_id, + self.timeline_id, + self.generation, + lsn, + slot, + ) + .await; } + self.calls_unfinished_metric_end(&task.op); } @@ -1278,8 +1286,8 @@ impl RemoteTimelineClient { reason: "metadata uploads are tiny", }, ), - UploadOp::Delete(delete) => ( - delete.file_kind, + UploadOp::Delete(_delete) => ( + RemoteOpFileKind::Layer, RemoteOpKind::Delete, DontTrackSize { reason: "should we track deletes?
positive or negative sign?", @@ -1341,7 +1349,10 @@ impl RemoteTimelineClient { latest_files: initialized.latest_files.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: initialized.latest_metadata.clone(), - last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn, + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: initialized + .visible_remote_consistent_lsn + .clone(), num_inprogress_layer_uploads: 0, num_inprogress_metadata_uploads: 0, num_inprogress_deletions: 0, @@ -1405,13 +1416,13 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, layer_file_name: &LayerFileName, - layer_meta: &LayerFileMetadata, + generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", layer_file_name.file_name(), - layer_meta.generation.get_suffix() + generation.get_suffix() ); RemotePath::from_string(&path).expect("Failed to construct path") @@ -1554,7 +1565,6 @@ mod tests { impl TestSetup { async fn new(test_name: &str) -> anyhow::Result { - // Use a current-thread runtime in the test let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}"))); let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = harness.load().await; @@ -1580,6 +1590,7 @@ mod tests { timeline_id: TIMELINE_ID, generation, storage_impl: self.harness.remote_storage.clone(), + deletion_queue_client: self.harness.deletion_queue.new_client(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &self.harness.tenant_id, @@ -1749,7 +1760,7 @@ mod tests { ) .unwrap(); client - .schedule_layer_file_deletion(&[layer_file_name_1.clone()]) + .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec()) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -1775,6 +1786,7 @@ mod tests { // Finish them client.wait_completion().await.unwrap(); + harness.deletion_queue.pump().await; assert_remote_files( &[ diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs deleted file mode 100644 index 7324559223d6..000000000000 --- a/pageserver/src/tenant/remote_timeline_client/delete.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! Helper functions to delete files from remote storage with a RemoteStorage -use anyhow::Context; -use std::path::Path; -use tracing::debug; - -use remote_storage::GenericRemoteStorage; - -use crate::{ - config::PageServerConf, - tenant::{remote_timeline_client::remote_path, Generation}, -}; - -pub(super) async fn delete_layer<'a>( - conf: &'static PageServerConf, - storage: &'a GenericRemoteStorage, - local_layer_path: &'a Path, - generation: Generation, -) -> anyhow::Result<()> { - fail::fail_point!("before-delete-layer", |_| { - anyhow::bail!("failpoint before-delete-layer") - }); - debug!("Deleting layer from remote storage: {local_layer_path:?}",); - - let path_to_delete = remote_path(conf, local_layer_path, generation)?; - - // We don't want to print an error if the delete failed if the file has - // already been deleted. Thankfully, in this situation S3 already - // does not yield an error. While OS-provided local file system APIs do yield - // errors, we avoid them in the `LocalFs` wrapper. 
- storage - .delete(&path_to_delete) - .await - .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}")) -} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 986321552906..5c173c613ff8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>( .timeline_path(&tenant_id, &timeline_id) .join(layer_file_name.file_name()); - let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata); + let remote_path = remote_layer_path( + &tenant_id, + &timeline_id, + layer_file_name, + layer_metadata.generation, + ); // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 78ac1338db37..4fa5039d7946 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime}; use crate::context::{ AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, }; +use crate::deletion_queue::DeletionQueueClient; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ @@ -143,6 +144,7 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: Option, + pub deletion_queue_client: DeletionQueueClient, } pub struct Timeline { @@ -521,9 +523,23 @@ impl Timeline { self.disk_consistent_lsn.load() } - pub fn get_remote_consistent_lsn(&self) -> Option { + /// remote_consistent_lsn from the perspective of the tenant's current generation, + /// not validated with control plane yet. + /// See [`Self::get_remote_consistent_lsn_visible`]. + pub fn get_remote_consistent_lsn_projected(&self) -> Option { if let Some(remote_client) = &self.remote_client { - remote_client.last_uploaded_consistent_lsn() + remote_client.remote_consistent_lsn_projected() + } else { + None + } + } + + /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, + /// i.e. a value of remote_consistent_lsn_projected which has undergone + /// generation validation in the deletion queue. + pub fn get_remote_consistent_lsn_visible(&self) -> Option { + if let Some(remote_client) = &self.remote_client { + remote_client.remote_consistent_lsn_visible() } else { None } @@ -1820,7 +1836,7 @@ impl Timeline { for (layer, m) in needs_upload { rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?; } - rtc.schedule_layer_file_deletion(&needs_cleanup)?; + rtc.schedule_layer_file_deletion(needs_cleanup)?; rtc.schedule_index_upload_for_file_changes()?; // Tenant::create_timeline will wait for these uploads to happen before returning, or // on retry. 
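A recurring detail in the hunks above and below is that remote layer keys are now generation-aware: `remote_layer_path` takes a `Generation` and appends its suffix to the layer file name, and `download_layer_file` passes `layer_metadata.generation` through so the right object is fetched. The sketch below shows what that key shape looks like; the exact suffix format (empty for legacy objects, `-` plus eight hex digits otherwise) is inferred from `get_suffix` usage and the `parse_generation_suffix` helper in the new test file further down, so treat it as an assumption rather than the canonical implementation:

```rust
// Sketch of generation-aware remote layer keys. `generation_suffix` and
// `remote_layer_key` are hypothetical helpers for illustration only.
fn generation_suffix(generation: Option<u32>) -> String {
    match generation {
        None => String::new(),          // legacy object: no suffix
        Some(n) => format!("-{n:08x}"), // e.g. "-00000005"
    }
}

fn remote_layer_key(
    tenant_id: &str,
    timeline_id: &str,
    layer_file_name: &str,
    generation: Option<u32>,
) -> String {
    format!(
        "tenants/{tenant_id}/timelines/{timeline_id}/{layer_file_name}{}",
        generation_suffix(generation)
    )
}

fn main() {
    let legacy = remote_layer_key("tenant", "timeline", "layerfile", None);
    let suffixed = remote_layer_key("tenant", "timeline", "layerfile", Some(5));
    assert_eq!(legacy, "tenants/tenant/timelines/timeline/layerfile");
    assert_eq!(suffixed, "tenants/tenant/timelines/timeline/layerfile-00000005");
}
```

The suffix lets the same layer name exist under several generations side by side, which is what allows a newly attached pageserver to upload its own copies without clobbering objects a stale attachment might still reference.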
@@ -3875,7 +3891,7 @@ impl Timeline { // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } Ok(()) @@ -4210,7 +4226,7 @@ impl Timeline { } if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; + remote_client.schedule_layer_file_deletion(layer_names_to_delete)?; } apply.flush(); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 18588cf0fd48..7d55388f44b7 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,6 +14,7 @@ use utils::{ use crate::{ config::PageServerConf, + deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, @@ -407,6 +408,7 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, local_metadata: &TimelineMetadata, remote_client: Option, + deletion_queue_client: DeletionQueueClient, init_order: Option<&InitializationOrder>, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. @@ -416,7 +418,10 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { remote_client }, + TimelineResources { + remote_client, + deletion_queue_client, + }, init_order, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7d1e9b4a39e3..0831b9cedaa5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection( })?; if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = - timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let timeline_remote_consistent_lsn = timeline + .get_remote_consistent_lsn_visible() + .unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. let last_received_lsn = last_lsn; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 28822335b098..08b1cb8866e8 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,5 +1,3 @@ -use crate::metrics::RemoteOpFileKind; - use super::storage_layer::LayerFileName; use super::Generation; use crate::tenant::metadata::TimelineMetadata; @@ -11,6 +9,7 @@ use std::fmt::Debug; use chrono::NaiveDateTime; use std::sync::Arc; use tracing::info; +use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; @@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized { /// uploaded. `Lsn(0)` if nothing was uploaded yet. /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. /// Safekeeper can rely on it to make decisions for WAL storage. 
- pub(crate) last_uploaded_consistent_lsn: Lsn, + /// + /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// the control plane (unless a timeline's generation is None, in which case + /// we skip validation) + pub(crate) projected_remote_consistent_lsn: Option, + pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress pub(crate) num_inprogress_layer_uploads: usize, @@ -81,6 +85,14 @@ impl UploadQueueInitialized { pub(super) fn no_pending_work(&self) -> bool { self.inprogress_tasks.is_empty() && self.queued_operations.is_empty() } + + pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn { + self.visible_remote_consistent_lsn.load() + } + + pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { + self.projected_remote_consistent_lsn + } } #[derive(Clone, Copy)] @@ -114,9 +126,8 @@ impl UploadQueue { latest_files: HashMap::new(), latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. - last_uploaded_consistent_lsn: Lsn(0), + projected_remote_consistent_lsn: None, + visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -158,7 +169,10 @@ impl UploadQueue { latest_files: files, latest_files_changes_since_metadata_upload_scheduled: 0, latest_metadata: index_part.metadata.clone(), - last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(), + projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), + visible_remote_consistent_lsn: Arc::new( + index_part.metadata.disk_consistent_lsn().into(), + ), // what follows are boring default initializations task_counter: 0, num_inprogress_layer_uploads: 0, @@ -201,12 +215,11 @@ pub(crate) struct UploadTask { pub(crate) op: UploadOp, } +/// A deletion of some layers within the lifetime of a timeline. This is not used +/// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) file_kind: RemoteOpFileKind, - pub(crate) layer_file_name: LayerFileName, - pub(crate) scheduled_from_timeline_delete: bool, - pub(crate) generation: Generation, + pub(crate) layers: Vec<(LayerFileName, Generation)>, } #[derive(Debug)] @@ -217,7 +230,7 @@ pub(crate) enum UploadOp { /// Upload the metadata file UploadMetadata(IndexPart, Lsn), - /// Delete a layer file + /// Delete layer files Delete(Delete), /// Barrier.
When the barrier operation is reached, @@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp { UploadOp::UploadMetadata(_, lsn) => { write!(f, "UploadMetadata(lsn: {})", lsn) } - UploadOp::Delete(delete) => write!( - f, - "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})", - delete.layer_file_name.file_name(), - delete.scheduled_from_timeline_delete, - delete.generation - ), + UploadOp::Delete(delete) => { + write!(f, "Delete({} layers)", delete.layers.len(),) + } UploadOp::Barrier(_) => write!(f, "Barrier"), } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0667403ba31e..38d0aeb96026 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1481,6 +1481,16 @@ def stop(self, immediate: bool = False) -> "NeonAttachmentService": self.running = False return self + def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int: + response = requests.post( + f"{self.env.control_plane_api}/attach_hook", + json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id}, + ) + response.raise_for_status() + gen = response.json()["gen"] + assert isinstance(gen, int) + return gen + def __enter__(self) -> "NeonAttachmentService": return self @@ -1689,12 +1699,7 @@ def tenant_attach( to call into the pageserver HTTP client. """ if self.env.attachment_service is not None: - response = requests.post( - f"{self.env.control_plane_api}/attach_hook", - json={"tenant_id": str(tenant_id), "pageserver_id": self.id}, - ) - response.raise_for_status() - generation = response.json()["gen"] + generation = self.env.attachment_service.attach_hook(tenant_id, self.id) else: generation = None diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 9373073abf2c..9fdcd22bc25c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -620,3 +620,8 @@ def post_tracing_event(self, level: str, message: str): }, ) self.verbose_error(res) + + def deletion_queue_flush(self, execute: bool = False): + self.put( + f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" + ).raise_for_status() diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 2e5d75a0fcd0..70c2a06a0745 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional def list_prefix( - neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None + neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. @@ -287,7 +287,7 @@ def list_prefix( # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. response = remote.client.list_objects_v2( - Delimiter="/", + Delimiter=delimiter, Bucket=remote.bucket_name, Prefix=prefix, ) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py new file mode 100644 index 000000000000..81d38ac93442 --- /dev/null +++ b/test_runner/regress/test_pageserver_generations.py @@ -0,0 +1,352 @@ +""" + +Tests in this module exercise the pageserver's behavior around generation numbers, +as defined in docs/rfcs/025-generation-numbers.md. 
Briefly, the behaviors we require +of the pageserver are: +- Do not start a tenant without a generation number if control_plane_api is set +- Remote objects must be suffixed with generation +- Deletions may only be executed after validating generation +- Updates to remote_consistent_lsn may only be made visible after validating generation +""" + + +import re +import time +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import list_prefix +from fixtures.remote_storage import ( + RemoteStorageKind, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import print_gc_result, wait_until + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. 
+ """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 20000) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + +def get_metric_or_0(ps_http, metric: str) -> int: + v = ps_http.get_metric_value(metric) + return 0 if v is None else int(v) + + +def get_deletion_queue_executed(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total") + + +def get_deletion_queue_submitted(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total") + + +def get_deletion_queue_dropped(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total") + + +def get_deletion_queue_unexpected_errors(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total") + + +def get_deletion_queue_dropped_lsn_updates(ps_http) -> int: + return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total") + + +def get_deletion_queue_depth(ps_http) -> int: + """ + Queue depth if at least one deletion has been submitted, else None + """ + submitted = get_deletion_queue_submitted(ps_http) + executed = get_deletion_queue_executed(ps_http) + dropped = get_deletion_queue_dropped(ps_http) + depth = submitted - executed - dropped + log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})") + + assert depth >= 0 + return int(depth) + + +def assert_deletion_queue(ps_http, size_fn) -> None: + v = get_deletion_queue_depth(ps_http) + assert v is not None + assert size_fn(v) is True + + +def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): + """ + Validate behavior when a pageserver is run without generation support enabled, + then started again after activating it: + - Before upgrade, no objects should have generation suffixes + - After upgrade, the bucket should contain a mixture. + - In both cases, postgres I/O should work. 
+ """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + + env = neon_env_builder.init_configs() + env.broker.try_start() + for sk in env.safekeepers: + sk.start() + assert env.attachment_service is not None + env.attachment_service.start() + + env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + + env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline + ) + generate_uploads_and_deletions(env) + + def parse_generation_suffix(key): + m = re.match(".+-([0-9a-zA-Z]{8})$", key) + if m is None: + return None + else: + log.info(f"match: {m}") + log.info(f"group: {m.group(1)}") + return int(m.group(1), 16) + + pre_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in pre_upgrade_keys: + assert parse_generation_suffix(key) is None + + env.pageserver.stop() + + # Starting without the override that disabled control_plane_api + env.pageserver.start() + + generate_uploads_and_deletions(env, init=False) + + legacy_objects: list[str] = [] + suffixed_objects = [] + post_upgrade_keys = list( + [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + ) + for key in post_upgrade_keys: + log.info(f"post-upgrade key: {key}") + if parse_generation_suffix(key) is not None: + suffixed_objects.append(key) + else: + legacy_objects.append(key) + + # Bucket now contains a mixture of suffixed and non-suffixed objects + assert len(suffixed_objects) > 0 + assert len(legacy_objects) > 0 + + assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 + + +def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + some_other_pageserver = 1234 + ps_http = env.pageserver.http_client() + + generate_uploads_and_deletions(env) + + # Flush: pending deletions should all complete + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush(execute=True) + assert_deletion_queue(ps_http, lambda n: n == 0) + assert get_deletion_queue_dropped(ps_http) == 0 + + # Our visible remote_consistent_lsn should match projected + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Now advance the generation in the control plane: subsequent validations + # from the running pageserver will fail. No more deletions should happen. 
+ env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + generate_uploads_and_deletions(env, init=False) + + assert_deletion_queue(ps_http, lambda n: n > 0) + queue_depth_before = get_deletion_queue_depth(ps_http) + executed_before = get_deletion_queue_executed(ps_http) + ps_http.deletion_queue_flush(execute=True) + + # Queue drains to zero because we dropped deletions + assert_deletion_queue(ps_http, lambda n: n == 0) + # The executed counter has not incremented + assert get_deletion_queue_executed(ps_http) == executed_before + # The dropped counter has incremented to consume all of the deletions that were previously enqueued + assert get_deletion_queue_dropped(ps_http) == queue_depth_before + + # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot + # because generation validation fails. + timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"] + assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0 + + # TODO: list bucket and confirm all objects have a generation suffix. + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + + +@pytest.mark.parametrize("keep_attachment", [True, False]) +def test_deletion_queue_recovery( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool +): + """ + :param keep_attachment: If true, we re-attach after restart. Else, we act as if some other + node took the attachment while we were restarting. + """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + ps_http = env.pageserver.http_client() + + # Prevent deletion lists from being executed, to build up some backlog of deletions + ps_http.configure_failpoints( + [ + ("deletion-queue-before-execute", "return"), + ] + ) + + generate_uploads_and_deletions(env) + + # There should be entries in the deletion queue + assert_deletion_queue(ps_http, lambda n: n > 0) + ps_http.deletion_queue_flush() + before_restart_depth = get_deletion_queue_depth(ps_http) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") + env.pageserver.stop(immediate=True) + + if not keep_attachment: + some_other_pageserver = 101010 + assert env.attachment_service is not None + env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + + env.pageserver.start() + + def assert_deletions_submitted(n: int): + assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n + + # After restart, issue a flush to kick the deletion frontend to do recovery. + # It should recover all the operations we submitted before the restart. + ps_http.deletion_queue_flush(execute=False) + wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + + # The queue should drain through completely if we flush it + ps_http.deletion_queue_flush(execute=True) + wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + + if keep_attachment: + # If we kept the attachment, then our pre-restart deletions should have executed + # successfully + assert get_deletion_queue_executed(ps_http) == before_restart_depth + else: + # If we lost the attachment, we should have dropped our pre-restart deletions. 
+ assert get_deletion_queue_dropped(ps_http) == before_restart_depth + env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 + + # Restart again + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # No deletion lists should be recovered: this demonstrates that deletion lists + # were cleaned up after being executed or dropped in the previous process lifetime. + time.sleep(1) + assert_deletion_queue(ps_http, lambda n: n == 0) + + assert get_deletion_queue_unexpected_errors(ps_http) == 0 + assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c6ddb54ee6c1..9d0d42a4ef63 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -43,6 +43,12 @@ def test_tenant_delete_smoke( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) # lucky race with stopping from flushing a layer we fail to schedule any uploads env.pageserver.allowed_errors.append( @@ -195,6 +201,14 @@ def test_delete_tenant_exercise_crash_safety_failpoints( ] ) + if simulate_failures: + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + ] + ) + ps_http = env.pageserver.http_client() timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) @@ -383,6 +397,7 @@ def test_tenant_delete_is_resumed_on_attach( assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): + ps_http.deletion_queue_flush(execute=True) assert_prefix_empty( neon_env_builder, prefix="/".join( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 0e4df21d83ca..839df69240de 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -807,6 +807,8 @@ def test_delete_orphaned_objects( reason = timeline_info["state"]["Broken"]["reason"] assert reason.endswith(f"failpoint: {failpoint}"), reason + ps_http.deletion_queue_flush(execute=True) + for orphan in orphans: assert not orphan.exists() assert env.pageserver.log_contains(