From e839fd153e37de7ba6e1383c1f93ca1f85e96ca6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Aug 2023 14:16:53 +0100 Subject: [PATCH 01/44] libs: add Generation::previous --- libs/utils/src/generation.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 87c636125593..53c303520964 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -64,6 +64,14 @@ impl Generation { } } } + + pub fn previous(&self) -> Generation { + match self { + Self::Valid(n) => Self::Valid(n - 1), + Self::None => Self::None, + Self::Broken => panic!("Attempted to use a broken generation"), + } + } } impl Serialize for Generation { From 1cedf33ed6148411418b6c05e9c94cea049b7a7b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Aug 2023 13:02:53 +0100 Subject: [PATCH 02/44] pageserver_api: add types for control plane calls --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/control_api.rs | 91 ++++++++++++++++++++++++++ libs/pageserver_api/src/lib.rs | 1 + 4 files changed, 94 insertions(+) create mode 100644 libs/pageserver_api/src/control_api.rs diff --git a/Cargo.lock b/Cargo.lock index 867008808be8..8198fe442744 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2705,6 +2705,7 @@ dependencies = [ "bytes", "const_format", "enum-map", + "hex", "postgres_ffi", "serde", "serde_json", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index f97ec54e9145..7fdd46eede23 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -12,6 +12,7 @@ const_format.workspace = true anyhow.workspace = true bytes.workspace = true byteorder.workspace = true +hex.workspace = true utils.workspace = true postgres_ffi.workspace = true enum-map.workspace = true diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs new file mode 100644 index 000000000000..4bf4ddc70b66 --- /dev/null +++ b/libs/pageserver_api/src/control_api.rs @@ -0,0 +1,91 @@ +use hex::FromHex; +use serde::{Deserialize, Serialize}; +use utils::id::{NodeId, TenantId}; + +/// Types in this file are for pageserver's upward-facing API calls to the control plane, +/// required for acquiring and validating tenant generation numbers. +/// +/// See docs/rfcs/025-generation-numbers.md + +#[derive(Serialize, Deserialize)] +struct ReAttachRequest { + node_id: NodeId, +} + +#[derive(Serialize, Deserialize)] +pub struct ReAttachResponseTenant { + pub id: HexTenantId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +struct ReAttachResponse { + tenants: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct ValidateRequestTenant { + pub id: HexTenantId, + pub gen: u32, +} + +#[derive(Serialize, Deserialize)] +struct ValidateRequest { + tenants: Vec, +} + +#[derive(Serialize, Deserialize)] +struct ValidateResponse { + tenants: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct ValidateResponseTenant { + pub id: HexTenantId, + pub valid: bool, +} + +/// Serialization helper: TenantId's serialization is an array of u8, which is rather unfriendly +/// for human readable encodings like JSON. +/// This class wraps it in serialization that is just the hex string representation, +/// which is more compact and readable in JSON. +#[derive(Eq, PartialEq, Clone, Hash)] +pub struct HexTenantId(TenantId); + +impl HexTenantId { + pub fn new(t: TenantId) -> Self { + Self(t) + } + + pub fn take(self) -> TenantId { + self.0 + } +} + +impl AsRef for HexTenantId { + fn as_ref(&self) -> &TenantId { + &self.0 + } +} + +impl Serialize for HexTenantId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let hex = self.0.hex_encode(); + serializer.collect_str(&hex) + } +} + +impl<'de> Deserialize<'de> for HexTenantId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let string = String::deserialize(deserializer)?; + TenantId::from_hex(string) + .map(HexTenantId::new) + .map_err(|e| serde::de::Error::custom(format!("{e}"))) + } +} diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 4890d54f3697..d844021785d1 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,7 @@ use const_format::formatcp; /// Public API types +pub mod control_api; pub mod models; pub mod reltag; From bf4572f22006917d3cc199846223710b83d5de39 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 24 Aug 2023 17:48:00 +0100 Subject: [PATCH 03/44] pageserver: optional generation input during create & attach --- control_plane/src/bin/neon_local.rs | 12 +++++++++++- control_plane/src/pageserver.rs | 2 ++ libs/pageserver_api/src/models.rs | 16 ++++++---------- pageserver/src/http/openapi_spec.yml | 4 +++- pageserver/src/http/routes.rs | 28 ++++++++++++++++++++++++++-- pageserver/src/tenant/mgr.rs | 17 +++++++++++++---- 6 files changed, 61 insertions(+), 18 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ef308cb2d285..2af79bed905a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -278,6 +278,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { .context("Failed to parse tenant id from the argument string") } +fn parse_generation(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .get_one::("generation") + .map(|tenant_id| u32::from_str(tenant_id)) + .transpose() + .context("Failed to parse generation rom the argument string") +} + fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .get_one::("timeline-id") @@ -343,11 +351,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an } Some(("create", create_match)) => { let initial_tenant_id = parse_tenant_id(create_match)?; + let generation = parse_generation(create_match)?; let tenant_conf: HashMap<_, _> = create_match .get_many::("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + let new_tenant_id = + pageserver.tenant_create(initial_tenant_id, generation, tenant_conf)?; println!("tenant {new_tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2ff09021e536..c24ea7f7170e 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -317,6 +317,7 @@ impl PageServerNode { pub fn tenant_create( &self, new_tenant_id: Option, + generation: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { let mut settings = settings.clone(); @@ -387,6 +388,7 @@ impl PageServerNode { let request = models::TenantCreateRequest { new_tenant_id, + generation, config, }; if !settings.is_empty() { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2f4c21326ee0..5f92556f6408 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -194,6 +194,9 @@ pub struct TimelineCreateRequest { pub struct TenantCreateRequest { #[serde_as(as = "DisplayFromStr")] pub new_tenant_id: TenantId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -241,15 +244,6 @@ pub struct StatusResponse { pub id: NodeId, } -impl TenantCreateRequest { - pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest { - TenantCreateRequest { - new_tenant_id, - config: TenantConfig::default(), - } - } -} - #[serde_as] #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -293,9 +287,11 @@ impl TenantConfigRequest { } } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct TenantAttachRequest { pub config: TenantAttachConfig, + #[serde(default)] + pub generation: Option, } /// Newtype to enforce deny_unknown_fields on TenantConfig for diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 38e07f172d3d..4988641d6a73 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -383,7 +383,6 @@ paths: schema: type: string format: hex - post: description: | Schedules attach operation to happen in the background for the given tenant. @@ -1020,6 +1019,9 @@ components: properties: config: $ref: '#/components/schemas/TenantConfig' + generation: + type: integer + description: Attachment generation number. TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f86657fa77a1..dc4abe4475e3 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -32,11 +32,13 @@ use crate::tenant::mgr::{ }; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +use crate::tenant::timeline::Timeline; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ auth::JwtAuth, + generation::Generation, http::{ endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, error::{ApiError, HttpErrorBody}, @@ -472,7 +474,7 @@ async fn tenant_attach_handler( check_permission(&request, Some(tenant_id))?; let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match maybe_body { + let tenant_conf = match &maybe_body { Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, None => TenantConfOpt::default(), }; @@ -483,10 +485,25 @@ async fn tenant_attach_handler( let state = get_state(&request); + let generation = if state.conf.control_plane_api.is_some() { + // If we have been configured with a control plane URI, then generations are + // mandatory, as we will attempt to re-attach on startup. + maybe_body + .as_ref() + .and_then(|tar| tar.generation) + .map(Generation::new) + .ok_or(ApiError::BadRequest(anyhow!( + "generation attribute missing" + )))? + } else { + Generation::placeholder() + }; + if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant( state.conf, tenant_id, + generation, tenant_conf, state.broker_client.clone(), remote_storage.clone(), @@ -867,6 +884,12 @@ async fn tenant_create_handler( let tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; + // TODO: make generation mandatory here once control plane supports it. + let generation = request_data + .generation + .map(Generation::new) + .unwrap_or(Generation::none()); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); @@ -875,6 +898,7 @@ async fn tenant_create_handler( state.conf, tenant_conf, target_tenant_id, + generation, state.broker_client.clone(), state.remote_storage.clone(), &ctx, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 87617b544cf4..38c87a67a9f4 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -126,6 +126,9 @@ pub async fn init_tenant_mgr( match schedule_local_tenant_processing( conf, &tenant_dir_path, + // TODO: call into control plane, don't start Tenants + // purely because they are on disk + Generation::none(), resources.clone(), Some(init_order.clone()), &TENANTS, @@ -162,6 +165,7 @@ pub async fn init_tenant_mgr( pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, + generation: Generation, resources: TenantSharedResources, init_order: Option, tenants: &'static tokio::sync::RwLock, @@ -203,7 +207,7 @@ pub(crate) fn schedule_local_tenant_processing( match Tenant::spawn_attach( conf, tenant_id, - Generation::none(), + generation, resources.broker_client, tenants, remote_storage, @@ -357,6 +361,7 @@ pub async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, @@ -374,7 +379,8 @@ pub async fn create_tenant( remote_storage, }; let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, + generation, tenant_resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -537,7 +543,9 @@ pub async fn load_tenant( broker_client, remote_storage, }; - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx) + // TODO: remove the `/load` API once generation support is complete: + // it becomes equivalent to attaching. + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -601,6 +609,7 @@ pub async fn list_tenants() -> Result, TenantMapLis pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + generation: Generation, tenant_conf: TenantConfOpt, broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, @@ -622,7 +631,7 @@ pub async fn attach_tenant( broker_client, remote_storage: Some(remote_storage), }; - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 From 233afac4c49e1fdcb1b873e16921aac2b5f13029 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Aug 2023 12:55:05 +0100 Subject: [PATCH 04/44] pageserver: add control_plane_api conf --- pageserver/src/config.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5394f17398e7..abef0765fad9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -204,6 +204,8 @@ pub struct PageServerConf { /// has it's initial logical size calculated. Not running background tasks for some seconds is /// not terrible. pub background_task_maximum_delay: Duration, + + pub control_plane_api: Option, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -278,6 +280,8 @@ struct PageServerConfigBuilder { ondemand_download_behavior_treat_error_as_warn: BuilderValue, background_task_maximum_delay: BuilderValue, + + control_plane_api: BuilderValue>, } impl Default for PageServerConfigBuilder { @@ -340,6 +344,8 @@ impl Default for PageServerConfigBuilder { DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, ) .unwrap()), + + control_plane_api: Set(None), } } } @@ -468,6 +474,10 @@ impl PageServerConfigBuilder { self.background_task_maximum_delay = BuilderValue::Set(delay); } + pub fn control_plane_api(&mut self, api: Url) { + self.control_plane_api = BuilderValue::Set(Some(api)) + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries @@ -553,6 +563,9 @@ impl PageServerConfigBuilder { background_task_maximum_delay: self .background_task_maximum_delay .ok_or(anyhow!("missing background_task_maximum_delay"))?, + control_plane_api: self + .control_plane_api + .ok_or(anyhow!("missing control_plane_api"))?, }) } } @@ -741,6 +754,7 @@ impl PageServerConf { }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), + "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -909,6 +923,7 @@ impl PageServerConf { test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, + control_plane_api: None, } } } @@ -1132,6 +1147,7 @@ background_task_maximum_delay = '334 s' background_task_maximum_delay: humantime::parse_duration( defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY )?, + control_plane_api: None }, "Correct defaults should be used when no config values are provided" ); @@ -1187,6 +1203,7 @@ background_task_maximum_delay = '334 s' test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::from_secs(334), + control_plane_api: None }, "Should be able to parse all basic config values correctly" ); From ac8ef22c4082fdc3117d311eeb240b2806965b97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Aug 2023 13:17:10 +0100 Subject: [PATCH 05/44] pageserver: call into control plane on startup --- libs/pageserver_api/src/control_api.rs | 16 +-- pageserver/src/tenant/mgr.rs | 143 +++++++++++++++++++++---- 2 files changed, 129 insertions(+), 30 deletions(-) diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 4bf4ddc70b66..1f663034f597 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -8,8 +8,8 @@ use utils::id::{NodeId, TenantId}; /// See docs/rfcs/025-generation-numbers.md #[derive(Serialize, Deserialize)] -struct ReAttachRequest { - node_id: NodeId, +pub struct ReAttachRequest { + pub node_id: NodeId, } #[derive(Serialize, Deserialize)] @@ -19,8 +19,8 @@ pub struct ReAttachResponseTenant { } #[derive(Serialize, Deserialize)] -struct ReAttachResponse { - tenants: Vec, +pub struct ReAttachResponse { + pub tenants: Vec, } #[derive(Serialize, Deserialize)] @@ -30,13 +30,13 @@ pub struct ValidateRequestTenant { } #[derive(Serialize, Deserialize)] -struct ValidateRequest { - tenants: Vec, +pub struct ValidateRequest { + pub tenants: Vec, } #[derive(Serialize, Deserialize)] -struct ValidateResponse { - tenants: Vec, +pub struct ValidateResponse { + pub tenants: Vec, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 38c87a67a9f4..c506e1926e9f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,10 +1,13 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. +use hyper::StatusCode; +use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse}; use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; use std::path::Path; use std::sync::Arc; +use std::time::Duration; use tokio::fs; use anyhow::Context; @@ -76,6 +79,75 @@ pub async fn init_tenant_mgr( let mut tenants = HashMap::new(); + // If we are configured to use the control plane API, then it is the source of truth for what to attach + let tenant_generations = conf + .control_plane_api + .as_ref() + .map(|control_plane_api| async { + let client = reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"); + + // FIXME: it's awkward that join() requires the base to have a trailing slash, makes + // it easy to get a config wrong + assert!( + control_plane_api.as_str().ends_with('/'), + "control plane API needs trailing slash" + ); + + let re_attach_path = control_plane_api + .join("re-attach") + .expect("Failed to build re-attach path"); + let request = ReAttachRequest { node_id: conf.id }; + + // TODO: we should have been passed a cancellation token, and use it to end + // this loop gracefully + loop { + let response = match client + .post(re_attach_path.clone()) + .json(&request) + .send() + .await + { + Err(e) => Err(anyhow::Error::from(e)), + Ok(r) => { + if r.status() == StatusCode::OK { + r.json::() + .await + .map_err(anyhow::Error::from) + } else { + Err(anyhow::anyhow!("Unexpected status {}", r.status())) + } + } + }; + + match response { + Ok(res) => { + tracing::info!( + "Received re-attach response with {0} tenants", + res.tenants.len() + ); + + // TODO: do something with it + break res + .tenants + .into_iter() + .map(|t| (t.id, t.generation)) + .collect::>(); + } + Err(e) => { + tracing::error!("Error re-attaching tenants, retrying: {e:#}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + }); + + let tenant_generations = match tenant_generations { + Some(g) => Some(g.await), + None => None, + }; + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; @@ -123,12 +195,53 @@ pub async fn init_tenant_mgr( continue; } + let tenant_id = match tenant_dir_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): {0}", + tenant_dir_path.display() + ); + continue; + } + }; + + let generation = if let Some(generations) = &tenant_generations { + // We have a generation map: treat it as the authority for whether + // this tenant is really attached. + if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) { + Generation::new(*gen) + } else { + info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id); + if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { + error!( + "Failed to remove detached tenant directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + continue; + } + } else { + // Legacy mode: no generation information, any tenant present + // on local disk may activate + info!( + "Starting tenant {0} in legacy mode, no generation", + tenant_dir_path.display() + ); + Generation::none() + }; + match schedule_local_tenant_processing( conf, + tenant_id, &tenant_dir_path, - // TODO: call into control plane, don't start Tenants - // purely because they are on disk - Generation::none(), + generation, resources.clone(), Some(init_order.clone()), &TENANTS, @@ -164,6 +277,7 @@ pub async fn init_tenant_mgr( pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, + tenant_id: TenantId, tenant_path: &Path, generation: Generation, resources: TenantSharedResources, @@ -186,15 +300,6 @@ pub(crate) fn schedule_local_tenant_processing( "Cannot load tenant from empty directory {tenant_path:?}" ); - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}") - })?; - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); anyhow::ensure!( !conf.tenant_ignore_mark_file_path(&tenant_id).exists(), @@ -231,13 +336,7 @@ pub(crate) fn schedule_local_tenant_processing( info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. Tenant::spawn_load( - conf, - tenant_id, - Generation::none(), - resources, - init_order, - tenants, - ctx, + conf, tenant_id, generation, resources, init_order, tenants, ctx, ) }; Ok(tenant) @@ -379,7 +478,7 @@ pub async fn create_tenant( remote_storage, }; let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, + schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, generation, tenant_resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -545,7 +644,7 @@ pub async fn load_tenant( }; // TODO: remove the `/load` API once generation support is complete: // it becomes equivalent to attaching. - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx) + let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -631,7 +730,7 @@ pub async fn attach_tenant( broker_client, remote_storage: Some(remote_storage), }; - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 From 44e61252dd1fde93c018e3cd6cec5f5175eb47cf Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Aug 2023 09:34:44 +0100 Subject: [PATCH 06/44] pageserver: if control plane API is disabled, ignore generations --- pageserver/src/tenant/mgr.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c506e1926e9f..b3fea5607d1c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -145,7 +145,10 @@ pub async fn init_tenant_mgr( let tenant_generations = match tenant_generations { Some(g) => Some(g.await), - None => None, + None => { + info!("Control plane API not configured, tenant generations are disabled"); + None + } }; let mut dir_entries = fs::read_dir(&tenants_dir) From 38d5dec27ac8074bc76ed0a4929b5a05843fde92 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Aug 2023 10:20:22 +0100 Subject: [PATCH 07/44] pageserver: generation-aware index_part.json loading --- pageserver/src/http/routes.rs | 2 +- pageserver/src/tenant/metadata.rs | 17 +++ .../src/tenant/remote_timeline_client.rs | 106 +++++++++++++++++- .../tenant/remote_timeline_client/download.rs | 106 +++++++++++++++++- 4 files changed, 225 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dc4abe4475e3..c1dfe435e499 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -496,7 +496,7 @@ async fn tenant_attach_handler( "generation attribute missing" )))? } else { - Generation::placeholder() + Generation::none() }; if let Some(remote_storage) = &state.remote_storage { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index dbf2d5ac37dd..b77ae89398ab 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -230,6 +230,23 @@ impl TimelineMetadata { pub fn pg_version(&self) -> u32 { self.body.pg_version } + + // Checksums make it awkward to build a valid instance by hand. This helper + // provides a TimelineMetadata with a valid checksum in its header. + #[cfg(test)] + pub fn example() -> Self { + let instance = Self::new( + "0/16960E8".parse::().unwrap(), + None, + None, + Lsn::from_hex("00000000").unwrap(), + Lsn::from_hex("00000000").unwrap(), + Lsn::from_hex("00000000").unwrap(), + 0, + ); + let bytes = instance.to_bytes().unwrap(); + Self::from_bytes(&bytes).unwrap() + } } impl<'de> Deserialize<'de> for TimelineMetadata { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 50bb8b43dee9..aeca0558d16f 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1526,6 +1526,7 @@ mod tests { tenant_ctx: RequestContext, remote_fs_dir: PathBuf, client: Arc, + storage: GenericRemoteStorage, } impl TestSetup { @@ -1566,7 +1567,7 @@ mod tests { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, generation, - storage_impl: storage, + storage_impl: storage.clone(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &harness.tenant_id, @@ -1581,6 +1582,24 @@ mod tests { tenant_ctx: ctx, remote_fs_dir, client, + storage, + }) + } + + /// Construct a RemoteTimelineClient in an arbitrary generation + fn build_client(&self, generation: Generation) -> Arc { + Arc::new(RemoteTimelineClient { + conf: self.harness.conf, + runtime: tokio::runtime::Handle::current(), + tenant_id: self.harness.tenant_id, + timeline_id: TIMELINE_ID, + generation, + storage_impl: self.storage.clone(), + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &self.harness.tenant_id, + &TIMELINE_ID, + )), }) } } @@ -1609,6 +1628,7 @@ mod tests { tenant_ctx: _tenant_ctx, remote_fs_dir, client, + .. } = TestSetup::new("upload_scheduling").await.unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -1847,4 +1867,88 @@ mod tests { } ); } + + async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { + // An empty IndexPart, just sufficient to ensure deserialization will succeed + let example_index_part = IndexPart::new( + HashMap::new(), + "0/16960E8".parse::().unwrap(), + TimelineMetadata::example(), + ); + + let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); + + let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); + let remote_timeline_dir = test_state.remote_fs_dir.join( + timeline_path + .strip_prefix(&test_state.harness.conf.workdir) + .unwrap(), + ); + + std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); + + let index_path = test_state.remote_fs_dir.join( + remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(), + ); + eprintln!("Writing {}", index_path.display()); + std::fs::write(&index_path, index_part_bytes).unwrap(); + example_index_part + } + + /// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its + /// index, the IndexPart returned is equal to `expected` + async fn assert_got_index_part( + test_state: &TestSetup, + get_generation: Generation, + expected: &IndexPart, + ) { + let client = test_state.build_client(get_generation); + + let download_r = client + .download_index_file() + .await + .expect("download should always succeed"); + assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); + match download_r { + MaybeDeletedIndexPart::IndexPart(index_part) => { + assert_eq!(&index_part, expected); + } + MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"), + } + } + + #[tokio::test] + async fn index_part_download_simple() -> anyhow::Result<()> { + let test_state = TestSetup::new("index_part_download_simple").await.unwrap(); + + // Simple case: we are in generation N, load the index from generation N - 1 + let generation_n = 5; + let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await; + + assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await; + + Ok(()) + } + + #[tokio::test] + async fn index_part_download_ordering() -> anyhow::Result<()> { + let test_state = TestSetup::new("index_part_download_ordering") + .await + .unwrap(); + + // A generation-less IndexPart exists in the bucket, we should find it + let generation_n = 5; + let injected_none = inject_index_part(&test_state, Generation::none()).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await; + + // If a more recent-than-none generation exists, we should prefer to load that + let injected_1 = inject_index_part(&test_state, Generation::new(1)).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; + + // If a more-recent-than-me generation exists, we should decline to load that + let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; + + Ok(()) + } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index dc8d87b9e186..b614abfe3ba1 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -19,7 +19,7 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; @@ -219,13 +219,13 @@ pub async fn list_remote_timelines( Ok(timeline_ids) } -pub(super) async fn download_index_part( +async fn do_download_index_part( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, - generation: Generation, + index_generation: Generation, ) -> Result { - let remote_path = remote_index_path(tenant_id, timeline_id, generation); + let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); let index_part_bytes = download_retry( || async { @@ -252,6 +252,104 @@ pub(super) async fn download_index_part( Ok(index_part) } +pub(super) async fn download_index_part( + storage: &GenericRemoteStorage, + tenant_id: &TenantId, + timeline_id: &TimelineId, + my_generation: Generation, +) -> Result { + if my_generation.is_none() { + // Operating without generations: just fetch the generation-less path + return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + } + + let previous_gen = my_generation.previous(); + let r_previous = do_download_index_part(storage, tenant_id, timeline_id, previous_gen).await; + + match r_previous { + Ok(index_part) => { + tracing::debug!("Found index_part from previous generation {previous_gen:?}"); + return Ok(index_part); + } + Err(e) => { + if matches!(e, DownloadError::NotFound) { + tracing::debug!("No index_part found from previous generation {previous_gen:?}, falling back to listing"); + } else { + return Err(e); + } + } + }; + + /// Given the key of an index, parse out the generation part of the name + fn parse_generation(path: RemotePath) -> Option { + let file_name = match path.get_path().file_name() { + Some(f) => f, + None => { + // Unexpected: we should be seeing index_part.json paths only + tracing::warn!("Malformed index key {0}", path); + return None; + } + }; + + let file_name_str = match file_name.to_str() { + Some(s) => s, + None => { + tracing::warn!("Malformed index key {0}", path); + return None; + } + }; + + match file_name_str.split_once('-') { + Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16) + .map(Generation::new) + .ok(), + None => None, + } + } + + // Fallback: we did not find an index_part.json from the previous generation, so + // we will list all the index_part objects and pick the most recent. + let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); + let indices = backoff::retry( + || async { storage.list_files(Some(&index_prefix)).await }, + |_| false, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "listing index_part files", + // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) + backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error { + unreachable!() + }), + ) + .await + .map_err(DownloadError::Other)?; + + let mut generations: Vec<_> = indices + .into_iter() + .filter_map(parse_generation) + .filter(|g| g <= &my_generation) + .collect(); + + generations.sort(); + match generations.last() { + Some(g) => { + tracing::debug!( + "Found index_part in generation {g:?} (my generation {my_generation:?})" + ); + do_download_index_part(storage, tenant_id, timeline_id, *g).await + } + None => { + // This is not an error: the timeline may be newly created, or we may be + // upgrading and have no historical index_part with a generation suffix. + // Fall back to trying to load the un-suffixed index_part.json. + tracing::info!( + "No index_part.json-* found when loading {tenant_id}/{timeline_id} in generation {my_generation:?}" + ); + do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await + } + } +} + /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network From 885537a2233efd8ace735bf45d254bd02a6de905 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Aug 2023 10:34:12 +0100 Subject: [PATCH 08/44] control_plane: add `attachement_service`, a mock control plane --- Cargo.lock | 2 + control_plane/Cargo.toml | 2 + control_plane/src/bin/attachment_service.rs | 260 ++++++++++++++++++++ 3 files changed, 264 insertions(+) create mode 100644 control_plane/src/bin/attachment_service.rs diff --git a/Cargo.lock b/Cargo.lock index 8198fe442744..45da0634aa16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1029,6 +1029,7 @@ dependencies = [ "comfy-table", "compute_api", "git-version", + "hyper", "nix 0.26.2", "once_cell", "pageserver_api", @@ -1044,6 +1045,7 @@ dependencies = [ "storage_broker", "tar", "thiserror", + "tokio", "toml", "tracing", "url", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index d2c99c5f364e..5ea99cb07eb7 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,7 @@ git-version.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true +hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } serde.workspace = true @@ -20,6 +21,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +tokio.workspace = true url.workspace = true # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api # instead, so that recompile times are better. diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs new file mode 100644 index 000000000000..890ba56ac832 --- /dev/null +++ b/control_plane/src/bin/attachment_service.rs @@ -0,0 +1,260 @@ +/// The attachment service mimics the aspects of the control plane API +/// that are required for a pageserver to operate. +/// +/// This enables running & testing pageservers without a full-blown +/// deployment of the Neon cloud platform. +/// +use anyhow::anyhow; +use clap::Parser; +use hyper::StatusCode; +use hyper::{Body, Request, Response}; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::{collections::HashMap, sync::Arc}; +use utils::logging::{self, LogFormat}; + +use utils::{ + http::{ + endpoint::{self}, + error::ApiError, + json::{json_request, json_response}, + RequestExt, RouterBuilder, + }, + id::{NodeId, TenantId}, + tcp_listener, +}; + +use pageserver_api::control_api::{ + HexTenantId, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateResponse, ValidateResponseTenant, +}; + +use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse}; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(short, long)] + listen: String, + + #[arg(short, long)] + path: PathBuf, +} + +// The persistent state of each Tenant +#[derive(Serialize, Deserialize, Clone)] +struct TenantState { + // Currently attached pageserver + pageserver: Option, + + // Latest generation number: next time we attach, increment this + // and use the incremented number when attaching + generation: u32, +} + +fn to_hex_map(input: &HashMap, serializer: S) -> Result +where + S: serde::Serializer, + V: Clone + Serialize, +{ + eprintln!("to_hex_map"); + let transformed = input.iter().map(|(k, v)| (HexTenantId::new(*k), v.clone())); + + transformed + .collect::>() + .serialize(serializer) +} + +fn from_hex_map<'de, D, V>(deserializer: D) -> Result, D::Error> +where + D: serde::de::Deserializer<'de>, + V: Deserialize<'de>, +{ + eprintln!("from_hex_map"); + let hex_map = HashMap::::deserialize(deserializer)?; + + Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect()) +} + +// Top level state available to all HTTP handlers +#[derive(Serialize, Deserialize)] +struct PersistentState { + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + tenants: HashMap, + + #[serde(skip)] + path: PathBuf, +} + +impl PersistentState { + async fn save(&self) -> anyhow::Result<()> { + let bytes = serde_json::to_vec(self)?; + tokio::fs::write(&self.path, &bytes).await?; + + Ok(()) + } + + async fn load(path: &Path) -> anyhow::Result { + let bytes = tokio::fs::read(path).await?; + let mut decoded = serde_json::from_slice::(&bytes)?; + decoded.path = path.to_owned(); + Ok(decoded) + } + + async fn load_or_new(path: &Path) -> Self { + match Self::load(path).await { + Ok(s) => s, + Err(e) => { + tracing::info!( + "Creating new state file at {0} (load returned {e})", + path.to_string_lossy() + ); + Self { + tenants: HashMap::new(), + path: path.to_owned(), + } + } + } + } +} + +/// State available to HTTP request handlers +#[derive(Clone)] +struct State { + inner: Arc>, +} + +impl State { + fn new(persistent_state: PersistentState) -> State { + Self { + inner: Arc::new(tokio::sync::RwLock::new(persistent_state)), + } + } +} + +#[inline(always)] +fn get_state(request: &Request) -> &State { + request + .data::>() + .expect("unknown state type") + .as_ref() +} + +/// Pageserver calls into this on startup, to learn which tenants it should attach +async fn handle_re_attach(mut req: Request) -> Result, ApiError> { + let reattach_req = json_request::(&mut req).await?; + + let state = get_state(&req).inner.clone(); + let mut locked = state.write().await; + + let mut response = ReAttachResponse { + tenants: Vec::new(), + }; + for (t, state) in &mut locked.tenants { + if state.pageserver == Some(reattach_req.node_id) { + state.generation += 1; + response.tenants.push(ReAttachResponseTenant { + id: HexTenantId::new(*t), + generation: state.generation, + }); + } + } + + locked.save().await.map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + +/// Pageserver calls into this before doing deletions, to confirm that it still +/// holds the latest generation for the tenants with deletions enqueued +async fn handle_validate(mut req: Request) -> Result, ApiError> { + let validate_req = json_request::(&mut req).await?; + + let state = get_state(&req).inner.clone(); + let locked = state.read().await; + + let mut response = ValidateResponse { + tenants: Vec::new(), + }; + + for req_tenant in validate_req.tenants { + if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) { + let valid = tenant_state.generation == req_tenant.gen; + response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid, + }); + } + } + + json_response(StatusCode::OK, response) +} +/// Call into this before attaching a tenant to a pageserver, to acquire a generation number +/// (in the real control plane this is unnecessary, because the same program is managing +/// generation numbers and doing attachments). +async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { + let attach_req = json_request::(&mut req).await?; + + let state = get_state(&req).inner.clone(); + let mut locked = state.write().await; + + let tenant_state = locked + .tenants + .entry(attach_req.tenant_id.take()) + .or_insert_with(|| TenantState { + pageserver: attach_req.pageserver_id, + generation: 0, + }); + + if attach_req.pageserver_id.is_some() { + tenant_state.generation += 1; + } + let generation = tenant_state.generation; + + locked.save().await.map_err(ApiError::InternalServerError)?; + + json_response( + StatusCode::OK, + AttachHookResponse { + gen: attach_req.pageserver_id.map(|_| generation), + }, + ) +} + +fn make_router(persistent_state: PersistentState) -> RouterBuilder { + endpoint::make_router() + .data(Arc::new(State::new(persistent_state))) + .post("/re-attach", handle_re_attach) + .post("/validate", handle_validate) + .post("/attach_hook", handle_attach_hook) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + )?; + + let args = Cli::parse(); + tracing::info!( + "Starting, state at {}, listening on {}", + args.path.to_string_lossy(), + args.listen + ); + + let persistent_state = PersistentState::load_or_new(&args.path).await; + + let http_listener = tcp_listener::bind(&args.listen)?; + let router = make_router(persistent_state) + .build() + .map_err(|err| anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?.serve(service); + + tracing::info!("Serving on {0}", args.listen.as_str()); + server.await?; + + Ok(()) +} From 08fac6734733c6008badca954f0f3a969875b29d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Aug 2023 13:59:17 +0100 Subject: [PATCH 09/44] neon_local: manage attachment_service --- control_plane/src/attachment_service.rs | 56 +++++++++++++++++++++++++ control_plane/src/bin/neon_local.rs | 53 +++++++++++++++++++++++ control_plane/src/lib.rs | 1 + control_plane/src/local_env.rs | 8 ++++ control_plane/src/pageserver.rs | 7 ++++ 5 files changed, 125 insertions(+) create mode 100644 control_plane/src/attachment_service.rs diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs new file mode 100644 index 000000000000..d4015186146c --- /dev/null +++ b/control_plane/src/attachment_service.rs @@ -0,0 +1,56 @@ +use std::{path::PathBuf, process::Child}; + +use crate::{background_process, local_env::LocalEnv}; + +pub struct AttachmentService { + env: LocalEnv, + listen: String, + path: PathBuf, +} + +const COMMAND: &str = "attachment_service"; + +impl AttachmentService { + pub fn from_env(env: &LocalEnv) -> Self { + let path = env.base_data_dir.join("attachments.json"); + + // Makes no sense to construct this if pageservers aren't going to use it: assume + // pageservers have control plane API set + let listen_url = env.pageserver.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + Self { + env: env.clone(), + path, + listen, + } + } + + fn pid_file(&self) -> PathBuf { + self.env.base_data_dir.join("attachment_service.pid") + } + + pub fn start(&self) -> anyhow::Result { + let path_str = self.path.to_string_lossy(); + + background_process::start_process( + COMMAND, + &self.env.base_data_dir, + &self.env.attachment_service_bin(), + ["-l", &self.listen, "-p", &path_str], + [], + background_process::InitialPidFile::Create(&self.pid_file()), + // TODO: a real status check + || Ok(true), + ) + } + + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file()) + } +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 2af79bed905a..7dc659cd6cad 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use compute_api::spec::ComputeMode; +use control_plane::attachment_service::AttachmentService; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; @@ -43,6 +44,8 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "15"; +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; + fn default_conf() -> String { format!( r#" @@ -56,11 +59,13 @@ listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' pg_auth_type = '{trust_auth}' http_auth_type = '{trust_auth}' +control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' [[safekeepers]] id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} + "#, trust_auth = AuthType::Trust, ) @@ -107,6 +112,7 @@ fn main() -> Result<()> { "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), + "attachment_service" => handle_attachment_service(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), "endpoint" => handle_endpoint(sub_args, &env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), @@ -827,6 +833,33 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } +fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let svc = AttachmentService::from_env(env); + match sub_match.subcommand() { + Some(("start", _start_match)) => { + if let Err(e) = svc.start() { + eprintln!("start failed: {e}"); + exit(1); + } + } + + Some(("stop", stop_match)) => { + let immediate = stop_match + .get_one::("stop-mode") + .map(|s| s.as_str()) + == Some("immediate"); + + if let Err(e) = svc.stop(immediate) { + eprintln!("stop failed: {}", e); + exit(1); + } + } + Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), + None => bail!("no attachment_service subcommand provided"), + } + Ok(()) +} + fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) @@ -907,6 +940,13 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow broker::start_broker_process(env)?; + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.start() { + eprintln!("attachment_service start failed: {:#}", e); + try_stop_all(env, true); + exit(1); + } + let pageserver = PageServerNode::from_env(env); if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e); @@ -965,6 +1005,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { if let Err(e) = broker::stop_broker_process(env) { eprintln!("neon broker stop failed: {e:#}"); } + + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.stop(immediate) { + eprintln!("attachment service stop failed: {e:#}"); + } } fn cli() -> Command { @@ -1148,6 +1193,14 @@ fn cli() -> Command { .arg(stop_mode_arg.clone())) .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) ) + .subcommand( + Command::new("attachment_service") + .arg_required_else_help(true) + .about("Manage attachment_service") + .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) + .subcommand(Command::new("stop").about("Stop local pageserver") + .arg(stop_mode_arg.clone())) + ) .subcommand( Command::new("safekeeper") .arg_required_else_help(true) diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index a773b8dcc392..7592880402b6 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -7,6 +7,7 @@ // local installations. // +pub mod attachment_service; mod background_process; pub mod broker; pub mod endpoint; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9e42c2e333a7..0215ab1bb5e7 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -118,6 +118,9 @@ pub struct PageServerConf { // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + + // Control plane location + pub control_plane_api: Option, } impl Default for PageServerConf { @@ -128,6 +131,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + control_plane_api: None, } } } @@ -202,6 +206,10 @@ impl LocalEnv { self.neon_distrib_dir.join("pageserver") } + pub fn attachment_service_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("attachment_service") + } + pub fn safekeeper_bin(&self) -> PathBuf { self.neon_distrib_dir.join("safekeeper") } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c24ea7f7170e..c674b982c2e3 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -126,6 +126,13 @@ impl PageServerNode { broker_endpoint_param, ]; + if let Some(control_plane_api) = &self.env.pageserver.control_plane_api { + overrides.push(format!( + "control_plane_api='{}'", + control_plane_api.as_str() + )); + } + if self.env.pageserver.http_auth_type != AuthType::Trust || self.env.pageserver.pg_auth_type != AuthType::Trust { From 5e49aa5f51962c73699df429624f4ec0e0a8e7eb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Aug 2023 09:35:34 +0100 Subject: [PATCH 10/44] control_plane: implement attach hook --- control_plane/src/attachment_service.rs | 52 +++++++++++++++++++++- control_plane/src/bin/neon_local.rs | 59 ++++++++++++++----------- control_plane/src/pageserver.rs | 5 +-- libs/utils/src/id.rs | 6 ++- 4 files changed, 89 insertions(+), 33 deletions(-) diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index d4015186146c..b2c867aef8d8 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,6 +1,9 @@ -use std::{path::PathBuf, process::Child}; - use crate::{background_process, local_env::LocalEnv}; +use anyhow::anyhow; +use pageserver_api::control_api::HexTenantId; +use serde::{Deserialize, Serialize}; +use std::{path::PathBuf, process::Child}; +use utils::id::{NodeId, TenantId}; pub struct AttachmentService { env: LocalEnv, @@ -10,6 +13,17 @@ pub struct AttachmentService { const COMMAND: &str = "attachment_service"; +#[derive(Serialize, Deserialize)] +pub struct AttachHookRequest { + pub tenant_id: HexTenantId, + pub pageserver_id: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookResponse { + pub gen: Option, +} + impl AttachmentService { pub fn from_env(env: &LocalEnv) -> Self { let path = env.base_data_dir.join("attachments.json"); @@ -53,4 +67,38 @@ impl AttachmentService { pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { background_process::stop_process(immediate, COMMAND, &self.pid_file()) } + + /// Call into the attach_hook API, for use before handing out attachments to pageservers + pub fn attach_hook( + &self, + tenant_id: TenantId, + pageserver_id: NodeId, + ) -> anyhow::Result> { + use hyper::StatusCode; + + let url = self + .env + .pageserver + .control_plane_api + .clone() + .unwrap() + .join("attach_hook") + .unwrap(); + let client = reqwest::blocking::ClientBuilder::new() + .build() + .expect("Failed to construct http client"); + + let request = AttachHookRequest { + tenant_id: HexTenantId::new(tenant_id), + pageserver_id: Some(pageserver_id), + }; + + let response = client.post(url).json(&request).send()?; + if response.status() != StatusCode::OK { + return Err(anyhow!("Unexpected status {0}", response.status())); + } + + let response = response.json::()?; + Ok(response.gen) + } } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 7dc659cd6cad..2976363fa7fd 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -284,14 +284,6 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { .context("Failed to parse tenant id from the argument string") } -fn parse_generation(sub_match: &ArgMatches) -> anyhow::Result> { - sub_match - .get_one::("generation") - .map(|tenant_id| u32::from_str(tenant_id)) - .transpose() - .context("Failed to parse generation rom the argument string") -} - fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .get_one::("timeline-id") @@ -356,15 +348,25 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an } } Some(("create", create_match)) => { - let initial_tenant_id = parse_tenant_id(create_match)?; - let generation = parse_generation(create_match)?; let tenant_conf: HashMap<_, _> = create_match .get_many::("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = - pageserver.tenant_create(initial_tenant_id, generation, tenant_conf)?; - println!("tenant {new_tenant_id} successfully created on the pageserver"); + + // If tenant ID was not specified, generate one + let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate()); + + let generation = if env.pageserver.control_plane_api.is_some() { + // We must register the tenant with the attachment service, so + // that when the pageserver restarts, it will be re-attached. + let attachment_service = AttachmentService::from_env(env); + attachment_service.attach_hook(tenant_id, env.pageserver.id)? + } else { + None + }; + + pageserver.tenant_create(tenant_id, generation, tenant_conf)?; + println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; @@ -374,7 +376,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .context("Failed to parse postgres version from the argument string")?; let timeline_info = pageserver.timeline_create( - new_tenant_id, + tenant_id, new_timeline_id, None, None, @@ -385,17 +387,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), - new_tenant_id, + tenant_id, new_timeline_id, )?; println!( - "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", + "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}", ); if create_match.get_flag("set-default") { - println!("Setting tenant {new_tenant_id} as a default one"); - env.default_tenant_id = Some(new_tenant_id); + println!("Setting tenant {tenant_id} as a default one"); + env.default_tenant_id = Some(tenant_id); } } Some(("set-default", set_default_match)) => { @@ -940,11 +942,14 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow broker::start_broker_process(env)?; - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start() { - eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); - exit(1); + // Only start the attachment service if the pageserver is configured to need it + if env.pageserver.control_plane_api.is_some() { + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.start() { + eprintln!("attachment_service start failed: {:#}", e); + try_stop_all(env, true); + exit(1); + } } let pageserver = PageServerNode::from_env(env); @@ -1006,9 +1011,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { eprintln!("neon broker stop failed: {e:#}"); } - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.stop(immediate) { - eprintln!("attachment service stop failed: {e:#}"); + if env.pageserver.control_plane_api.is_some() { + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.stop(immediate) { + eprintln!("attachment service stop failed: {e:#}"); + } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index c674b982c2e3..eecc2479ff84 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -323,7 +323,7 @@ impl PageServerNode { pub fn tenant_create( &self, - new_tenant_id: Option, + new_tenant_id: TenantId, generation: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { @@ -390,9 +390,6 @@ impl PageServerNode { .context("Failed to parse 'gc_feedback' as bool")?, }; - // If tenant ID was not specified, generate one - let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate()); - let request = models::TenantCreateRequest { new_tenant_id, generation, diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 2ce92ee914c6..b93146e3a0bf 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -50,7 +50,7 @@ impl Id { Id::from(tli_buf) } - fn hex_encode(&self) -> String { + pub fn hex_encode(&self) -> String { static HEX: &[u8] = b"0123456789abcdef"; let mut buf = vec![0u8; self.0.len() * 2]; @@ -133,6 +133,10 @@ macro_rules! id_newtype { pub const fn from_array(b: [u8; 16]) -> Self { $t(Id(b)) } + + pub fn hex_encode(&self) -> String { + self.0.hex_encode() + } } impl FromStr for $t { From 2ebe12ebe333621aff3bcb5d3fb82ab48252784d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Aug 2023 14:26:59 +0100 Subject: [PATCH 11/44] remote_storage: fix LocalFs list_files --- libs/remote_storage/src/local_fs.rs | 44 +++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f1095ad8b8fe..9aef0cb66bcc 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -148,21 +148,53 @@ impl RemoteStorage for LocalFs { Some(folder) => folder.with_base(&self.storage_root), None => self.storage_root.clone(), }; - let mut files = vec![]; - let mut directory_queue = vec![full_path.clone()]; + // If we were given a directory, we may use it as our starting point. + // Otherwise, we must go up to the parent directory. This is because + // S3 object list prefixes can be arbitrary strings, but when reading + // the local filesystem we need a directory to start calling read_dir on. + let mut initial_dir = full_path.clone(); + match fs::metadata(full_path.clone()).await { + Err(e) => { + // It's not a file that exists: strip the prefix back to the parent directory + if matches!(e.kind(), ErrorKind::NotFound) { + initial_dir.pop(); + } + } + Ok(meta) => { + if !meta.is_dir() { + // It's not a directory: strip back to the parent + initial_dir.pop(); + } + } + } + + // Note that PathBuf starts_with only considers full path segments, but + // object prefixes are arbitrary strings, so we need the strings for doing + // starts_with later. + let prefix = full_path.to_string_lossy(); + + let mut files = vec![]; + let mut directory_queue = vec![initial_dir.clone()]; while let Some(cur_folder) = directory_queue.pop() { let mut entries = fs::read_dir(cur_folder.clone()).await?; while let Some(entry) = entries.next_entry().await? { let file_name: PathBuf = entry.file_name().into(); let full_file_name = cur_folder.clone().join(&file_name); - let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); - files.push(file_remote_path.clone()); - if full_file_name.is_dir() { - directory_queue.push(full_file_name); + if full_file_name + .to_str() + .map(|s| s.starts_with(prefix.as_ref())) + .unwrap_or(false) + { + let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); + files.push(file_remote_path.clone()); + if full_file_name.is_dir() { + directory_queue.push(full_file_name); + } } } } + Ok(files) } From 9deaa1b28e7b808a101c59ff3f8882239b400970 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Aug 2023 14:31:38 +0100 Subject: [PATCH 12/44] tests: enable generations in neon_fixture --- test_runner/fixtures/neon_fixtures.py | 84 ++++++++++++++++++- test_runner/fixtures/pageserver/http.py | 15 +++- .../regress/test_pageserver_restart.py | 5 +- test_runner/regress/test_remote_storage.py | 17 ++-- 4 files changed, 108 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b2cd0fe96833..9de7e90f506d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -428,6 +428,7 @@ def __init__( preserve_database_files: bool = False, initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, + enable_generations: bool = False, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -454,6 +455,7 @@ def __init__( self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() + self.enable_generations = False def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -713,6 +715,9 @@ def __exit__( sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) + if self.env.attachment_service is not None: + self.env.attachment_service.stop(immediate=True) + cleanup_error = None try: self.cleanup_remote_storage() @@ -766,6 +771,8 @@ class NeonEnv: the tenant id """ + PAGESERVER_ID = 1 + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override @@ -789,6 +796,14 @@ def __init__(self, config: NeonEnvBuilder): self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline + if config.enable_generations: + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) + else: + self.control_plane_api = None + self.attachment_service = None + # Create a config file corresponding to the options toml = textwrap.dedent( f""" @@ -814,7 +829,7 @@ def __init__(self, config: NeonEnvBuilder): toml += textwrap.dedent( f""" [pageserver] - id=1 + id={self.PAGESERVER_ID} listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' pg_auth_type = '{pg_auth_type}' @@ -822,6 +837,13 @@ def __init__(self, config: NeonEnvBuilder): """ ) + if self.control_plane_api is not None: + toml += textwrap.dedent( + f""" + control_plane_api = '{self.control_plane_api}' + """ + ) + # Create a corresponding NeonPageserver object self.pageserver = NeonPageserver( self, port=pageserver_port, config_override=config.pageserver_config_override @@ -868,6 +890,9 @@ def __init__(self, config: NeonEnvBuilder): def start(self): # Start up broker, pageserver and all safekeepers self.broker.try_start() + + if self.attachment_service is not None: + self.attachment_service.start() self.pageserver.start() for safekeeper in self.safekeepers: @@ -1289,6 +1314,16 @@ def init( res.check_returncode() return res + def attachment_service_start(self): + cmd = ["attachment_service", "start"] + return self.raw_cli(cmd) + + def attachment_service_stop(self, immediate: bool): + cmd = ["attachment_service", "stop"] + if immediate: + cmd.extend(["-m", "immediate"]) + return self.raw_cli(cmd) + def pageserver_start( self, overrides: Tuple[str, ...] = (), @@ -1470,6 +1505,33 @@ class ComputeCtl(AbstractNeonCli): COMMAND = "compute_ctl" +class NeonAttachmentService: + def __init__(self, env: NeonEnv): + self.env = env + + def start(self): + self.env.neon_cli.attachment_service_start() + self.running = True + return self + + def stop(self, immediate: bool = False) -> "NeonAttachmentService": + if self.running: + self.env.neon_cli.attachment_service_stop(immediate) + self.running = False + return self + + def __enter__(self) -> "NeonAttachmentService": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop(immediate=True) + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. @@ -1633,6 +1695,26 @@ def log_contains(self, pattern: str) -> Optional[str]: return None + def tenant_attach( + self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + ): + """ + Tenant attachment passes through here to acquire a generation number before proceeding + to call into the pageserver HTTP client. + """ + if self.env.attachment_service is not None: + response = requests.post( + f"{self.env.control_plane_api}/attach_hook", + json={"tenant_id": str(tenant_id), "pageserver_id": self.env.PAGESERVER_ID}, + ) + response.raise_for_status() + generation = response.json()["gen"] + else: + generation = None + + client = self.env.pageserver.http_client() + return client.tenant_attach(tenant_id, config, config_null, generation=generation) + def append_pageserver_param_overrides( params_to_update: List[str], diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index a179ebdd09ae..9373073abf2c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -186,18 +186,25 @@ def tenant_create( return TenantId(new_tenant_id) def tenant_attach( - self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + self, + tenant_id: TenantId, + config: None | Dict[str, Any] = None, + config_null: bool = False, + generation: Optional[int] = None, ): if config_null: assert config is None - body = "null" + body: Any = None else: # null-config is prohibited by the API config = config or {} - body = json.dumps({"config": config}) + body = {"config": config} + if generation is not None: + body.update({"generation": generation}) + res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=body, + data=json.dumps(body), headers={"Content-Type": "application/json"}, ) self.verbose_error(res) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 1e41ebd15b59..1c61719e5f2c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -7,7 +7,10 @@ # Test restarting page server, while safekeeper and compute node keep # running. -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("generations", [True, False]) +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): + neon_env_builder.enable_generations = generations + env = neon_env_builder.init_start() env.neon_cli.create_branch("test_pageserver_restart") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0bd365efaab2..8d1caae30ad3 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -52,9 +52,9 @@ # # The tests are done for all types of remote storage pageserver supports. @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +@pytest.mark.parametrize("generations", [True, False]) def test_remote_storage_backup_and_restore( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, generations: bool ): # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, # and this test needs SK to write data to pageserver, so it will be visible @@ -65,6 +65,8 @@ def test_remote_storage_backup_and_restore( test_name="test_remote_storage_backup_and_restore", ) + neon_env_builder.enable_generations = generations + # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. @@ -155,7 +157,8 @@ def test_remote_storage_backup_and_restore( # background task to load the tenant. In that background task, # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. - client.tenant_attach(tenant_id) + # client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) assert tenant_info["attachment_status"] == { @@ -165,7 +168,7 @@ def test_remote_storage_backup_and_restore( # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. @@ -183,7 +186,7 @@ def test_remote_storage_backup_and_restore( # Ensure that the pageserver remembers that the tenant was attaching, by # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) log.info("waiting for tenant to become active. this should be quick with on-demand download") wait_until_tenant_active( @@ -364,7 +367,7 @@ def churn_while_failpoints_active(result): env.pageserver.start() client = env.pageserver.http_client() - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_active(client, tenant_id) @@ -502,7 +505,7 @@ def churn(data_pass1, data_pass2): env.pageserver.start() client = env.pageserver.http_client() - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_active(client, tenant_id) From af28cd182516e43e45b486ca19b58d320366ca67 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Aug 2023 15:29:22 +0100 Subject: [PATCH 13/44] clippy: permit many args to schedule_local_tenant_processing When we refactor mgr.rs to implement a Mgr type, this arg count will go down. --- pageserver/src/tenant/mgr.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b3fea5607d1c..9c8d86d5f669 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -278,6 +278,7 @@ pub async fn init_tenant_mgr( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_id: TenantId, From acce508387cd21c1e9b251def2b88d5a0753babc Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Aug 2023 18:43:57 +0100 Subject: [PATCH 14/44] Refactor TenantId encoding in JSON APIs --- Cargo.lock | 1 + control_plane/Cargo.toml | 1 + control_plane/src/attachment_service.rs | 6 +-- control_plane/src/bin/attachment_service.rs | 29 ++++++----- libs/pageserver_api/src/control_api.rs | 55 +++------------------ libs/utils/src/id.rs | 6 +-- pageserver/src/tenant/mgr.rs | 4 +- 7 files changed, 31 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45da0634aa16..877bbe0a38f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1029,6 +1029,7 @@ dependencies = [ "comfy-table", "compute_api", "git-version", + "hex", "hyper", "nix 0.26.2", "once_cell", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 5ea99cb07eb7..ec685915f945 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,7 @@ git-version.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true +hex.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index b2c867aef8d8..a41c49e9585d 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,6 +1,5 @@ use crate::{background_process, local_env::LocalEnv}; use anyhow::anyhow; -use pageserver_api::control_api::HexTenantId; use serde::{Deserialize, Serialize}; use std::{path::PathBuf, process::Child}; use utils::id::{NodeId, TenantId}; @@ -15,7 +14,8 @@ const COMMAND: &str = "attachment_service"; #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { - pub tenant_id: HexTenantId, + #[serde(with = "hex")] + pub tenant_id: TenantId, pub pageserver_id: Option, } @@ -89,7 +89,7 @@ impl AttachmentService { .expect("Failed to construct http client"); let request = AttachHookRequest { - tenant_id: HexTenantId::new(tenant_id), + tenant_id, pageserver_id: Some(pageserver_id), }; diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index 890ba56ac832..a0aa23a6e7ac 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -6,6 +6,7 @@ /// use anyhow::anyhow; use clap::Parser; +use hex::FromHex; use hyper::StatusCode; use hyper::{Body, Request, Response}; use serde::{Deserialize, Serialize}; @@ -25,8 +26,8 @@ use utils::{ }; use pageserver_api::control_api::{ - HexTenantId, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, }; use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse}; @@ -58,11 +59,10 @@ where S: serde::Serializer, V: Clone + Serialize, { - eprintln!("to_hex_map"); - let transformed = input.iter().map(|(k, v)| (HexTenantId::new(*k), v.clone())); + let transformed = input.iter().map(|(k, v)| (hex::encode(&k), v.clone())); transformed - .collect::>() + .collect::>() .serialize(serializer) } @@ -71,10 +71,15 @@ where D: serde::de::Deserializer<'de>, V: Deserialize<'de>, { - eprintln!("from_hex_map"); - let hex_map = HashMap::::deserialize(deserializer)?; - - Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect()) + let hex_map = HashMap::::deserialize(deserializer)?; + hex_map + .into_iter() + .map(|(k, v)| { + TenantId::from_hex(k) + .map(|k| (k, v)) + .map_err(serde::de::Error::custom) + }) + .collect() } // Top level state available to all HTTP handlers @@ -155,7 +160,7 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE if state.pageserver == Some(reattach_req.node_id) { state.generation += 1; response.tenants.push(ReAttachResponseTenant { - id: HexTenantId::new(*t), + id: *t, generation: state.generation, }); } @@ -179,7 +184,7 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) { + if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { let valid = tenant_state.generation == req_tenant.gen; response.tenants.push(ValidateResponseTenant { id: req_tenant.id, @@ -201,7 +206,7 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap let tenant_state = locked .tenants - .entry(attach_req.tenant_id.take()) + .entry(attach_req.tenant_id) .or_insert_with(|| TenantState { pageserver: attach_req.pageserver_id, generation: 0, diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 1f663034f597..45d78aa05786 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -1,4 +1,3 @@ -use hex::FromHex; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; @@ -14,7 +13,8 @@ pub struct ReAttachRequest { #[derive(Serialize, Deserialize)] pub struct ReAttachResponseTenant { - pub id: HexTenantId, + #[serde(with = "hex")] + pub id: TenantId, pub generation: u32, } @@ -25,7 +25,8 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { - pub id: HexTenantId, + #[serde(with = "hex")] + pub id: TenantId, pub gen: u32, } @@ -41,51 +42,7 @@ pub struct ValidateResponse { #[derive(Serialize, Deserialize)] pub struct ValidateResponseTenant { - pub id: HexTenantId, + #[serde(with = "hex")] + pub id: TenantId, pub valid: bool, } - -/// Serialization helper: TenantId's serialization is an array of u8, which is rather unfriendly -/// for human readable encodings like JSON. -/// This class wraps it in serialization that is just the hex string representation, -/// which is more compact and readable in JSON. -#[derive(Eq, PartialEq, Clone, Hash)] -pub struct HexTenantId(TenantId); - -impl HexTenantId { - pub fn new(t: TenantId) -> Self { - Self(t) - } - - pub fn take(self) -> TenantId { - self.0 - } -} - -impl AsRef for HexTenantId { - fn as_ref(&self) -> &TenantId { - &self.0 - } -} - -impl Serialize for HexTenantId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let hex = self.0.hex_encode(); - serializer.collect_str(&hex) - } -} - -impl<'de> Deserialize<'de> for HexTenantId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let string = String::deserialize(deserializer)?; - TenantId::from_hex(string) - .map(HexTenantId::new) - .map_err(|e| serde::de::Error::custom(format!("{e}"))) - } -} diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index b93146e3a0bf..2ce92ee914c6 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -50,7 +50,7 @@ impl Id { Id::from(tli_buf) } - pub fn hex_encode(&self) -> String { + fn hex_encode(&self) -> String { static HEX: &[u8] = b"0123456789abcdef"; let mut buf = vec![0u8; self.0.len() * 2]; @@ -133,10 +133,6 @@ macro_rules! id_newtype { pub const fn from_array(b: [u8; 16]) -> Self { $t(Id(b)) } - - pub fn hex_encode(&self) -> String { - self.0.hex_encode() - } } impl FromStr for $t { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9c8d86d5f669..969956d61ce3 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,7 +2,7 @@ //! page server. use hyper::StatusCode; -use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse}; +use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; use std::path::Path; @@ -217,7 +217,7 @@ pub async fn init_tenant_mgr( let generation = if let Some(generations) = &tenant_generations { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) { + if let Some(gen) = generations.get(&tenant_id) { Generation::new(*gen) } else { info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id); From 1856d7576cc17bb18501487030dbf90814535b53 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Aug 2023 19:20:24 +0100 Subject: [PATCH 15/44] clippy --- control_plane/src/bin/attachment_service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index a0aa23a6e7ac..d1f99210ac28 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -59,7 +59,7 @@ where S: serde::Serializer, V: Clone + Serialize, { - let transformed = input.iter().map(|(k, v)| (hex::encode(&k), v.clone())); + let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); transformed .collect::>() From e2bf591358ebc1528c87575f06a0d300bbf58b07 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:00:35 +0100 Subject: [PATCH 16/44] Update pageserver/src/tenant/remote_timeline_client/download.rs Co-authored-by: Christian Schwarz --- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b614abfe3ba1..cd646b4dcfad 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -294,7 +294,7 @@ pub(super) async fn download_index_part( let file_name_str = match file_name.to_str() { Some(s) => s, None => { - tracing::warn!("Malformed index key {0}", path); + tracing::warn!("Malformed index key {0:?}", path); return None; } }; From fd2ea25b1e7a6b313e91053c33c82a7aea2bca68 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:01:22 +0100 Subject: [PATCH 17/44] Update pageserver/src/tenant/remote_timeline_client.rs Co-authored-by: Christian Schwarz --- pageserver/src/tenant/remote_timeline_client.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index aeca0558d16f..1d958e0ed635 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1870,10 +1870,12 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed + let example_metadata = TimelineMetadata::example(); let example_index_part = IndexPart::new( HashMap::new(), - "0/16960E8".parse::().unwrap(), - TimelineMetadata::example(), + example_metadata.disk_consistent_lsn(), + example_metadata, + ); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); From 5fc08747190eff381f2c5f686ef4f65b9ad47b1d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:01:37 +0100 Subject: [PATCH 18/44] Update pageserver/src/tenant/remote_timeline_client.rs Co-authored-by: Christian Schwarz --- pageserver/src/tenant/remote_timeline_client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1d958e0ed635..28351a32a283 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1947,7 +1947,7 @@ mod tests { let injected_1 = inject_index_part(&test_state, Generation::new(1)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; - // If a more-recent-than-me generation exists, we should decline to load that + // If a more-recent-than-me generation exists, we should ignore it. let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; From 891f7c4ccc78505f9d7ea255ca30787e62f68852 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 10:46:40 +0100 Subject: [PATCH 19/44] More consistent handling of generations in pageserver API - Create gives a 400 if generation is omitted and control_plane_api is set - Load can take one now --- libs/pageserver_api/src/models.rs | 9 +++++ pageserver/src/http/routes.rs | 55 ++++++++++++++++++------------- pageserver/src/tenant/mgr.rs | 5 ++- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5f92556f6408..340463afa79f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -201,6 +201,15 @@ pub struct TenantCreateRequest { pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[serde_as] +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLoadRequest { + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, +} + impl std::ops::Deref for TenantCreateRequest { type Target = TenantConfig; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c1dfe435e499..72567474cf1e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -8,7 +8,9 @@ use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; -use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest}; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest, +}; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; @@ -485,19 +487,8 @@ async fn tenant_attach_handler( let state = get_state(&request); - let generation = if state.conf.control_plane_api.is_some() { - // If we have been configured with a control plane URI, then generations are - // mandatory, as we will attempt to re-attach on startup. - maybe_body - .as_ref() - .and_then(|tar| tar.generation) - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - )))? - } else { - Generation::none() - }; + let generation = + get_request_generation(state, maybe_body.as_ref().map(|r| r.generation).flatten())?; if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant( @@ -555,7 +546,7 @@ async fn tenant_detach_handler( } async fn tenant_load_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; @@ -563,10 +554,19 @@ async fn tenant_load_handler( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let maybe_body: Option = json_request_or_empty_body(&mut request).await?; + let state = get_state(&request); + + // The /load request is only usable when control_plane_api is not set. Once it is set, callers + // should always use /attach instead. + let generation = + get_request_generation(state, maybe_body.as_ref().map(|r| r.generation).flatten())?; + mgr::load_tenant( state.conf, tenant_id, + generation, state.broker_client.clone(), state.remote_storage.clone(), &ctx, @@ -868,6 +868,21 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } +/// Helper for requests that may take a generation, which is mandatory +/// when control_plane_api is set, but otherwise defaults to Generation::none() +fn get_request_generation(state: &State, req_gen: Option) -> Result { + if state.conf.control_plane_api.is_some() { + req_gen + .map(Generation::new) + .ok_or(ApiError::BadRequest(anyhow!( + "generation attribute missing" + ))) + } else { + // Legacy mode: all tenants operate with no generation + Ok(Generation::none()) + } +} + async fn tenant_create_handler( mut request: Request, _cancel: CancellationToken, @@ -884,15 +899,11 @@ async fn tenant_create_handler( let tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - // TODO: make generation mandatory here once control plane supports it. - let generation = request_data - .generation - .map(Generation::new) - .unwrap_or(Generation::none()); + let state = get_state(&request); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let generation = get_request_generation(state, request_data.generation)?; - let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let new_tenant = mgr::create_tenant( state.conf, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 969956d61ce3..a09cb582f128 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -630,6 +630,7 @@ async fn detach_tenant0( pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, @@ -646,9 +647,7 @@ pub async fn load_tenant( broker_client, remote_storage, }; - // TODO: remove the `/load` API once generation support is complete: - // it becomes equivalent to attaching. - let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx) + let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; From 8ea760ad8aae3586d4e8271541538344db5e30bf Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:26:53 +0100 Subject: [PATCH 20/44] clippy --- pageserver/src/http/routes.rs | 6 ++---- pageserver/src/tenant/remote_timeline_client.rs | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72567474cf1e..286aa3dfb055 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -487,8 +487,7 @@ async fn tenant_attach_handler( let state = get_state(&request); - let generation = - get_request_generation(state, maybe_body.as_ref().map(|r| r.generation).flatten())?; + let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant( @@ -560,8 +559,7 @@ async fn tenant_load_handler( // The /load request is only usable when control_plane_api is not set. Once it is set, callers // should always use /attach instead. - let generation = - get_request_generation(state, maybe_body.as_ref().map(|r| r.generation).flatten())?; + let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; mgr::load_tenant( state.conf, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 28351a32a283..d5d0959eefa2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1875,7 +1875,6 @@ mod tests { HashMap::new(), example_metadata.disk_consistent_lsn(), example_metadata, - ); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); From 4d6b89d8c234b15b2d864a705fe3ddbcee3a9f8e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:26:58 +0100 Subject: [PATCH 21/44] refactor control plane client --- pageserver/src/control_plane_client.rs | 101 +++++++++++++++++++++++++ pageserver/src/lib.rs | 1 + pageserver/src/tenant/mgr.rs | 80 ++------------------ 3 files changed, 109 insertions(+), 73 deletions(-) create mode 100644 pageserver/src/control_plane_client.rs diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs new file mode 100644 index 000000000000..183d14fccb27 --- /dev/null +++ b/pageserver/src/control_plane_client.rs @@ -0,0 +1,101 @@ +use std::{collections::HashMap, time::Duration}; + +use hyper::StatusCode; +use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use url::Url; +use utils::{ + generation::Generation, + id::{NodeId, TenantId}, +}; + +use crate::config::PageServerConf; + +/// The Pageserver's client for using the control plane API: this is a small subset +/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) +pub(crate) struct ControlPlaneClient { + http_client: reqwest::Client, + base_url: Url, + node_id: NodeId, +} + +impl ControlPlaneClient { + /// A None return value indicates that the input `conf` object does not have control + /// plane API enabled. + pub(crate) fn new(conf: &'static PageServerConf) -> Option { + let url = match conf.control_plane_api.as_ref() { + Some(u) => u.clone(), + None => return None, + }; + + // FIXME: it's awkward that join() requires the base to have a trailing slash, makes + // it easy to get a config wrong + assert!( + url.as_str().ends_with('/'), + "control plane API needs trailing slash" + ); + + let client = reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"); + + Some(Self { + http_client: client, + base_url: url, + node_id: conf.id, + }) + } + + async fn try_re_attach( + &self, + url: Url, + request: &ReAttachRequest, + ) -> anyhow::Result { + match self.http_client.post(url).json(request).send().await { + Err(e) => Err(anyhow::Error::from(e)), + Ok(r) => { + if r.status() == StatusCode::OK { + r.json::() + .await + .map_err(anyhow::Error::from) + } else { + Err(anyhow::anyhow!("Unexpected status {}", r.status())) + } + } + } + } + + /// Block until we get a successful response + pub(crate) async fn re_attach(&self) -> anyhow::Result> { + let re_attach_path = self + .base_url + .join("re-attach") + .expect("Failed to build re-attach path"); + let request = ReAttachRequest { + node_id: self.node_id, + }; + + // TODO: we should have been passed a cancellation token, and use it to end + // this loop gracefully + loop { + let result = self.try_re_attach(re_attach_path.clone(), &request).await; + match result { + Ok(res) => { + tracing::info!( + "Received re-attach response with {0} tenants", + res.tenants.len() + ); + + return Ok(res + .tenants + .into_iter() + .map(|t| (t.id, Generation::new(t.generation))) + .collect::>()); + } + Err(e) => { + tracing::error!("Error re-attaching tenants, retrying: {e:#}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index cb20caba1fe6..3049ad6a4e8d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,6 +3,7 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; +mod control_plane_client; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a09cb582f128..e915f244cd3e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,13 +1,10 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use hyper::StatusCode; -use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; use std::path::Path; use std::sync::Arc; -use std::time::Duration; use tokio::fs; use anyhow::Context; @@ -21,6 +18,7 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; +use crate::control_plane_client::ControlPlaneClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; @@ -80,75 +78,11 @@ pub async fn init_tenant_mgr( let mut tenants = HashMap::new(); // If we are configured to use the control plane API, then it is the source of truth for what to attach - let tenant_generations = conf - .control_plane_api - .as_ref() - .map(|control_plane_api| async { - let client = reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"); - - // FIXME: it's awkward that join() requires the base to have a trailing slash, makes - // it easy to get a config wrong - assert!( - control_plane_api.as_str().ends_with('/'), - "control plane API needs trailing slash" - ); - - let re_attach_path = control_plane_api - .join("re-attach") - .expect("Failed to build re-attach path"); - let request = ReAttachRequest { node_id: conf.id }; - - // TODO: we should have been passed a cancellation token, and use it to end - // this loop gracefully - loop { - let response = match client - .post(re_attach_path.clone()) - .json(&request) - .send() - .await - { - Err(e) => Err(anyhow::Error::from(e)), - Ok(r) => { - if r.status() == StatusCode::OK { - r.json::() - .await - .map_err(anyhow::Error::from) - } else { - Err(anyhow::anyhow!("Unexpected status {}", r.status())) - } - } - }; - - match response { - Ok(res) => { - tracing::info!( - "Received re-attach response with {0} tenants", - res.tenants.len() - ); - - // TODO: do something with it - break res - .tenants - .into_iter() - .map(|t| (t.id, t.generation)) - .collect::>(); - } - Err(e) => { - tracing::error!("Error re-attaching tenants, retrying: {e:#}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - } - }); - - let tenant_generations = match tenant_generations { - Some(g) => Some(g.await), - None => { - info!("Control plane API not configured, tenant generations are disabled"); - None - } + let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf) { + Some(client.re_attach().await?) + } else { + info!("Control plane API not configured, tenant generations are disabled"); + None }; let mut dir_entries = fs::read_dir(&tenants_dir) @@ -218,7 +152,7 @@ pub async fn init_tenant_mgr( // We have a generation map: treat it as the authority for whether // this tenant is really attached. if let Some(gen) = generations.get(&tenant_id) { - Generation::new(*gen) + *gen } else { info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id); if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { From 9a5899ce911e002af3c4e77f09955fcab54494cc Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 11:32:46 +0100 Subject: [PATCH 22/44] Cancellation and exponential backoff in control plane re-attach --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/control_plane_client.rs | 29 +++++++++++++++++++++----- pageserver/src/tenant/mgr.rs | 4 +++- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 71e3a0ff3f42..bf0ad760cb82 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -388,6 +388,7 @@ fn start_pageserver( remote_storage: remote_storage.clone(), }, order, + shutdown_pageserver.clone(), ))?; BACKGROUND_RUNTIME.spawn({ diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 183d14fccb27..54788a1f1fa9 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,27 +1,36 @@ -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; use hyper::StatusCode; use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use tokio_util::sync::CancellationToken; use url::Url; use utils::{ + backoff, generation::Generation, id::{NodeId, TenantId}, }; use crate::config::PageServerConf; +// Backoffs when control plane requests do not succeed: compromise between reducing load +// on control plane, and retrying frequently when we are blocked on a control plane +// response to make progress. +const BACKOFF_INCREMENT: f64 = 0.1; +const BACKOFF_MAX: f64 = 10.0; + /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) pub(crate) struct ControlPlaneClient { http_client: reqwest::Client, base_url: Url, node_id: NodeId, + cancel: CancellationToken, } impl ControlPlaneClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. - pub(crate) fn new(conf: &'static PageServerConf) -> Option { + pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { let url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), None => return None, @@ -42,6 +51,7 @@ impl ControlPlaneClient { http_client: client, base_url: url, node_id: conf.id, + cancel: cancel.clone(), }) } @@ -74,8 +84,7 @@ impl ControlPlaneClient { node_id: self.node_id, }; - // TODO: we should have been passed a cancellation token, and use it to end - // this loop gracefully + let mut attempt = 0; loop { let result = self.try_re_attach(re_attach_path.clone(), &request).await; match result { @@ -93,7 +102,17 @@ impl ControlPlaneClient { } Err(e) => { tracing::error!("Error re-attaching tenants, retrying: {e:#}"); - tokio::time::sleep(Duration::from_secs(1)).await; + backoff::exponential_backoff( + attempt, + BACKOFF_INCREMENT, + BACKOFF_MAX, + &self.cancel, + ) + .await; + if self.cancel.is_cancelled() { + return Err(anyhow::anyhow!("Shutting down")); + } + attempt += 1; } } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e915f244cd3e..1964391b0506 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -11,6 +11,7 @@ use anyhow::Context; use once_cell::sync::Lazy; use tokio::sync::RwLock; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -71,6 +72,7 @@ pub async fn init_tenant_mgr( conf: &'static PageServerConf, resources: TenantSharedResources, init_order: InitializationOrder, + cancel: CancellationToken, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); @@ -78,7 +80,7 @@ pub async fn init_tenant_mgr( let mut tenants = HashMap::new(); // If we are configured to use the control plane API, then it is the source of truth for what to attach - let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf) { + let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { Some(client.re_attach().await?) } else { info!("Control plane API not configured, tenant generations are disabled"); From 2649114ff125671fec40bc7ca0c3fcebf25f19b7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 13:29:31 +0100 Subject: [PATCH 23/44] Fix Generation::previous and refactor suffix parsing --- libs/utils/src/generation.rs | 17 ++++++++- .../src/tenant/remote_timeline_client.rs | 24 +++++++++++++ .../tenant/remote_timeline_client/download.rs | 36 ++++--------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 53c303520964..1408c8ca703c 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -65,9 +65,24 @@ impl Generation { } } + /// `suffix` is the part after "-" in a key + /// + /// Returns None if parsing was unsuccessful + pub fn parse_suffix(suffix: &str) -> Option { + u32::from_str_radix(suffix, 16).map(Generation::new).ok() + } + pub fn previous(&self) -> Generation { match self { - Self::Valid(n) => Self::Valid(n - 1), + Self::Valid(n) => { + if *n == 0 { + // Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation + // to 0 as being "no generation". + Self::None + } else { + Self::Valid(n - 1) + } + } Self::None => Self::None, Self::Broken => panic!("Attempted to use a broken generation"), } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d5d0959eefa2..65d88be12bd3 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1425,6 +1425,30 @@ pub fn remote_index_path( .expect("Failed to construct path") } +/// Given the key of an index, parse out the generation part of the name +pub fn parse_remote_index_path(path: RemotePath) -> Option { + let file_name = match path.get_path().file_name() { + Some(f) => f, + None => { + // Unexpected: we should be seeing index_part.json paths only + tracing::warn!("Malformed index key {0}", path); + return None; + } + }; + + let file_name_str = match file_name.to_str() { + Some(s) => s, + None => { + tracing::warn!("Malformed index key {0:?}", path); + return None; + } + }; + match file_name_str.split_once('-') { + Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix), + None => None, + } +} + /// Files on the remote storage are stored with paths, relative to the workdir. /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. /// diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index cd646b4dcfad..544d474d61ef 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -19,12 +19,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; -use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES}; +use super::{ + parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, +}; static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); @@ -280,33 +283,6 @@ pub(super) async fn download_index_part( } }; - /// Given the key of an index, parse out the generation part of the name - fn parse_generation(path: RemotePath) -> Option { - let file_name = match path.get_path().file_name() { - Some(f) => f, - None => { - // Unexpected: we should be seeing index_part.json paths only - tracing::warn!("Malformed index key {0}", path); - return None; - } - }; - - let file_name_str = match file_name.to_str() { - Some(s) => s, - None => { - tracing::warn!("Malformed index key {0:?}", path); - return None; - } - }; - - match file_name_str.split_once('-') { - Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16) - .map(Generation::new) - .ok(), - None => None, - } - } - // Fallback: we did not find an index_part.json from the previous generation, so // we will list all the index_part objects and pick the most recent. let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); @@ -326,7 +302,7 @@ pub(super) async fn download_index_part( let mut generations: Vec<_> = indices .into_iter() - .filter_map(parse_generation) + .filter_map(parse_remote_index_path) .filter(|g| g <= &my_generation) .collect(); From a9834854a001385bd73746bfc1e7e5c6fc820c3f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 13:32:43 +0100 Subject: [PATCH 24/44] More efficient .max() --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 544d474d61ef..0014aa89ed80 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -300,14 +300,13 @@ pub(super) async fn download_index_part( .await .map_err(DownloadError::Other)?; - let mut generations: Vec<_> = indices + let max_previous_generation = indices .into_iter() .filter_map(parse_remote_index_path) .filter(|g| g <= &my_generation) - .collect(); + .max(); - generations.sort(); - match generations.last() { + match max_previous_generation { Some(g) => { tracing::debug!( "Found index_part in generation {g:?} (my generation {my_generation:?})" From c1ca7737a099f3f657626218145de8dacff8bb2d Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 13:56:22 +0100 Subject: [PATCH 25/44] Tighten up download_index_part Various comments, and properly cover the case of loading a stale generation where an index already exists in our own generation. --- .../src/tenant/remote_timeline_client.rs | 12 +++++ .../tenant/remote_timeline_client/download.rs | 46 +++++++++++++++---- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 65d88be12bd3..64e4829d4cb8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1974,6 +1974,18 @@ mod tests { let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await; assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; + // If a directly previous generation exists, _and_ an index exists in my own + // generation, I should prefer my own generation. + let _injected_prev = + inject_index_part(&test_state, Generation::new(generation_n - 1)).await; + let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await; + assert_got_index_part( + &test_state, + Generation::new(generation_n), + &injected_current, + ) + .await; + Ok(()) } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 0014aa89ed80..dc7f8dab9abc 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -255,6 +255,11 @@ async fn do_download_index_part( Ok(index_part) } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. +/// +/// In this function we probe for the most recent index in a generation <= our current generation. +/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md pub(super) async fn download_index_part( storage: &GenericRemoteStorage, tenant_id: &TenantId, @@ -266,9 +271,33 @@ pub(super) async fn download_index_part( return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; } + // Stale case: If we were intentionally attached in a stale generation, there may already be a remote + // index in our generation. + // + // This is an optimization to avoid doing the listing for the general case below. + let r_current = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + match r_current { + Ok(index_part) => { + tracing::debug!("Found index_part from current generation (this is a stale attachment) {my_generation:?}"); + return Ok(index_part); + } + Err(e) => { + if !matches!(e, DownloadError::NotFound) { + return Err(e); + } + } + }; + + // Typical case: the previous generation of this tenant was running healthily, and had uploaded + // and index part. We may safely start from this index without doing a listing, because: + // - We checked for current generation case above + // - generations > my_generation are to be ignored + // - any other indices that exist would have an older generation than `previous_gen`, and + // we want to find the most recent index from a previous generation. + // + // This is an optimization to avoid doing the listing for the general case below. let previous_gen = my_generation.previous(); let r_previous = do_download_index_part(storage, tenant_id, timeline_id, previous_gen).await; - match r_previous { Ok(index_part) => { tracing::debug!("Found index_part from previous generation {previous_gen:?}"); @@ -283,8 +312,8 @@ pub(super) async fn download_index_part( } }; - // Fallback: we did not find an index_part.json from the previous generation, so - // we will list all the index_part objects and pick the most recent. + // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json + // objects, and select the highest one with a generation < my_generation. let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); let indices = backoff::retry( || async { storage.list_files(Some(&index_prefix)).await }, @@ -300,6 +329,8 @@ pub(super) async fn download_index_part( .await .map_err(DownloadError::Other)?; + // General case logic for which index to use: the latest index whose generation + // is <= our own. See "Finding the rmeote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() .filter_map(parse_remote_index_path) @@ -311,14 +342,13 @@ pub(super) async fn download_index_part( tracing::debug!( "Found index_part in generation {g:?} (my generation {my_generation:?})" ); - do_download_index_part(storage, tenant_id, timeline_id, *g).await + do_download_index_part(storage, tenant_id, timeline_id, g).await } None => { - // This is not an error: the timeline may be newly created, or we may be - // upgrading and have no historical index_part with a generation suffix. - // Fall back to trying to load the un-suffixed index_part.json. + // Migration from legacy pre-generation state: we have a generation but no prior + // attached pageservers did. Try to load from a no-generation path. tracing::info!( - "No index_part.json-* found when loading {tenant_id}/{timeline_id} in generation {my_generation:?}" + "No index_part.json* found when loading {tenant_id}/{timeline_id} in generation {my_generation:?}" ); do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await } From 9dc2f72f7bb1e06148ec207b3e10d80e79842252 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 13:58:20 +0100 Subject: [PATCH 26/44] remove unused constructor arg --- test_runner/fixtures/neon_fixtures.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9de7e90f506d..276e8aa8cfdc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -428,7 +428,6 @@ def __init__( preserve_database_files: bool = False, initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, - enable_generations: bool = False, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override From bb278aa8860c19f6d573f6d4c7e54150efd75811 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 14:03:24 +0100 Subject: [PATCH 27/44] Apply suggestions from code review Co-authored-by: Christian Schwarz --- pageserver/src/tenant/mgr.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 1 + test_runner/regress/test_remote_storage.py | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 1964391b0506..bbabf6b799d1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -79,7 +79,7 @@ pub async fn init_tenant_mgr( let mut tenants = HashMap::new(); - // If we are configured to use the control plane API, then it is the source of truth for what to attach + // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { Some(client.re_attach().await?) } else { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 276e8aa8cfdc..2a1b630f379a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1509,6 +1509,7 @@ def __init__(self, env: NeonEnv): self.env = env def start(self): + assert not self.running self.env.neon_cli.attachment_service_start() self.running = True return self diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 8d1caae30ad3..c7044b8feed9 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -157,7 +157,6 @@ def test_remote_storage_backup_and_restore( # background task to load the tenant. In that background task, # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. - # client.tenant_attach(tenant_id) env.pageserver.tenant_attach(tenant_id) tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) From fb3632947a1485b7b5971e6c125a3c8497cbe667 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 14:15:15 +0100 Subject: [PATCH 28/44] Handle control plane URLs without a trailing slash --- pageserver/src/control_plane_client.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 54788a1f1fa9..bfc0e0beda93 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -31,17 +31,16 @@ impl ControlPlaneClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option { - let url = match conf.control_plane_api.as_ref() { + let mut url = match conf.control_plane_api.as_ref() { Some(u) => u.clone(), None => return None, }; - // FIXME: it's awkward that join() requires the base to have a trailing slash, makes - // it easy to get a config wrong - assert!( - url.as_str().ends_with('/'), - "control plane API needs trailing slash" - ); + if let Ok(mut segs) = url.path_segments_mut() { + // This ensures that `url` ends with a slash if it doesn't already. + // That way, we can subsequently use join() to safely attach extra path elements. + segs.pop_if_empty().push(""); + } let client = reqwest::ClientBuilder::new() .build() From 2553e488c43c6b333881ccf36b5acde4d318ba1f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 15:14:30 +0100 Subject: [PATCH 29/44] Fix python lint complaint --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2a1b630f379a..d0d697fe59b3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1507,6 +1507,7 @@ class ComputeCtl(AbstractNeonCli): class NeonAttachmentService: def __init__(self, env: NeonEnv): self.env = env + self.running = False def start(self): assert not self.running From 2af98aa93befee21e4620e15e7990fb143ca822f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 10:38:31 +0100 Subject: [PATCH 30/44] fixup merge --- pageserver/src/tenant.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 61 +------------------ 2 files changed, 4 insertions(+), 59 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 855301cd1d1b..d402bf1671c9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3367,7 +3367,7 @@ pub mod harness { pub tenant_conf: TenantConf, pub tenant_id: TenantId, pub generation: Generation, - remote_storage: GenericRemoteStorage, + pub remote_storage: GenericRemoteStorage, pub remote_fs_dir: PathBuf, } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1e6e1f7ea563..02cf8727558e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1550,12 +1550,6 @@ mod tests { tenant: Arc, timeline: Arc, tenant_ctx: RequestContext, -<<<<<<< HEAD - remote_fs_dir: PathBuf, - client: Arc, - storage: GenericRemoteStorage, -======= ->>>>>>> upstream/main } impl TestSetup { @@ -1569,52 +1563,11 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; -<<<<<<< HEAD - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - - let generation = Generation::new(0xdeadbeef); - - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); - - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, - runtime: tokio::runtime::Handle::current(), - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - generation, - storage_impl: storage.clone(), - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); - -======= ->>>>>>> upstream/main Ok(Self { harness, tenant, timeline, tenant_ctx: ctx, -<<<<<<< HEAD - remote_fs_dir, - client, - storage, }) } @@ -1626,14 +1579,12 @@ mod tests { tenant_id: self.harness.tenant_id, timeline_id: TIMELINE_ID, generation, - storage_impl: self.storage.clone(), + storage_impl: self.harness.remote_storage.clone(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( &self.harness.tenant_id, &TIMELINE_ID, )), -======= ->>>>>>> upstream/main }) } } @@ -1660,12 +1611,6 @@ mod tests { tenant: _tenant, timeline, tenant_ctx: _tenant_ctx, -<<<<<<< HEAD - remote_fs_dir, - client, - .. -======= ->>>>>>> upstream/main } = TestSetup::new("upload_scheduling").await.unwrap(); let client = timeline.remote_client.as_ref().unwrap(); @@ -1927,7 +1872,7 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let remote_timeline_dir = test_state.remote_fs_dir.join( + let remote_timeline_dir = test_state.harness.remote_fs_dir.join( timeline_path .strip_prefix(&test_state.harness.conf.workdir) .unwrap(), @@ -1935,7 +1880,7 @@ mod tests { std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.remote_fs_dir.join( + let index_path = test_state.harness.remote_fs_dir.join( remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(), ); eprintln!("Writing {}", index_path.display()); From a88fa7d0f7dcb93f5eef43505b859aceaf8fe71f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 10:38:37 +0100 Subject: [PATCH 31/44] Use safe tenant dir deletion in /re-attach handling --- pageserver/src/tenant/mgr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 600937a3718b..5c2b52b065f6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -183,7 +183,7 @@ pub async fn init_tenant_mgr( *gen } else { info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id); - if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { + if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { error!( "Failed to remove detached tenant directory '{}': {:?}", tenant_dir_path.display(), From 85c737e2e82d817b4480a74ebf4a753af6abf86b Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 09:22:53 +0100 Subject: [PATCH 32/44] Update pageserver/src/tenant/remote_timeline_client/download.rs Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index dc7f8dab9abc..9a1f8a7429a4 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -281,11 +281,8 @@ pub(super) async fn download_index_part( tracing::debug!("Found index_part from current generation (this is a stale attachment) {my_generation:?}"); return Ok(index_part); } - Err(e) => { - if !matches!(e, DownloadError::NotFound) { - return Err(e); - } - } + Err(DownloadError::NotFound) => {} + Err(e) => return Err(e), }; // Typical case: the previous generation of this tenant was running healthily, and had uploaded From 03c5ab05ace7381371754d9e1800dd4a97fa0815 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 09:39:51 +0100 Subject: [PATCH 33/44] Apply suggestions from code review Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/control_api.rs | 10 +++++----- pageserver/src/control_plane_client.rs | 2 +- pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/remote_timeline_client.rs | 4 ++-- .../src/tenant/remote_timeline_client/download.rs | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 45d78aa05786..79bc7f3b70b7 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -1,11 +1,11 @@ +//! Types in this file are for pageserver's upward-facing API calls to the control plane, +//! required for acquiring and validating tenant generation numbers. +//! +//! See docs/rfcs/025-generation-numbers.md + use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -/// Types in this file are for pageserver's upward-facing API calls to the control plane, -/// required for acquiring and validating tenant generation numbers. -/// -/// See docs/rfcs/025-generation-numbers.md - #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index bfc0e0beda93..192eb167894b 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -89,7 +89,7 @@ impl ControlPlaneClient { match result { Ok(res) => { tracing::info!( - "Received re-attach response with {0} tenants", + "Received re-attach response with {} tenants", res.tenants.len() ); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 5c2b52b065f6..f2fe208b4dff 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -196,7 +196,7 @@ pub async fn init_tenant_mgr( // Legacy mode: no generation information, any tenant present // on local disk may activate info!( - "Starting tenant {0} in legacy mode, no generation", + "Starting tenant {} in legacy mode, no generation", tenant_dir_path.display() ); Generation::none() diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 02cf8727558e..d2c0fd6e5754 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1436,7 +1436,7 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { Some(f) => f, None => { // Unexpected: we should be seeing index_part.json paths only - tracing::warn!("Malformed index key {0}", path); + tracing::warn!("Malformed index key {}", path); return None; } }; @@ -1444,7 +1444,7 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { let file_name_str = match file_name.to_str() { Some(s) => s, None => { - tracing::warn!("Malformed index key {0:?}", path); + tracing::warn!("Malformed index key {:?}", path); return None; } }; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 9a1f8a7429a4..fe654ea11134 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -327,7 +327,7 @@ pub(super) async fn download_index_part( .map_err(DownloadError::Other)?; // General case logic for which index to use: the latest index whose generation - // is <= our own. See "Finding the rmeote indices for timelines" in docs/rfcs/025-generation-numbers.md + // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md let max_previous_generation = indices .into_iter() .filter_map(parse_remote_index_path) From 382a9303e266f4d6e0eb9ad576f3a3cfefcf0e89 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 09:37:09 +0100 Subject: [PATCH 34/44] Cleanup of attachment_service binary --- control_plane/src/bin/attachment_service.rs | 28 +++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index d1f99210ac28..8ca5154aaed3 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -36,9 +36,11 @@ use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse}; #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] struct Cli { + /// Host and port to listen on, like `127.0.0.1:1234` #[arg(short, long)] - listen: String, + listen: std::net::SocketAddr, + /// Path to the .json file to store state (will be created if it doesn't exist) #[arg(short, long)] path: PathBuf, } @@ -109,17 +111,24 @@ impl PersistentState { async fn load_or_new(path: &Path) -> Self { match Self::load(path).await { - Ok(s) => s, - Err(e) => { - tracing::info!( - "Creating new state file at {0} (load returned {e})", - path.to_string_lossy() - ); + Ok(s) => { + tracing::info!("Loaded state file at {0}", path.display()); + s + } + Err(e) + if e.downcast_ref::() + .map(|e| e.kind() == std::io::ErrorKind::NotFound) + .unwrap_or(false) => + { + tracing::info!("Will create state file at {0}", path.display()); Self { tenants: HashMap::new(), path: path.to_owned(), } } + Err(e) => { + panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display()) + } } } } @@ -176,8 +185,7 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE async fn handle_validate(mut req: Request) -> Result, ApiError> { let validate_req = json_request::(&mut req).await?; - let state = get_state(&req).inner.clone(); - let locked = state.read().await; + let locked = get_state(&req).inner.read().await; let mut response = ValidateResponse { tenants: Vec::new(), @@ -258,7 +266,7 @@ async fn main() -> anyhow::Result<()> { let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)?.serve(service); - tracing::info!("Serving on {0}", args.listen.as_str()); + tracing::info!("Serving on {0}", args.listen); server.await?; Ok(()) From 008545c5b10f7118fffecdffa51126263a32c7f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:04:48 +0100 Subject: [PATCH 35/44] improve logging in index part download --- .../src/tenant/remote_timeline_client.rs | 15 +++++++++ .../tenant/remote_timeline_client/download.rs | 31 ++++++++++--------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index d2c0fd6e5754..65a073485903 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1587,6 +1587,16 @@ mod tests { )), }) } + + /// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id + /// and timeline_id are present. + fn span(&self) -> tracing::Span { + tracing::info_span!( + "test", + tenant_id = %self.harness.tenant_id, + timeline_id = %TIMELINE_ID + ) + } } // Test scheduling @@ -1913,6 +1923,8 @@ mod tests { #[tokio::test] async fn index_part_download_simple() -> anyhow::Result<()> { let test_state = TestSetup::new("index_part_download_simple").await.unwrap(); + let span = test_state.span(); + let _guard = span.enter(); // Simple case: we are in generation N, load the index from generation N - 1 let generation_n = 5; @@ -1929,6 +1941,9 @@ mod tests { .await .unwrap(); + let span = test_state.span(); + let _guard = span.enter(); + // A generation-less IndexPart exists in the bucket, we should find it let generation_n = 5; let injected_none = inject_index_part(&test_state, Generation::none()).await; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index fe654ea11134..fef8a2b4916d 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -260,12 +260,15 @@ async fn do_download_index_part( /// /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +#[tracing::instrument(skip_all, fields(generation=?my_generation))] pub(super) async fn download_index_part( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, my_generation: Generation, ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + if my_generation.is_none() { // Operating without generations: just fetch the generation-less path return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; @@ -275,10 +278,12 @@ pub(super) async fn download_index_part( // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let r_current = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; - match r_current { + let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + match res { Ok(index_part) => { - tracing::debug!("Found index_part from current generation (this is a stale attachment) {my_generation:?}"); + tracing::debug!( + "Found index_part from current generation (this is a stale attachment)" + ); return Ok(index_part); } Err(DownloadError::NotFound) => {} @@ -293,16 +298,18 @@ pub(super) async fn download_index_part( // we want to find the most recent index from a previous generation. // // This is an optimization to avoid doing the listing for the general case below. - let previous_gen = my_generation.previous(); - let r_previous = do_download_index_part(storage, tenant_id, timeline_id, previous_gen).await; - match r_previous { + let res = + do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await; + match res { Ok(index_part) => { - tracing::debug!("Found index_part from previous generation {previous_gen:?}"); + tracing::debug!("Found index_part from previous generation"); return Ok(index_part); } Err(e) => { if matches!(e, DownloadError::NotFound) { - tracing::debug!("No index_part found from previous generation {previous_gen:?}, falling back to listing"); + tracing::debug!( + "No index_part found from previous generation, falling back to listing" + ); } else { return Err(e); } @@ -336,17 +343,13 @@ pub(super) async fn download_index_part( match max_previous_generation { Some(g) => { - tracing::debug!( - "Found index_part in generation {g:?} (my generation {my_generation:?})" - ); + tracing::debug!("Found index_part in generation {g:?}"); do_download_index_part(storage, tenant_id, timeline_id, g).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. - tracing::info!( - "No index_part.json* found when loading {tenant_id}/{timeline_id} in generation {my_generation:?}" - ); + tracing::info!("No index_part.json* found"); do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await } } From 864fb7407e8507ce089aab08fd9ded7dc9574f76 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:13:57 +0100 Subject: [PATCH 36/44] libs: flatten a match{} --- libs/remote_storage/src/local_fs.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 9aef0cb66bcc..5040183045af 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -155,18 +155,20 @@ impl RemoteStorage for LocalFs { // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); match fs::metadata(full_path.clone()).await { - Err(e) => { - // It's not a file that exists: strip the prefix back to the parent directory - if matches!(e.kind(), ErrorKind::NotFound) { - initial_dir.pop(); - } - } Ok(meta) => { if !meta.is_dir() { // It's not a directory: strip back to the parent initial_dir.pop(); } } + Err(e) if e.kind() == ErrorKind::NotFound => { + // It's not a file that exists: strip the prefix back to the parent directory + initial_dir.pop(); + } + Err(e) => { + // Unexpected I/O error + anyhow::bail!(e) + } } // Note that PathBuf starts_with only considers full path segments, but From 32548c7e56a21ede38cb3fe52bb0404b8c6a62e2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:17:11 +0100 Subject: [PATCH 37/44] flatten a match{} --- .../src/tenant/remote_timeline_client/download.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index fef8a2b4916d..1dd701a1adc0 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -305,16 +305,15 @@ pub(super) async fn download_index_part( tracing::debug!("Found index_part from previous generation"); return Ok(index_part); } + Err(DownloadError::NotFound) => { + tracing::debug!( + "No index_part found from previous generation, falling back to listing" + ); + } Err(e) => { - if matches!(e, DownloadError::NotFound) { - tracing::debug!( - "No index_part found from previous generation, falling back to listing" - ); - } else { - return Err(e); - } + return Err(e); } - }; + } // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json // objects, and select the highest one with a generation < my_generation. From bc32ec616304388b149f156a46e7b204d9cde55d Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:25:12 +0100 Subject: [PATCH 38/44] Use DisplayFromStr instead of hex --- Cargo.lock | 1 - control_plane/src/attachment_service.rs | 4 +++- libs/pageserver_api/Cargo.toml | 1 - libs/pageserver_api/src/control_api.rs | 10 +++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b78b6988da2..e32ea4006a0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2756,7 +2756,6 @@ dependencies = [ "bytes", "const_format", "enum-map", - "hex", "postgres_ffi", "serde", "serde_json", diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index a41c49e9585d..9a6a79844cd7 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -1,6 +1,7 @@ use crate::{background_process, local_env::LocalEnv}; use anyhow::anyhow; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use std::{path::PathBuf, process::Child}; use utils::id::{NodeId, TenantId}; @@ -12,9 +13,10 @@ pub struct AttachmentService { const COMMAND: &str = "attachment_service"; +#[serde_as] #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, pub pageserver_id: Option, } diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 7fdd46eede23..f97ec54e9145 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -12,7 +12,6 @@ const_format.workspace = true anyhow.workspace = true bytes.workspace = true byteorder.workspace = true -hex.workspace = true utils.workspace = true postgres_ffi.workspace = true enum-map.workspace = true diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 79bc7f3b70b7..a54fee47a5c6 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -4,6 +4,7 @@ //! See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; use utils::id::{NodeId, TenantId}; #[derive(Serialize, Deserialize)] @@ -11,9 +12,10 @@ pub struct ReAttachRequest { pub node_id: NodeId, } +#[serde_as] #[derive(Serialize, Deserialize)] pub struct ReAttachResponseTenant { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub generation: u32, } @@ -23,9 +25,10 @@ pub struct ReAttachResponse { pub tenants: Vec, } +#[serde_as] #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub gen: u32, } @@ -40,9 +43,10 @@ pub struct ValidateResponse { pub tenants: Vec, } +#[serde_as] #[derive(Serialize, Deserialize)] pub struct ValidateResponseTenant { - #[serde(with = "hex")] + #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub valid: bool, } From 646785397d153954990b9912a2866114e3922e94 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:35:08 +0100 Subject: [PATCH 39/44] Add track_caller to functions that can panic --- libs/utils/src/generation.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 1408c8ca703c..163c8c0467f7 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -53,6 +53,7 @@ impl Generation { matches!(self, Self::None) } + #[track_caller] pub fn get_suffix(&self) -> String { match self { Self::Valid(v) => { @@ -72,6 +73,7 @@ impl Generation { u32::from_str_radix(suffix, 16).map(Generation::new).ok() } + #[track_caller] pub fn previous(&self) -> Generation { match self { Self::Valid(n) => { From 386542ea6611c36f51a187ede28b249a62a428bd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:35:37 +0100 Subject: [PATCH 40/44] clippy --- control_plane/src/bin/attachment_service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index 8ca5154aaed3..ec800087a226 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -259,7 +259,7 @@ async fn main() -> anyhow::Result<()> { let persistent_state = PersistentState::load_or_new(&args.path).await; - let http_listener = tcp_listener::bind(&args.listen)?; + let http_listener = tcp_listener::bind(args.listen)?; let router = make_router(persistent_state) .build() .map_err(|err| anyhow!(err))?; From 88374796b5464c7dc0925b871b5eb1de66e8a630 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 11:51:57 +0100 Subject: [PATCH 41/44] Add a log span to a unit test --- pageserver/src/tenant/remote_timeline_client.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 65a073485903..3dcd95ed5f50 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1616,12 +1616,16 @@ mod tests { // Schedule another deletion. Check that it's launched immediately. // Schedule index upload. Check that it's queued + let test_setup = TestSetup::new("upload_scheduling").await.unwrap(); + let span = test_setup.span(); + let _guard = span.enter(); + let TestSetup { harness, tenant: _tenant, timeline, tenant_ctx: _tenant_ctx, - } = TestSetup::new("upload_scheduling").await.unwrap(); + } = test_setup; let client = timeline.remote_client.as_ref().unwrap(); From 606a392f04d7cfe9849fa16e4cb6b691da597194 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 5 Sep 2023 14:13:56 +0100 Subject: [PATCH 42/44] pageserver: fix layer reconciliation with generations This was wrongly assuming generations should be the same: the local metadata will actually always have the current generation set, and in this situation we want to UseLocal if the sizes match, but take the metadata (generation) from the remote metadata. --- pageserver/src/tenant/timeline.rs | 17 ++++++++++++----- pageserver/src/tenant/timeline/init.rs | 6 +----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d7ff70a29ce1..7b6c6dbfadce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1724,11 +1724,18 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); - - UseRemote { local, remote } + // Remote is authoritative, but we may still choose to retain + // the local file if the contents appear to match + if local.file_size() == remote.file_size() { + // Use the local file, but take the remote metadata so that we pick up + // the correct generation. + UseLocal(remote) + } else { + path.push(name.file_name()); + init::cleanup_local_file_for_remote(&path, &local, &remote)?; + path.pop(); + UseRemote { local, remote } + } } Ok(decision) => decision, Err(FutureLayer { local }) => { diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 33effb431878..22976a514df7 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -147,11 +147,7 @@ pub(super) fn reconcile( Err(FutureLayer { local }) } else { Ok(match (local, remote) { - (Some(local), Some(remote)) if local != remote => { - assert_eq!(local.generation, remote.generation); - - UseRemote { local, remote } - } + (Some(local), Some(remote)) if local != remote => UseRemote { local, remote }, (Some(x), Some(_)) => UseLocal(x), (None, Some(x)) => Evicted(x), (Some(x), None) => NeedsUpload(x), From db3dc069161213ab2f75d3126832b8e100f15354 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 13:42:58 +0100 Subject: [PATCH 43/44] Apply suggestions from code review Co-authored-by: Joonas Koivunen --- control_plane/src/attachment_service.rs | 2 +- control_plane/src/bin/attachment_service.rs | 4 ++-- pageserver/src/tenant/mgr.rs | 4 ++-- pageserver/src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/remote_timeline_client/download.rs | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 9a6a79844cd7..2bd4260aa8ce 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -97,7 +97,7 @@ impl AttachmentService { let response = client.post(url).json(&request).send()?; if response.status() != StatusCode::OK { - return Err(anyhow!("Unexpected status {0}", response.status())); + return Err(anyhow!("Unexpected status {}", response.status())); } let response = response.json::()?; diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index ec800087a226..e879646b637b 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -112,7 +112,7 @@ impl PersistentState { async fn load_or_new(path: &Path) -> Self { match Self::load(path).await { Ok(s) => { - tracing::info!("Loaded state file at {0}", path.display()); + tracing::info!("Loaded state file at {}", path.display()); s } Err(e) @@ -120,7 +120,7 @@ impl PersistentState { .map(|e| e.kind() == std::io::ErrorKind::NotFound) .unwrap_or(false) => { - tracing::info!("Will create state file at {0}", path.display()); + tracing::info!("Will create state file at {}", path.display()); Self { tenants: HashMap::new(), path: path.to_owned(), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9548e0885e32..b92c1f69c3ff 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -169,7 +169,7 @@ pub async fn init_tenant_mgr( Ok(id) => id, Err(_) => { warn!( - "Invalid tenant path (garbage in our repo directory?): {0}", + "Invalid tenant path (garbage in our repo directory?): {}", tenant_dir_path.display() ); continue; @@ -182,7 +182,7 @@ pub async fn init_tenant_mgr( if let Some(gen) = generations.get(&tenant_id) { *gen } else { - info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id); + info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response"); if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { error!( "Failed to remove detached tenant directory '{}': {:?}", diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3dcd95ed5f50..6f42b54ac2ae 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1431,7 +1431,7 @@ pub fn remote_index_path( } /// Given the key of an index, parse out the generation part of the name -pub fn parse_remote_index_path(path: RemotePath) -> Option { +pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option { let file_name = match path.get_path().file_name() { Some(f) => f, None => { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 1dd701a1adc0..986321552906 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -316,7 +316,7 @@ pub(super) async fn download_index_part( } // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json - // objects, and select the highest one with a generation < my_generation. + // objects, and select the highest one with a generation <= my_generation. let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); let indices = backoff::retry( || async { storage.list_files(Some(&index_prefix)).await }, From 562e47ee7c05184538e113d47bce945f1a99588d Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 13:45:59 +0100 Subject: [PATCH 44/44] unwrap_or_else --- control_plane/src/bin/neon_local.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 2976363fa7fd..6b49b92cfa8f 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -354,7 +354,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .unwrap_or_default(); // If tenant ID was not specified, generate one - let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate()); + let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); let generation = if env.pageserver.control_plane_api.is_some() { // We must register the tenant with the attachment service, so