From 6331c25e6ba40f8fb39a8f37fe0b092654088d77 Mon Sep 17 00:00:00 2001 From: Robert Habermeier Date: Wed, 23 Jun 2021 00:41:54 +0100 Subject: [PATCH] SelectChain implementation for relay chains (#3341) * stubbed SelectRelayChain * disconnected overseer handlers * add is_disconnected * add fallback in case overseer is disconnected * fall back on fallback * fetch leaves by calling into chain-selection subsystem * implement best_chain * mostly implement finality_target * chain constrain * metrics and maximum safeguard * remove review comment after review --- Cargo.lock | 1 + node/overseer/src/lib.rs | 41 ++- node/service/Cargo.toml | 1 + node/service/src/grandpa_support.rs | 2 +- node/service/src/lib.rs | 1 + node/service/src/relay_chain_selection.rs | 368 ++++++++++++++++++++++ 6 files changed, 408 insertions(+), 6 deletions(-) create mode 100644 node/service/src/relay_chain_selection.rs diff --git a/Cargo.lock b/Cargo.lock index 54d2f12408d4..eeaac3b736a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6687,6 +6687,7 @@ dependencies = [ name = "polkadot-service" version = "0.9.5" dependencies = [ + "async-trait", "beefy-gadget", "beefy-primitives", "env_logger 0.8.4", diff --git a/node/overseer/src/lib.rs b/node/overseer/src/lib.rs index e40ce9e9e849..7e6f831dd70a 100644 --- a/node/overseer/src/lib.rs +++ b/node/overseer/src/lib.rs @@ -419,10 +419,33 @@ enum ExternalRequest { /// [`Overseer`]: struct.Overseer.html #[derive(Clone)] pub struct OverseerHandler { - events_tx: metered::MeteredSender, + events_tx: Option>, } impl OverseerHandler { + /// Create a disconnected overseer handler. + pub fn disconnected() -> Self { + OverseerHandler { + events_tx: None, + } + } + + /// Whether the overseer handler is connected to an overseer. + pub fn is_connected(&self) -> bool { + self.events_tx.is_some() + } + + /// Whether the handler is disconnected. + pub fn is_disconnected(&self) -> bool { + self.events_tx.is_none() + } + + /// Using this handler, connect another handler to the same + /// overseer, if any. + pub fn connect_other(&self, other: &mut OverseerHandler) { + other.events_tx = self.events_tx.clone(); + } + /// Inform the `Overseer` that that some block was imported. pub async fn block_imported(&mut self, block: BlockInfo) { self.send_and_log_error(Event::BlockImported(block)).await @@ -457,8 +480,10 @@ impl OverseerHandler { } async fn send_and_log_error(&mut self, event: Event) { - if self.events_tx.send(event).await.is_err() { - tracing::info!(target: LOG_TARGET, "Failed to send an event to Overseer"); + if let Some(ref mut events_tx) = self.events_tx { + if events_tx.send(event).await.is_err() { + tracing::info!(target: LOG_TARGET, "Failed to send an event to Overseer"); + } } } } @@ -1274,7 +1299,13 @@ where S: SpawnNamed, SupportsParachains: HeadSupportsParachains, { - /// Create a new instance of the `Overseer` with a fixed set of [`Subsystem`]s. + /// Create a new instance of the [`Overseer`] with a fixed set of [`Subsystem`]s. + /// + /// This returns the overseer along with an [`OverseerHandler`] which can + /// be used to send messages from external parts of the codebase. + /// + /// The [`OverseerHandler`] returned from this function is connected to + /// the returned [`Overseer`]. /// /// ```text /// +------------------------------------+ @@ -1393,7 +1424,7 @@ where let (events_tx, events_rx) = metered::channel(CHANNEL_CAPACITY); let handler = OverseerHandler { - events_tx: events_tx.clone(), + events_tx: Some(events_tx.clone()), }; let metrics = ::register(prometheus_registry)?; diff --git a/node/service/Cargo.toml b/node/service/Cargo.toml index 5449c2cf8da2..81f5da22364e 100644 --- a/node/service/Cargo.toml +++ b/node/service/Cargo.toml @@ -67,6 +67,7 @@ serde = { version = "1.0.123", features = ["derive"] } thiserror = "1.0.23" kvdb = "0.9.0" kvdb-rocksdb = { version = "0.11.1", optional = true } +async-trait = "0.1.42" # Polkadot polkadot-node-core-parachains-inherent = { path = "../core/parachains-inherent" } diff --git a/node/service/src/grandpa_support.rs b/node/service/src/grandpa_support.rs index f2c1145cf3a8..bb45709ebbc7 100644 --- a/node/service/src/grandpa_support.rs +++ b/node/service/src/grandpa_support.rs @@ -196,7 +196,7 @@ impl grandpa::VotingRule for ApprovalCheckingVotingRule /// Returns the block hash of the block at the given `target_number` by walking /// backwards from the given `current_header`. -fn walk_backwards_to_target_block( +pub(super) fn walk_backwards_to_target_block( backend: &B, target_number: NumberFor, current_header: &Block::Header, diff --git a/node/service/src/lib.rs b/node/service/src/lib.rs index 4e0a7f123a2a..c3bdff911d12 100644 --- a/node/service/src/lib.rs +++ b/node/service/src/lib.rs @@ -21,6 +21,7 @@ pub mod chain_spec; mod grandpa_support; mod parachains_db; +mod relay_chain_selection; #[cfg(feature = "full-node")] mod overseer; diff --git a/node/service/src/relay_chain_selection.rs b/node/service/src/relay_chain_selection.rs new file mode 100644 index 000000000000..1ca1c5c5f29c --- /dev/null +++ b/node/service/src/relay_chain_selection.rs @@ -0,0 +1,368 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! A [`SelectChain`] implementation designed for relay chains. +//! +//! This uses information about parachains to inform GRANDPA and BABE +//! about blocks which are safe to build on and blocks which are safe to +//! finalize. +//! +//! To learn more about chain-selection rules for Relay Chains, please see the +//! documentation on [chain-selection][chain-selection-guide] +//! in the implementers' guide. +//! +//! This is mostly a wrapper around a subsystem which implements the +//! chain-selection rule, which leaves the code to be very simple. +//! +//! However, this does apply the further finality constraints to the best +//! leaf returned from the chain selection subsystem by calling into other +//! subsystems which yield information about approvals and disputes. +//! +//! [chain-selection-guide]: https://w3f.github.io/parachain-implementers-guide/protocol-chain-selection.html + +#![cfg(feature = "full-node")] + +use { + polkadot_primitives::v1::{ + Hash, BlockNumber, Block as PolkadotBlock, Header as PolkadotHeader, + }, + polkadot_subsystem::messages::{ApprovalVotingMessage, ChainSelectionMessage}, + polkadot_node_subsystem_util::metrics::{self, prometheus}, + polkadot_overseer::OverseerHandler, + futures::channel::oneshot, + consensus_common::{Error as ConsensusError, SelectChain}, + sp_blockchain::HeaderBackend, + sp_runtime::generic::BlockId, + std::sync::Arc, +}; + +/// The maximum amount of unfinalized blocks we are willing to allow due to approval checking +/// or disputes. +/// +/// This is a safety net that should be removed at some point in the future. +const MAX_FINALITY_LAG: polkadot_primitives::v1::BlockNumber = 50; + +const LOG_TARGET: &str = "parachain::chain-selection"; + +/// Prometheus metrics for chain-selection. +#[derive(Debug, Default, Clone)] +pub struct Metrics(Option); + +#[derive(Debug, Clone)] +struct MetricsInner { + approval_checking_finality_lag: prometheus::Gauge, + disputes_finality_lag: prometheus::Gauge, +} + +impl metrics::Metrics for Metrics { + fn try_register(registry: &prometheus::Registry) -> Result { + let metrics = MetricsInner { + approval_checking_finality_lag: prometheus::register( + prometheus::Gauge::with_opts( + prometheus::Opts::new( + "parachain_approval_checking_finality_lag", + "How far behind the head of the chain the Approval Checking protocol wants to vote", + ) + )?, + registry, + )?, + disputes_finality_lag: prometheus::register( + prometheus::Gauge::with_opts( + prometheus::Opts::new( + "parachain_disputes_finality_lag", + "How far behind the head of the chain the Disputes protocol wants to vote", + ) + )?, + registry, + )?, + }; + + Ok(Metrics(Some(metrics))) + } +} + +impl Metrics { + fn note_approval_checking_finality_lag(&self, lag: BlockNumber) { + if let Some(ref metrics) = self.0 { + metrics.approval_checking_finality_lag.set(lag as _); + } + } + + fn note_disputes_finality_lag(&self, lag: BlockNumber) { + if let Some(ref metrics) = self.0 { + metrics.disputes_finality_lag.set(lag as _); + } + } +} + +/// A chain-selection implementation which provides safety for relay chains. +pub struct SelectRelayChain { + backend: Arc, + overseer: OverseerHandler, + // A fallback to use in case the overseer is disconnected. + // + // This is used on relay chains which have not yet enabled + // parachains as well as situations where the node is offline. + fallback: sc_consensus::LongestChain, + metrics: Metrics, +} + +impl SelectRelayChain + where B: sc_client_api::backend::Backend + 'static +{ + /// Create a new [`SelectRelayChain`] wrapping the given chain backend + /// and a handle to the overseer. + #[allow(unused)] + pub fn new(backend: Arc, overseer: OverseerHandler, metrics: Metrics) -> Self { + SelectRelayChain { + fallback: sc_consensus::LongestChain::new(backend.clone()), + backend, + overseer, + metrics, + } + } + + fn block_header(&self, hash: Hash) -> Result { + match self.backend.blockchain().header(BlockId::Hash(hash)) { + Ok(Some(header)) => Ok(header), + Ok(None) => Err(ConsensusError::ChainLookup(format!( + "Missing header with hash {:?}", + hash, + ))), + Err(e) => Err(ConsensusError::ChainLookup(format!( + "Lookup failed for header with hash {:?}: {:?}", + hash, + e, + ))), + } + } + + fn block_number(&self, hash: Hash) -> Result { + match self.backend.blockchain().number(hash) { + Ok(Some(number)) => Ok(number), + Ok(None) => Err(ConsensusError::ChainLookup(format!( + "Missing number with hash {:?}", + hash, + ))), + Err(e) => Err(ConsensusError::ChainLookup(format!( + "Lookup failed for number with hash {:?}: {:?}", + hash, + e, + ))), + } + } +} + +impl SelectRelayChain { + /// Given an overseer handler, this connects the [`SelectRelayChain`]'s + /// internal handler to the same overseer. + #[allow(unused)] + pub fn connect_overseer_handler( + &mut self, + other_handler: &OverseerHandler, + ) { + other_handler.connect_other(&mut self.overseer); + } +} + +impl Clone for SelectRelayChain + where B: sc_client_api::backend::Backend + 'static +{ + fn clone(&self) -> SelectRelayChain { + SelectRelayChain { + backend: self.backend.clone(), + overseer: self.overseer.clone(), + fallback: self.fallback.clone(), + metrics: self.metrics.clone(), + } + } +} + +#[derive(thiserror::Error, Debug)] +enum Error { + // A request to the subsystem was canceled. + #[error("Overseer is disconnected from Chain Selection")] + OverseerDisconnected(oneshot::Canceled), + /// Chain selection returned empty leaves. + #[error("ChainSelection returned no leaves")] + EmptyLeaves, +} + +#[async_trait::async_trait] +impl SelectChain for SelectRelayChain + where B: sc_client_api::backend::Backend + 'static +{ + /// Get all leaves of the chain, i.e. block hashes that are suitable to + /// build upon and have no suitable children. + async fn leaves(&self) -> Result, ConsensusError> { + if self.overseer.is_disconnected() { + return self.fallback.leaves().await + } + + let (tx, rx) = oneshot::channel(); + + self.overseer + .clone() + .send_msg(ChainSelectionMessage::Leaves(tx)).await; + + rx.await + .map_err(Error::OverseerDisconnected) + .map_err(|e| ConsensusError::Other(Box::new(e))) + } + + /// Among all leaves, pick the one which is the best chain to build upon. + async fn best_chain(&self) -> Result { + if self.overseer.is_disconnected() { + return self.fallback.best_chain().await + } + + // The Chain Selection subsystem is supposed to treat the finalized + // block as the best leaf in the case that there are no viable + // leaves, so this should not happen in practice. + let best_leaf = self.leaves() + .await? + .first() + .ok_or_else(|| ConsensusError::Other(Box::new(Error::EmptyLeaves)))? + .clone(); + + + self.block_header(best_leaf) + } + + /// Get the best descendent of `target_hash` that we should attempt to + /// finalize next, if any. It is valid to return the `target_hash` if + /// no better block exists. + /// + /// This will search all leaves to find the best one containing the + /// given target hash, and then constrain to the given block number. + /// + /// It will also constrain the chain to only chains which are fully + /// approved, and chains which contain no disputes. + async fn finality_target( + &self, + target_hash: Hash, + maybe_max_number: Option, + ) -> Result, ConsensusError> { + if self.overseer.is_disconnected() { + return self.fallback.finality_target(target_hash, maybe_max_number).await + } + + let mut overseer = self.overseer.clone(); + + let subchain_head = { + let (tx, rx) = oneshot::channel(); + overseer.send_msg(ChainSelectionMessage::BestLeafContaining(target_hash, tx)).await; + + let best = rx.await + .map_err(Error::OverseerDisconnected) + .map_err(|e| ConsensusError::Other(Box::new(e)))?; + + match best { + // No viable leaves containing the block. + None => return Ok(Some(target_hash)), + Some(best) => best, + } + }; + + let target_number = self.block_number(target_hash)?; + + // 1. Constrain the leaf according to `maybe_max_number`. + let subchain_head = match maybe_max_number { + None => subchain_head, + Some(max) => { + if max <= target_number { + if max < target_number { + tracing::warn!( + LOG_TARGET, + max_number = max, + target_number, + "`finality_target` max number is less than target number", + ); + } + return Ok(Some(target_hash)); + } + // find the current number. + let subchain_header = self.block_header(subchain_head)?; + + if subchain_header.number <= max { + subchain_head + } else { + let (ancestor_hash, _) = crate::grandpa_support::walk_backwards_to_target_block( + self.backend.blockchain(), + max, + &subchain_header, + ).map_err(|e| ConsensusError::ChainLookup(format!("{:?}", e)))?; + + ancestor_hash + } + } + }; + + let initial_leaf = subchain_head; + let initial_leaf_number = self.block_number(initial_leaf)?; + + // 2. Constrain according to `ApprovedAncestor`. + let (subchain_head, subchain_number) = { + + let (tx, rx) = oneshot::channel(); + overseer.send_msg(ApprovalVotingMessage::ApprovedAncestor( + subchain_head, + target_number, + tx, + )).await; + + match rx.await + .map_err(Error::OverseerDisconnected) + .map_err(|e| ConsensusError::Other(Box::new(e)))? + { + // No approved ancestors means target hash is maximal vote. + None => (target_hash, target_number), + Some((s_h, s_n)) => (s_h, s_n), + } + }; + + let lag = initial_leaf_number.saturating_sub(subchain_number); + self.metrics.note_approval_checking_finality_lag(lag); + + // 3. Constrain according to disputes: + // TODO: https://github.com/paritytech/polkadot/issues/3164 + self.metrics.note_disputes_finality_lag(0); + + // 4. Apply the maximum safeguard to the finality lag. + if lag > MAX_FINALITY_LAG { + // We need to constrain our vote as a safety net to + // ensure the network continues to finalize. + let safe_target = initial_leaf_number - MAX_FINALITY_LAG; + + if safe_target <= target_number { + // Minimal vote needs to be on the target number. + Ok(Some(target_hash)) + } else { + // Otherwise we're looking for a descendant. + let initial_leaf_header = self.block_header(initial_leaf)?; + let (forced_target, _) = crate::grandpa_support::walk_backwards_to_target_block( + self.backend.blockchain(), + safe_target, + &initial_leaf_header, + ).map_err(|e| ConsensusError::ChainLookup(format!("{:?}", e)))?; + + Ok(Some(forced_target)) + } + } else { + Ok(Some(subchain_head)) + } + } +}