diff --git a/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md b/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md new file mode 100644 index 0000000000..850ce5b86e --- /dev/null +++ b/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md @@ -0,0 +1,3 @@ +- Added a recovery mechanism to automatically retry or drop tx upon account + sequence mismatch errors ([#1264](https://github.com/informalsystems/ibc- + rs/issues/1264)) \ No newline at end of file diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 12062b4145..49e735f6e8 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -16,7 +16,7 @@ use chrono::DateTime; use itertools::Itertools; use prost::Message; use prost_types::Any; -use tendermint::abci::{Event, Path as TendermintABCIPath}; +use tendermint::abci::{Code, Event, Path as TendermintABCIPath}; use tendermint::account::Id as AccountId; use tendermint::block::Height; use tendermint::consensus::Params; @@ -87,6 +87,7 @@ use crate::keyring::{KeyEntry, KeyRing}; use crate::light_client::tendermint::LightClient as TmLightClient; use crate::light_client::LightClient; use crate::light_client::Verified; +use crate::util::retry::{retry_with_index, RetryResult}; use crate::{ chain::handle::requests::AppVersion, chain::QueryResponse, chain::StatusResponse, event::monitor::TxMonitorCmd, @@ -114,10 +115,22 @@ const DEFAULT_FEE_GRANTER: &str = ""; /// fraction of the maximum block size defined in the Tendermint core consensus parameters. pub const GENESIS_MAX_BYTES_MAX_FRACTION: f64 = 0.9; +// The error "incorrect account sequence" is defined as the unique error code 32 in cosmos-sdk: +// https://github.com/cosmos/cosmos-sdk/blob/v0.44.0/types/errors/errors.go#L115-L117 +pub const INCORRECT_ACCOUNT_SEQUENCE_ERR: u32 = 32; + mod retry_strategy { use crate::util::retry::Fixed; use core::time::Duration; + // Maximum number of retries for send_tx in the case of + // an account sequence mismatch. + pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; + + // Backoff multiplier to apply while retrying in the case + // of account sequence mismatch. + pub const BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY: u64 = 200; + pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator { let backoff_millis = 300; // The periodic backoff let count: usize = (max_total_wait.as_millis() / backoff_millis as u128) as usize; @@ -284,11 +297,11 @@ impl CosmosSdkChain { self.rt.block_on(f) } - fn send_tx(&mut self, proto_msgs: Vec) -> Result { - crate::time!("send_tx"); - - let account_seq = self.account_sequence()?; - + fn send_tx_with_account_sequence( + &mut self, + proto_msgs: Vec, + account_seq: u64, + ) -> Result { debug!( "[{}] send_tx: sending {} messages using account sequence {}", self.id(), @@ -316,6 +329,7 @@ impl CosmosSdkChain { signatures: vec![signed_doc], }; + // This may result in an account sequence mismatch error let estimated_gas = self.estimate_gas(simulate_tx)?; if estimated_gas > self.max_gas() { @@ -352,31 +366,110 @@ impl CosmosSdkChain { prost::Message::encode(&tx_raw, &mut tx_bytes) .map_err(|e| Error::protobuf_encode(String::from("Transaction"), e))?; - let response = self.block_on(broadcast_tx_sync( + self.block_on(broadcast_tx_sync( self.rpc_client(), &self.config.rpc_addr, tx_bytes, - ))?; + )) + } - match response.code { - tendermint::abci::Code::Ok => { - // A success means the account s.n. was increased - self.incr_account_sequence()?; - debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response); + /// Try to `send_tx` with retry on account sequence error. + /// An account sequence error can occur if the account sequence that + /// the relayer caches becomes outdated. This may happen if the relayer + /// wallet is used concurrently elsewhere, or when tx fees are mis-configured + /// leading to transactions hanging in the mempool. + /// + /// Account sequence mismatch error can occur at two separate steps: + /// 1. as Err variant, propagated from the `estimate_gas` step. + /// 2. as an Ok variant, with an Code::Err response, propagated from + /// the `broadcast_tx_sync` step. + /// + /// We treat both cases by re-fetching the account sequence number + /// from the full node. + /// Upon case #1, we do not retry submitting the same tx (retry happens + /// nonetheless at the worker `step` level). Upon case #2, we retry + /// submitting the same transaction. + fn send_tx_with_account_sequence_retry( + &mut self, + proto_msgs: Vec, + retry_counter: u32, + ) -> Result { + let account_sequence = self.account_sequence()?; + + match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) { + // Gas estimation failed with acct. s.n. mismatch. + // This indicates that the full node did not yet push the previous tx out of its + // mempool. Possible explanations: fees too low, network congested, or full node + // congested. Whichever the case, it is more expedient in production to drop the tx + // and refresh the s.n., to allow proceeding to the other transactions. + Err(e) if mismatching_account_sequence_number(&e) => { + warn!("send_tx failed at estimate_gas step mismatching account sequence: dropping the tx & refreshing account sequence number"); + self.refresh_account()?; + // Note: propagating error here can lead to bug: + // https://github.com/informalsystems/ibc-rs/issues/1153 + // But periodic packet clearing will catch any dropped packets. + Err(e) } - tendermint::abci::Code::Err(code) => { - // Avoid increasing the account s.n. if CheckTx failed - // Log the error - error!( - "[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}", - self.id(), - response, - sdk_error_from_tx_sync_error_code(code) - ); + + // Gas estimation succeeded. Broadcasting failed with a retry-able error. + Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { + if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { + let retry_counter = retry_counter + 1; + warn!("send_tx failed at broadcast step with incorrect account sequence. retrying ({}/{})", + retry_counter, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); + // Backoff & re-fetch the account s.n. + let backoff = (retry_counter as u64) + * retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY; + thread::sleep(Duration::from_millis(backoff)); + self.refresh_account()?; + + // Now retry. + self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) + } else { + // If after the max retry we still get an account sequence mismatch error, + // we ignore the error and return the original response to downstream. + // We do not return an error here, because the current convention + // let the caller handle error responses separately. + error!("failed to send_tx due to account sequence errors. the relayer wallet may be used elsewhere concurrently."); + Ok(response) + } + } + + // Catch-all arm for the Ok variant. + // This is the case when gas estimation succeeded. + Ok(response) => { + // Complete success. + match response.code { + tendermint::abci::Code::Ok => { + debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response); + + self.incr_account_sequence(); + Ok(response) + } + // Gas estimation succeeded, but broadcasting failed with unrecoverable error. + tendermint::abci::Code::Err(code) => { + // Avoid increasing the account s.n. if CheckTx failed + // Log the error + error!( + "[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}", + self.id(), + response, + sdk_error_from_tx_sync_error_code(code) + ); + Ok(response) + } + } } + + // Catch-all case for the Err variant. + // Gas estimation failure or other unrecoverable error, propagate. + Err(e) => Err(e), } + } - Ok(response) + fn send_tx(&mut self, proto_msgs: Vec) -> Result { + crate::time!("send_tx"); + self.send_tx_with_account_sequence_retry(proto_msgs, 0) } /// Try to simulate the given tx in order to estimate how much gas will be needed to submit it. @@ -387,7 +480,7 @@ impl CosmosSdkChain { /// /// If the batch is split in two TX-es, the second one will fail the simulation in `deliverTx` check. /// In this case we use the `default_gas` param. - fn estimate_gas(&self, tx: Tx) -> Result { + fn estimate_gas(&mut self, tx: Tx) -> Result { let simulated_gas = self.send_tx_simulate(tx).map(|sr| sr.gas_info); match simulated_gas { @@ -426,11 +519,11 @@ impl CosmosSdkChain { Err(e) => { error!( - "[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}", + "[{}] estimate_gas: failed to simulate tx. propagating error to caller: {}", self.id(), e.detail() ); - + // Propagate the error, the retrying mechanism at caller may catch & retry. Err(e) } } @@ -600,21 +693,27 @@ impl CosmosSdkChain { Ok((key, key_bytes)) } - fn account(&mut self) -> Result<&mut BaseAccount, Error> { + fn refresh_account(&mut self) -> Result<(), Error> { + let account = self.block_on(query_account(self, self.key()?.account))?; + debug!( + sequence = %account.sequence, + number = %account.account_number, + "[{}] send_tx: retrieved account", + self.id() + ); + + self.account = Some(account); + Ok(()) + } + + fn account(&mut self) -> Result<&BaseAccount, Error> { if self.account == None { - let account = self.block_on(query_account(self, self.key()?.account))?; - debug!( - sequence = %account.sequence, - number = %account.account_number, - "[{}] send_tx: retrieved account", - self.id() - ); - self.account = Some(account); + self.refresh_account()?; } Ok(self .account - .as_mut() + .as_ref() .expect("account was supposedly just cached")) } @@ -626,9 +725,10 @@ impl CosmosSdkChain { Ok(self.account()?.sequence) } - fn incr_account_sequence(&mut self) -> Result<(), Error> { - self.account()?.sequence += 1; - Ok(()) + fn incr_account_sequence(&mut self) { + if let Some(account) = &mut self.account { + account.sequence += 1; + } } fn signer(&self, sequence: u64) -> Result { @@ -712,8 +812,6 @@ impl CosmosSdkChain { &self, mut tx_sync_results: Vec, ) -> Result, Error> { - use crate::util::retry::{retry_with_index, RetryResult}; - let hashes = tx_sync_results .iter() .map(|res| res.response.hash.to_string()) @@ -2414,6 +2512,18 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { } } +/// Determine whether the given error yielded by `tx_simulate` +/// indicates that the current sequence number cached in Hermes +/// may be out-of-sync with the full node's version of the s.n. +fn mismatching_account_sequence_number(e: &Error) -> bool { + use crate::error::ErrorDetail::*; + + match e.detail() { + GrpcStatus(detail) => detail.is_account_sequence_mismatch(), + _ => false, + } +} + struct PrettyFee<'a>(&'a Fee); impl fmt::Display for PrettyFee<'_> { diff --git a/relayer/src/error.rs b/relayer/src/error.rs index 2aa20c2c08..55dfadb6fc 100644 --- a/relayer/src/error.rs +++ b/relayer/src/error.rs @@ -534,11 +534,19 @@ impl GrpcStatusSubdetail { /// Check whether this gRPC error matches /// - status: InvalidArgument - /// - message: account sequence mismatch ... + /// - message: "account sequence mismatch, expected 166791, got 166793: incorrect account sequence: invalid request" + /// + /// # Note: + /// This predicate is tested and validated against errors + /// that appear at the `estimate_gas` step. The error + /// predicate to be used at the `broadcast_tx_sync` step + /// is different & relies on parsing the Response error code. pub fn is_account_sequence_mismatch(&self) -> bool { - if self.status.code() != tonic::Code::InvalidArgument { - return false; - } + // The code changed in SDK 0.44 from `InvalidArgument` to `Unknown`. + // Workaround: Ignore code. We'll match only on the status message. + // if self.status.code() != tonic::Code::InvalidArgument { + // return false; + // } self.status .message()