From 8627f40e029ea3a0360354e7327e168d21fdd693 Mon Sep 17 00:00:00 2001 From: Soares Chen Date: Tue, 14 Sep 2021 11:55:06 +0200 Subject: [PATCH 01/15] Retry send_tx on account sequence mismatch --- relayer/src/chain/cosmos.rs | 114 ++++++++++++++++++++++++++++-------- 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index c5488c1df8..d7a80fa495 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -13,7 +13,7 @@ use bitcoin::hashes::hex::ToHex; use itertools::Itertools; use prost::Message; use prost_types::Any; -use tendermint::abci::Path as TendermintABCIPath; +use tendermint::abci::{Code, Path as TendermintABCIPath}; use tendermint::account::Id as AccountId; use tendermint::block::Height; use tendermint::consensus::Params; @@ -24,7 +24,7 @@ use tendermint_rpc::query::{EventType, Query}; use tendermint_rpc::{endpoint::broadcast::tx_sync::Response, Client, HttpClient, Order}; use tokio::runtime::Runtime as TokioRuntime; use tonic::codegen::http::Uri; -use tracing::{debug, trace, warn}; +use tracing::{debug, error, trace, warn}; use ibc::downcast; use ibc::events::{from_tx_response_event, IbcEvent}; @@ -78,12 +78,17 @@ use crate::keyring::{KeyEntry, KeyRing, Store}; use crate::light_client::tendermint::LightClient as TmLightClient; use crate::light_client::LightClient; use crate::light_client::Verified; +use crate::util::retry::{retry_with_index, RetryResult}; use crate::{chain::QueryResponse, event::monitor::TxMonitorCmd}; use super::ChainEndpoint; mod compatibility; +// Maximum number of retries for send_tx in the case of +// an account sequence mismatch. +const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; + /// Default gas limit when submitting a transaction. const DEFAULT_MAX_GAS: u64 = 300_000; @@ -308,10 +313,11 @@ impl CosmosSdkChain { self.rt.block_on(f) } - fn send_tx(&mut self, proto_msgs: Vec) -> Result { - crate::time!("send_tx"); - let account_seq = self.account_sequence()?; - + fn send_tx_with_account_sequence( + &mut self, + proto_msgs: Vec, + account_seq: u64, + ) -> Result { debug!( "[{}] send_tx: sending {} messages using nonce {}", self.id(), @@ -373,13 +379,65 @@ impl CosmosSdkChain { let mut tx_bytes = Vec::new(); prost::Message::encode(&tx_raw, &mut tx_bytes).unwrap(); - let response = self.block_on(broadcast_tx_sync(self, tx_bytes))?; + self.block_on(broadcast_tx_sync(self, tx_bytes)) + } + + // Try to send_tx with retry on account sequence error. + // An account sequence error can occur if the cached account sequence of + // the relayer somehow get outdated, or when the relayer's wallet is used + // concurrently elsewhere. + // When there is an account sequence mismatch, we refetch the account sequence + // from the chain and retry sending the transaction again with the new sequence. + fn send_tx_with_account_sequence_retry( + &mut self, + proto_msgs: Vec, + retries: u32, + ) -> Result { + let account_sequence = self.account_sequence()?; + + let response = self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence)?; debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response); - self.incr_account_sequence()?; + match response.code { + Code::Ok => { + self.incr_account_sequence(); + Ok(response) + } + // Cosmos SDK defines an account sequence mismatch with the error code 32 + Code::Err(32) => { + if retries < MAX_ACCOUNT_SEQUENCE_RETRY { + warn!("send_tx failed with incorrect account sequence. retrying with account sequence refetched from the chain."); + + // Refetch account information from the chain. + // We do not wait between retries as waiting will + // only increase the chance of the account sequence getting + // out of synced again from other sources. + self.refresh_account()?; + + self.send_tx_with_account_sequence_retry(proto_msgs, retries + 1) + } else { + // If after the max retry we still get an account sequence mismatch error, + // we ignore the error and return the original response to downstream. + // We do not return an error here, because the current convention + // let the caller handle error responses separately. + + error!("failed to send_tx with incorrect account sequence. the relayer's wallet may be used elsewhere concurrently."); - Ok(response) + Ok(response) + } + } + Code::Err(_) => { + // The original code do not convert an error response into error result. + // We may want to refactor this in the future to simplify error handling. + Ok(response) + } + } + } + + fn send_tx(&mut self, proto_msgs: Vec) -> Result { + crate::time!("send_tx"); + self.send_tx_with_account_sequence_retry(proto_msgs, 0) } /// The maximum amount of gas the relayer is willing to pay for a transaction @@ -520,21 +578,27 @@ impl CosmosSdkChain { Ok((key, key_bytes)) } - fn account(&mut self) -> Result<&mut BaseAccount, Error> { + fn refresh_account(&mut self) -> Result<(), Error> { + let account = self.block_on(query_account(self, self.key()?.account))?; + debug!( + sequence = %account.sequence, + number = %account.account_number, + "[{}] send_tx: retrieved account", + self.id() + ); + + self.account = Some(account); + Ok(()) + } + + fn account(&mut self) -> Result<&BaseAccount, Error> { if self.account == None { - let account = self.block_on(query_account(self, self.key()?.account))?; - debug!( - sequence = %account.sequence, - number = %account.account_number, - "[{}] send_tx: retrieved account", - self.id() - ); - self.account = Some(account); + self.refresh_account()?; } Ok(self .account - .as_mut() + .as_ref() .expect("account was supposedly just cached")) } @@ -546,9 +610,13 @@ impl CosmosSdkChain { Ok(self.account()?.sequence) } - fn incr_account_sequence(&mut self) -> Result<(), Error> { - self.account()?.sequence += 1; - Ok(()) + fn incr_account_sequence(&mut self) { + match &mut self.account { + Some(account) => { + account.sequence += 1; + } + None => {} + } } fn signer(&self, sequence: u64) -> Result { @@ -629,8 +697,6 @@ impl CosmosSdkChain { &self, mut tx_sync_results: Vec, ) -> Result, Error> { - use crate::util::retry::{retry_with_index, RetryResult}; - let hashes = tx_sync_results .iter() .map(|res| res.response.hash.to_string()) From 4d6706b5428f07c5366361cb24c91025e0b3176f Mon Sep 17 00:00:00 2001 From: Soares Chen Date: Wed, 15 Sep 2021 13:58:31 +0200 Subject: [PATCH 02/15] Move MAX_ACCOUNT_SEQUENCE_RETRY to retry_strategy module --- relayer/src/chain/cosmos.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index d7a80fa495..da4c3c76c9 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -85,10 +85,6 @@ use super::ChainEndpoint; mod compatibility; -// Maximum number of retries for send_tx in the case of -// an account sequence mismatch. -const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; - /// Default gas limit when submitting a transaction. const DEFAULT_MAX_GAS: u64 = 300_000; @@ -103,6 +99,10 @@ mod retry_strategy { use crate::util::retry::Fixed; use std::time::Duration; + // Maximum number of retries for send_tx in the case of + // an account sequence mismatch. + pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; + pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator { let backoff_millis = 300; // The periodic backoff let count: usize = (max_total_wait.as_millis() / backoff_millis as u128) as usize; @@ -406,7 +406,7 @@ impl CosmosSdkChain { } // Cosmos SDK defines an account sequence mismatch with the error code 32 Code::Err(32) => { - if retries < MAX_ACCOUNT_SEQUENCE_RETRY { + if retries < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { warn!("send_tx failed with incorrect account sequence. retrying with account sequence refetched from the chain."); // Refetch account information from the chain. From dae1f9f4bb0395b493385e720912e7aaaa3ccd4b Mon Sep 17 00:00:00 2001 From: Soares Chen Date: Wed, 15 Sep 2021 21:14:51 +0200 Subject: [PATCH 03/15] Introduce INCORRECT_ACCOUNT_SEQUENCE_ERR constant for error code 32 --- relayer/src/chain/cosmos.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index da4c3c76c9..966396c715 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -95,6 +95,10 @@ const DEFAULT_GAS_PRICE_ADJUSTMENT: f64 = 0.1; /// fraction of the maximum block size defined in the Tendermint core consensus parameters. pub const GENESIS_MAX_BYTES_MAX_FRACTION: f64 = 0.9; +// The error "incorrect account sequence" is defined as the unique error code 32 in cosmos-sdk: +// https://github.com/cosmos/cosmos-sdk/blob/v0.44.0/types/errors/errors.go#L115-L117 +pub const INCORRECT_ACCOUNT_SEQUENCE_ERR: u32 = 32; + mod retry_strategy { use crate::util::retry::Fixed; use std::time::Duration; @@ -404,8 +408,7 @@ impl CosmosSdkChain { self.incr_account_sequence(); Ok(response) } - // Cosmos SDK defines an account sequence mismatch with the error code 32 - Code::Err(32) => { + Code::Err(code) if code == INCORRECT_ACCOUNT_SEQUENCE_ERR => { if retries < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { warn!("send_tx failed with incorrect account sequence. retrying with account sequence refetched from the chain."); From dcb80d58626fbb6db2e8322b162f5d048d2573e5 Mon Sep 17 00:00:00 2001 From: Soares Chen Date: Thu, 16 Sep 2021 15:33:10 +0200 Subject: [PATCH 04/15] Inline constant matching --- relayer/src/chain/cosmos.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 966396c715..ba0430685c 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -408,7 +408,7 @@ impl CosmosSdkChain { self.incr_account_sequence(); Ok(response) } - Code::Err(code) if code == INCORRECT_ACCOUNT_SEQUENCE_ERR => { + Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { if retries < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { warn!("send_tx failed with incorrect account sequence. retrying with account sequence refetched from the chain."); From 8a726b1e8864db7d20b0d75db6ab6823ae44de46 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Wed, 15 Dec 2021 16:49:48 +0200 Subject: [PATCH 05/15] Retry on simulation failure --- relayer/src/chain/cosmos.rs | 25 +++++++++++++++++++++++-- relayer/src/error.rs | 8 +++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index b1c1ed4f11..edbe424969 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -370,7 +370,7 @@ impl CosmosSdkChain { // Try to send_tx with retry on account sequence error. // An account sequence error can occur if the cached account sequence of - // the relayer somehow get outdated, or when the relayer's wallet is used + // the relayer somehow get outdated, or when the relayer wallet is used // concurrently elsewhere. // When there is an account sequence mismatch, we refetch the account sequence // from the chain and retry sending the transaction again with the new sequence. @@ -438,7 +438,7 @@ impl CosmosSdkChain { /// /// If the batch is split in two TX-es, the second one will fail the simulation in `deliverTx` check. /// In this case we use the `default_gas` param. - fn estimate_gas(&self, tx: Tx) -> Result { + fn estimate_gas(&mut self, tx: Tx) -> Result { let simulated_gas = self.send_tx_simulate(tx).map(|sr| sr.gas_info); match simulated_gas { @@ -475,6 +475,18 @@ impl CosmosSdkChain { Ok(self.default_gas()) } + Err(e) if can_retry_simulation(&e) => { + warn!( + "[{}] estimate_gas: failed to simulate tx, refreshing account and retrying: {}", + self.id(), + e.detail() + ); + // TODO: Figure our retrying abstraction here. + self.refresh_account()?; + + Ok(self.default_gas()) + } + Err(e) => { error!( "[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}", @@ -2473,6 +2485,15 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { } } +fn can_retry_simulation(e: &Error) -> bool { + use crate::error::ErrorDetail::*; + + match e.detail() { + GrpcStatus(detail) => detail.is_account_sequence_mismatch(), + _ => false, + } +} + struct PrettyFee<'a>(&'a Fee); impl fmt::Display for PrettyFee<'_> { diff --git a/relayer/src/error.rs b/relayer/src/error.rs index 2aa20c2c08..21ce2a36a5 100644 --- a/relayer/src/error.rs +++ b/relayer/src/error.rs @@ -534,7 +534,13 @@ impl GrpcStatusSubdetail { /// Check whether this gRPC error matches /// - status: InvalidArgument - /// - message: account sequence mismatch ... + /// - message: "account sequence mismatch, expected 166791, got 166793: incorrect account sequence: invalid request" + /// + /// # Note: + /// This predicate is tested and validated against errors + /// that appear at the `estimate_gas` step. The error + /// predicate to be used at the `broadcast_tx_sync` step + /// is different & relies on parsing the Response error code. pub fn is_account_sequence_mismatch(&self) -> bool { if self.status.code() != tonic::Code::InvalidArgument { return false; From 63390ec4977e47ba35c1202fa900f6c69394c333 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Wed, 15 Dec 2021 19:55:08 +0200 Subject: [PATCH 06/15] Refactored retry to catch both estimate and bcast errors --- relayer/src/chain/cosmos.rs | 115 ++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 50 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index edbe424969..ab5c38b1b9 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -325,6 +325,7 @@ impl CosmosSdkChain { signatures: vec![signed_doc], }; + // This may result in an account sequence mismatch error let estimated_gas = self.estimate_gas(simulate_tx)?; if estimated_gas > self.max_gas() { @@ -368,60 +369,86 @@ impl CosmosSdkChain { )) } - // Try to send_tx with retry on account sequence error. - // An account sequence error can occur if the cached account sequence of - // the relayer somehow get outdated, or when the relayer wallet is used - // concurrently elsewhere. - // When there is an account sequence mismatch, we refetch the account sequence - // from the chain and retry sending the transaction again with the new sequence. + /// Try to send_tx with retry on account sequence error. + /// An account sequence error can occur if the cached account sequence of + /// the relayer somehow get outdated, or when the relayer wallet is used + /// concurrently elsewhere. + /// When there is an account sequence mismatch, we refetch the account sequence + /// from the chain and retry sending the transaction again with the new sequence. + /// + /// Account sequence mismatch can occur at two separate steps: + /// 1. as Err variant, propagated from the `estimate_gas` step. + /// 2. as an Ok variant, with an Code::Err response, propagated from + /// the `broadcast_tx_sync` step. + /// We cover and treat both of these cases. fn send_tx_with_account_sequence_retry( &mut self, proto_msgs: Vec, - retries: u32, + retry_counter: u32, ) -> Result { - let account_sequence = self.account_sequence()?; - - let response = self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence)?; + // If this is a retry, then re-fetch the account s.n. + if retry_counter > 1 { + self.refresh_account()?; + } - debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response); + let account_sequence = self.account_sequence()?; - match response.code { - Code::Ok => { - self.incr_account_sequence(); - Ok(response) + match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) { + // Gas estimation failed with retry-able error. + Err(e) if can_retry_simulation(&e) => { + if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { + warn!("send_tx failed at estimate_gas step due to incorrect account sequence. retrying.."); + self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) + } else { + error!("send_tx failed due to account sequence errors. the relayer wallet may be used elsewhere concurrently."); + Err(e) + } } - Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { - if retries < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { - warn!("send_tx failed with incorrect account sequence. re-fetching account sequence from chain & retrying."); - // Refetch account information from the chain. - // We do not wait between retries as waiting will - // only increase the chance of the account sequence getting - // out of synced again from other sources. - self.refresh_account()?; - - self.send_tx_with_account_sequence_retry(proto_msgs, retries + 1) + + // Gas estimation succeeded. Broadcasting failed with retry-able error. + Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { + if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { + warn!("send_tx failed at broadcast step with incorrect account sequence. retrying.."); + self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) } else { // If after the max retry we still get an account sequence mismatch error, // we ignore the error and return the original response to downstream. // We do not return an error here, because the current convention // let the caller handle error responses separately. - error!("failed to send_tx due to account sequence errors. the relayer wallet may be used elsewhere concurrently."); - Ok(response) } } - Code::Err(code) => { - // Avoid increasing the account s.n. if CheckTx failed - // Log the error - error!( - "[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}", - self.id(), - response, - sdk_error_from_tx_sync_error_code(code) - ); - Ok(response) + + // Catch-all arm for the Ok variant. + // This is the case when gas estimation succeeded. + Ok(response) => { + // Complete success. + match response.code { + tendermint::abci::Code::Ok => { + debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response); + + self.incr_account_sequence(); + Ok(response) + } + // Gas estimation succeeded, but broadcasting failed with unrecoverable error. + tendermint::abci::Code::Err(code) => { + // Avoid increasing the account s.n. if CheckTx failed + // Log the error + error!( + "[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}", + self.id(), + response, + sdk_error_from_tx_sync_error_code(code) + ); + Ok(response) + } + } } + + // Catch-all case for the Err variant. + // Gas estimation failure or other unrecoverable error, propagate. + Err(e) => Err(e), } } @@ -475,25 +502,13 @@ impl CosmosSdkChain { Ok(self.default_gas()) } - Err(e) if can_retry_simulation(&e) => { - warn!( - "[{}] estimate_gas: failed to simulate tx, refreshing account and retrying: {}", - self.id(), - e.detail() - ); - // TODO: Figure our retrying abstraction here. - self.refresh_account()?; - - Ok(self.default_gas()) - } - Err(e) => { error!( "[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}", self.id(), e.detail() ); - + // Propagate the error, the retrying mechanism at caller may catch & retry. Err(e) } } From d44ff54c0d673fa43e170428dce918418c03bb55 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Wed, 15 Dec 2021 20:10:59 +0200 Subject: [PATCH 07/15] Fixed off-by-one error in re-fetching logic --- relayer/src/chain/cosmos.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index ab5c38b1b9..ab5b4ba0e4 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -387,7 +387,7 @@ impl CosmosSdkChain { retry_counter: u32, ) -> Result { // If this is a retry, then re-fetch the account s.n. - if retry_counter > 1 { + if retry_counter > 0 { self.refresh_account()?; } @@ -395,7 +395,7 @@ impl CosmosSdkChain { match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) { // Gas estimation failed with retry-able error. - Err(e) if can_retry_simulation(&e) => { + Err(e) if can_retry_tx_simulation(&e) => { if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { warn!("send_tx failed at estimate_gas step due to incorrect account sequence. retrying.."); self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) @@ -2500,7 +2500,7 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { } } -fn can_retry_simulation(e: &Error) -> bool { +fn can_retry_tx_simulation(e: &Error) -> bool { use crate::error::ErrorDetail::*; match e.detail() { From bd89f50a4e20af1dfa43bdbf938d054204460cec Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Wed, 15 Dec 2021 20:23:38 +0200 Subject: [PATCH 08/15] Basic backoff mechanism --- relayer/src/chain/cosmos.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index ab5b4ba0e4..f76466612d 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -127,6 +127,10 @@ mod retry_strategy { // an account sequence mismatch. pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; + // Backoff multiplier to apply while retrying in the case + // of account sequence mismatch. + pub const BACKOFF_MULTIPLIER_SEQUENCE_RETRY: u64 = 100; + pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator { let backoff_millis = 300; // The periodic backoff let count: usize = (max_total_wait.as_millis() / backoff_millis as u128) as usize; @@ -386,8 +390,11 @@ impl CosmosSdkChain { proto_msgs: Vec, retry_counter: u32, ) -> Result { - // If this is a retry, then re-fetch the account s.n. + // If this is a retry, then backoff & re-fetch the account s.n. if retry_counter > 0 { + let backoff = + (retry_counter as u64) * retry_strategy::BACKOFF_MULTIPLIER_SEQUENCE_RETRY; + thread::sleep(Duration::from_millis(backoff)); self.refresh_account()?; } @@ -504,7 +511,7 @@ impl CosmosSdkChain { Err(e) => { error!( - "[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}", + "[{}] estimate_gas: failed to simulate tx. propagating error to caller: {}", self.id(), e.detail() ); From 934a14486328faa4f0e93831cd7f84a372c32728 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Thu, 16 Dec 2021 09:55:00 +0200 Subject: [PATCH 09/15] More idiomatic account seq. incremental --- relayer/src/chain/cosmos.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index f76466612d..878dc58e62 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -718,11 +718,8 @@ impl CosmosSdkChain { } fn incr_account_sequence(&mut self) { - match &mut self.account { - Some(account) => { - account.sequence += 1; - } - None => {} + if let Some(account) = &mut self.account { + account.sequence += 1; } } From a7a1b54ff4cf3d8e50ce31ad85a4903878cd939f Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Thu, 16 Dec 2021 18:22:14 +0200 Subject: [PATCH 10/15] Adapted to catch acct.seq more generally --- relayer/src/chain/cosmos.rs | 2 +- relayer/src/error.rs | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 878dc58e62..8797387664 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -125,7 +125,7 @@ mod retry_strategy { // Maximum number of retries for send_tx in the case of // an account sequence mismatch. - pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; + pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 10; // Backoff multiplier to apply while retrying in the case // of account sequence mismatch. diff --git a/relayer/src/error.rs b/relayer/src/error.rs index 21ce2a36a5..857d994574 100644 --- a/relayer/src/error.rs +++ b/relayer/src/error.rs @@ -542,9 +542,11 @@ impl GrpcStatusSubdetail { /// predicate to be used at the `broadcast_tx_sync` step /// is different & relies on parsing the Response error code. pub fn is_account_sequence_mismatch(&self) -> bool { - if self.status.code() != tonic::Code::InvalidArgument { - return false; - } + // SDK 0.44 changed the status from `InvalidArgument` to `Unknown` + // Workaround since this is unstable: We'll match only on the status message. + // if self.status.code() != tonic::Code::InvalidArgument { + // return false; + // } self.status .message() From c4c59792959ad487e2a2793eab80064d8d8339a2 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Thu, 16 Dec 2021 18:28:21 +0200 Subject: [PATCH 11/15] retry counter logging --- relayer/src/chain/cosmos.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 8797387664..ea12ae6e7d 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -404,7 +404,8 @@ impl CosmosSdkChain { // Gas estimation failed with retry-able error. Err(e) if can_retry_tx_simulation(&e) => { if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { - warn!("send_tx failed at estimate_gas step due to incorrect account sequence. retrying.."); + warn!("send_tx failed at estimate_gas step due to incorrect account sequence. retrying ({}/{})", + retry_counter + 1, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) } else { error!("send_tx failed due to account sequence errors. the relayer wallet may be used elsewhere concurrently."); @@ -415,7 +416,8 @@ impl CosmosSdkChain { // Gas estimation succeeded. Broadcasting failed with retry-able error. Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { - warn!("send_tx failed at broadcast step with incorrect account sequence. retrying.."); + warn!("send_tx failed at broadcast step with incorrect account sequence. retrying ({}/{})", + retry_counter + 1, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) } else { // If after the max retry we still get an account sequence mismatch error, From 45a61dce9a257da3fb9a3fe9a3e4c4969bce4a37 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Thu, 16 Dec 2021 18:33:54 +0200 Subject: [PATCH 12/15] Bumped the multipler from 100 to 200 --- relayer/src/chain/cosmos.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index ea12ae6e7d..542d328006 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -129,7 +129,7 @@ mod retry_strategy { // Backoff multiplier to apply while retrying in the case // of account sequence mismatch. - pub const BACKOFF_MULTIPLIER_SEQUENCE_RETRY: u64 = 100; + pub const BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY: u64 = 200; pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator { let backoff_millis = 300; // The periodic backoff @@ -393,7 +393,7 @@ impl CosmosSdkChain { // If this is a retry, then backoff & re-fetch the account s.n. if retry_counter > 0 { let backoff = - (retry_counter as u64) * retry_strategy::BACKOFF_MULTIPLIER_SEQUENCE_RETRY; + (retry_counter as u64) * retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY; thread::sleep(Duration::from_millis(backoff)); self.refresh_account()?; } From 445d3c881cde330f2b5d456c2afc6cd3e2e8226b Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Fri, 17 Dec 2021 15:55:59 +0200 Subject: [PATCH 13/15] Non-retrying fix upon gas estimation failure --- relayer/src/chain/cosmos.rs | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 542d328006..8b083fce03 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -390,34 +390,33 @@ impl CosmosSdkChain { proto_msgs: Vec, retry_counter: u32, ) -> Result { - // If this is a retry, then backoff & re-fetch the account s.n. - if retry_counter > 0 { - let backoff = - (retry_counter as u64) * retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY; - thread::sleep(Duration::from_millis(backoff)); - self.refresh_account()?; - } - let account_sequence = self.account_sequence()?; match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) { - // Gas estimation failed with retry-able error. - Err(e) if can_retry_tx_simulation(&e) => { - if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { - warn!("send_tx failed at estimate_gas step due to incorrect account sequence. retrying ({}/{})", - retry_counter + 1, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); - self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) - } else { - error!("send_tx failed due to account sequence errors. the relayer wallet may be used elsewhere concurrently."); - Err(e) - } + // Gas estimation failed with acct. s.n. mismatch. + // This indicates that the full node did not yet push the previous tx out of its + // mempool. Possible explanations: fees too low, network congested, or full node + // congested. Whichever the case, it is more expedient in production to drop the tx + // and refresh the s.n., to allow proceeding to the other transactions. + Err(e) if mismatching_account_sequence_number(&e) => { + warn!("send_tx failed at estimate_gas step mismatching account sequence: dropping the tx & refreshing account sequence number"); + self.refresh_account()?; + Err(e) } - // Gas estimation succeeded. Broadcasting failed with retry-able error. + // Gas estimation succeeded. Broadcasting failed with a retry-able error. Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => { if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY { + let retry_counter = retry_counter + 1; warn!("send_tx failed at broadcast step with incorrect account sequence. retrying ({}/{})", - retry_counter + 1, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); + retry_counter, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY); + // Backoff & re-fetch the account s.n. + let backoff = (retry_counter as u64) + * retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY; + thread::sleep(Duration::from_millis(backoff)); + self.refresh_account()?; + + // Now trigger the retry. self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) } else { // If after the max retry we still get an account sequence mismatch error, @@ -2506,7 +2505,10 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { } } -fn can_retry_tx_simulation(e: &Error) -> bool { +/// Determine whether the given error yielded by `tx_simulate` +/// indicates that the current sequence number cached in Hermes +/// may be out-of-sync with the full node's version of the s.n. +fn mismatching_account_sequence_number(e: &Error) -> bool { use crate::error::ErrorDetail::*; match e.detail() { From 7dd5c53ce35d1d2ad92508e91d90a8e61ea76ceb Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Mon, 20 Dec 2021 17:05:47 +0200 Subject: [PATCH 14/15] Adjusted retry params; added ref. to #1153. --- relayer/src/chain/cosmos.rs | 27 +++++++++++++++++---------- relayer/src/error.rs | 4 ++-- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/relayer/src/chain/cosmos.rs b/relayer/src/chain/cosmos.rs index 8b083fce03..49e735f6e8 100644 --- a/relayer/src/chain/cosmos.rs +++ b/relayer/src/chain/cosmos.rs @@ -125,7 +125,7 @@ mod retry_strategy { // Maximum number of retries for send_tx in the case of // an account sequence mismatch. - pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 10; + pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5; // Backoff multiplier to apply while retrying in the case // of account sequence mismatch. @@ -373,18 +373,22 @@ impl CosmosSdkChain { )) } - /// Try to send_tx with retry on account sequence error. - /// An account sequence error can occur if the cached account sequence of - /// the relayer somehow get outdated, or when the relayer wallet is used - /// concurrently elsewhere. - /// When there is an account sequence mismatch, we refetch the account sequence - /// from the chain and retry sending the transaction again with the new sequence. + /// Try to `send_tx` with retry on account sequence error. + /// An account sequence error can occur if the account sequence that + /// the relayer caches becomes outdated. This may happen if the relayer + /// wallet is used concurrently elsewhere, or when tx fees are mis-configured + /// leading to transactions hanging in the mempool. /// - /// Account sequence mismatch can occur at two separate steps: + /// Account sequence mismatch error can occur at two separate steps: /// 1. as Err variant, propagated from the `estimate_gas` step. /// 2. as an Ok variant, with an Code::Err response, propagated from /// the `broadcast_tx_sync` step. - /// We cover and treat both of these cases. + /// + /// We treat both cases by re-fetching the account sequence number + /// from the full node. + /// Upon case #1, we do not retry submitting the same tx (retry happens + /// nonetheless at the worker `step` level). Upon case #2, we retry + /// submitting the same transaction. fn send_tx_with_account_sequence_retry( &mut self, proto_msgs: Vec, @@ -401,6 +405,9 @@ impl CosmosSdkChain { Err(e) if mismatching_account_sequence_number(&e) => { warn!("send_tx failed at estimate_gas step mismatching account sequence: dropping the tx & refreshing account sequence number"); self.refresh_account()?; + // Note: propagating error here can lead to bug: + // https://github.com/informalsystems/ibc-rs/issues/1153 + // But periodic packet clearing will catch any dropped packets. Err(e) } @@ -416,7 +423,7 @@ impl CosmosSdkChain { thread::sleep(Duration::from_millis(backoff)); self.refresh_account()?; - // Now trigger the retry. + // Now retry. self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1) } else { // If after the max retry we still get an account sequence mismatch error, diff --git a/relayer/src/error.rs b/relayer/src/error.rs index 857d994574..55dfadb6fc 100644 --- a/relayer/src/error.rs +++ b/relayer/src/error.rs @@ -542,8 +542,8 @@ impl GrpcStatusSubdetail { /// predicate to be used at the `broadcast_tx_sync` step /// is different & relies on parsing the Response error code. pub fn is_account_sequence_mismatch(&self) -> bool { - // SDK 0.44 changed the status from `InvalidArgument` to `Unknown` - // Workaround since this is unstable: We'll match only on the status message. + // The code changed in SDK 0.44 from `InvalidArgument` to `Unknown`. + // Workaround: Ignore code. We'll match only on the status message. // if self.status.code() != tonic::Code::InvalidArgument { // return false; // } From 7cc1db4fcbb2937d20c676bb023ae82e538a57b9 Mon Sep 17 00:00:00 2001 From: Adi Seredinschi Date: Tue, 21 Dec 2021 11:54:31 +0200 Subject: [PATCH 15/15] changelog --- .changelog/unreleased/bug-fixes/1264-recover-acct-seq.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/unreleased/bug-fixes/1264-recover-acct-seq.md diff --git a/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md b/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md new file mode 100644 index 0000000000..850ce5b86e --- /dev/null +++ b/.changelog/unreleased/bug-fixes/1264-recover-acct-seq.md @@ -0,0 +1,3 @@ +- Added a recovery mechanism to automatically retry or drop tx upon account + sequence mismatch errors ([#1264](https://github.com/informalsystems/ibc- + rs/issues/1264)) \ No newline at end of file