Skip to content

Commit

Permalink
Retry send_tx on account sequence mismatch (informalsystems#1349)
Browse files Browse the repository at this point in the history
* Retry send_tx on account sequence mismatch

* Move MAX_ACCOUNT_SEQUENCE_RETRY to retry_strategy module

* Introduce INCORRECT_ACCOUNT_SEQUENCE_ERR constant for error code 32

* Inline constant matching

* Retry on simulation failure

* Refactored retry to catch both estimate and bcast errors

* Fixed off-by-one error in re-fetching logic

* Basic backoff mechanism

* More idiomatic account seq. incremental

* Adapted to catch acct.seq more generally

* retry counter logging

* Bumped the multipler from 100 to 200

* Non-retrying fix upon gas estimation failure

* Adjusted retry params; added ref. to informalsystems#1153.

* changelog

Co-authored-by: Adi Seredinschi <[email protected]>
  • Loading branch information
soareschen and adizere authored Dec 21, 2021
1 parent e2c4892 commit 205489c
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 45 deletions.
3 changes: 3 additions & 0 deletions .changelog/unreleased/bug-fixes/1264-recover-acct-seq.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Added a recovery mechanism to automatically retry or drop tx upon account
sequence mismatch errors ([#1264](https://github.com/informalsystems/ibc-
rs/issues/1264))
192 changes: 151 additions & 41 deletions relayer/src/chain/cosmos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use bitcoin::hashes::hex::ToHex;
use itertools::Itertools;
use prost::Message;
use prost_types::Any;
use tendermint::abci::{Event, Path as TendermintABCIPath};
use tendermint::abci::{Code, Event, Path as TendermintABCIPath};
use tendermint::account::Id as AccountId;
use tendermint::block::Height;
use tendermint::consensus::Params;
Expand Down Expand Up @@ -85,6 +85,7 @@ use crate::keyring::{KeyEntry, KeyRing};
use crate::light_client::tendermint::LightClient as TmLightClient;
use crate::light_client::LightClient;
use crate::light_client::Verified;
use crate::util::retry::{retry_with_index, RetryResult};
use crate::{
chain::handle::requests::AppVersion, chain::QueryResponse, chain::StatusResponse,
event::monitor::TxMonitorCmd,
Expand Down Expand Up @@ -112,10 +113,22 @@ const DEFAULT_FEE_GRANTER: &str = "";
/// fraction of the maximum block size defined in the Tendermint core consensus parameters.
pub const GENESIS_MAX_BYTES_MAX_FRACTION: f64 = 0.9;

// The error "incorrect account sequence" is defined as the unique error code 32 in cosmos-sdk:
// https://github.com/cosmos/cosmos-sdk/blob/v0.44.0/types/errors/errors.go#L115-L117
pub const INCORRECT_ACCOUNT_SEQUENCE_ERR: u32 = 32;

mod retry_strategy {
use crate::util::retry::Fixed;
use core::time::Duration;

// Maximum number of retries for send_tx in the case of
// an account sequence mismatch.
pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5;

// Backoff multiplier to apply while retrying in the case
// of account sequence mismatch.
pub const BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY: u64 = 200;

pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator<Item = Duration> {
let backoff_millis = 300; // The periodic backoff
let count: usize = (max_total_wait.as_millis() / backoff_millis as u128) as usize;
Expand Down Expand Up @@ -282,11 +295,11 @@ impl CosmosSdkChain {
self.rt.block_on(f)
}

fn send_tx(&mut self, proto_msgs: Vec<Any>) -> Result<Response, Error> {
crate::time!("send_tx");

let account_seq = self.account_sequence()?;

fn send_tx_with_account_sequence(
&mut self,
proto_msgs: Vec<Any>,
account_seq: u64,
) -> Result<Response, Error> {
debug!(
"[{}] send_tx: sending {} messages using account sequence {}",
self.id(),
Expand Down Expand Up @@ -314,6 +327,7 @@ impl CosmosSdkChain {
signatures: vec![signed_doc],
};

// This may result in an account sequence mismatch error
let estimated_gas = self.estimate_gas(simulate_tx)?;

if estimated_gas > self.max_gas() {
Expand Down Expand Up @@ -350,31 +364,110 @@ impl CosmosSdkChain {
prost::Message::encode(&tx_raw, &mut tx_bytes)
.map_err(|e| Error::protobuf_encode(String::from("Transaction"), e))?;

let response = self.block_on(broadcast_tx_sync(
self.block_on(broadcast_tx_sync(
self.rpc_client(),
&self.config.rpc_addr,
tx_bytes,
))?;
))
}

match response.code {
tendermint::abci::Code::Ok => {
// A success means the account s.n. was increased
self.incr_account_sequence()?;
debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response);
/// Try to `send_tx` with retry on account sequence error.
/// An account sequence error can occur if the account sequence that
/// the relayer caches becomes outdated. This may happen if the relayer
/// wallet is used concurrently elsewhere, or when tx fees are mis-configured
/// leading to transactions hanging in the mempool.
///
/// Account sequence mismatch error can occur at two separate steps:
/// 1. as Err variant, propagated from the `estimate_gas` step.
/// 2. as an Ok variant, with an Code::Err response, propagated from
/// the `broadcast_tx_sync` step.
///
/// We treat both cases by re-fetching the account sequence number
/// from the full node.
/// Upon case #1, we do not retry submitting the same tx (retry happens
/// nonetheless at the worker `step` level). Upon case #2, we retry
/// submitting the same transaction.
fn send_tx_with_account_sequence_retry(
&mut self,
proto_msgs: Vec<Any>,
retry_counter: u32,
) -> Result<Response, Error> {
let account_sequence = self.account_sequence()?;

match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) {
// Gas estimation failed with acct. s.n. mismatch.
// This indicates that the full node did not yet push the previous tx out of its
// mempool. Possible explanations: fees too low, network congested, or full node
// congested. Whichever the case, it is more expedient in production to drop the tx
// and refresh the s.n., to allow proceeding to the other transactions.
Err(e) if mismatching_account_sequence_number(&e) => {
warn!("send_tx failed at estimate_gas step mismatching account sequence: dropping the tx & refreshing account sequence number");
self.refresh_account()?;
// Note: propagating error here can lead to bug:
// https://github.com/informalsystems/ibc-rs/issues/1153
// But periodic packet clearing will catch any dropped packets.
Err(e)
}
tendermint::abci::Code::Err(code) => {
// Avoid increasing the account s.n. if CheckTx failed
// Log the error
error!(
"[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}",
self.id(),
response,
sdk_error_from_tx_sync_error_code(code)
);

// Gas estimation succeeded. Broadcasting failed with a retry-able error.
Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => {
if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY {
let retry_counter = retry_counter + 1;
warn!("send_tx failed at broadcast step with incorrect account sequence. retrying ({}/{})",
retry_counter, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY);
// Backoff & re-fetch the account s.n.
let backoff = (retry_counter as u64)
* retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY;
thread::sleep(Duration::from_millis(backoff));
self.refresh_account()?;

// Now retry.
self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1)
} else {
// If after the max retry we still get an account sequence mismatch error,
// we ignore the error and return the original response to downstream.
// We do not return an error here, because the current convention
// let the caller handle error responses separately.
error!("failed to send_tx due to account sequence errors. the relayer wallet may be used elsewhere concurrently.");
Ok(response)
}
}

// Catch-all arm for the Ok variant.
// This is the case when gas estimation succeeded.
Ok(response) => {
// Complete success.
match response.code {
tendermint::abci::Code::Ok => {
debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response);

self.incr_account_sequence();
Ok(response)
}
// Gas estimation succeeded, but broadcasting failed with unrecoverable error.
tendermint::abci::Code::Err(code) => {
// Avoid increasing the account s.n. if CheckTx failed
// Log the error
error!(
"[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}",
self.id(),
response,
sdk_error_from_tx_sync_error_code(code)
);
Ok(response)
}
}
}

// Catch-all case for the Err variant.
// Gas estimation failure or other unrecoverable error, propagate.
Err(e) => Err(e),
}
}

Ok(response)
fn send_tx(&mut self, proto_msgs: Vec<Any>) -> Result<Response, Error> {
crate::time!("send_tx");
self.send_tx_with_account_sequence_retry(proto_msgs, 0)
}

/// Try to simulate the given tx in order to estimate how much gas will be needed to submit it.
Expand All @@ -385,7 +478,7 @@ impl CosmosSdkChain {
///
/// If the batch is split in two TX-es, the second one will fail the simulation in `deliverTx` check.
/// In this case we use the `default_gas` param.
fn estimate_gas(&self, tx: Tx) -> Result<u64, Error> {
fn estimate_gas(&mut self, tx: Tx) -> Result<u64, Error> {
let simulated_gas = self.send_tx_simulate(tx).map(|sr| sr.gas_info);

match simulated_gas {
Expand Down Expand Up @@ -424,11 +517,11 @@ impl CosmosSdkChain {

Err(e) => {
error!(
"[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}",
"[{}] estimate_gas: failed to simulate tx. propagating error to caller: {}",
self.id(),
e.detail()
);

// Propagate the error, the retrying mechanism at caller may catch & retry.
Err(e)
}
}
Expand Down Expand Up @@ -598,21 +691,27 @@ impl CosmosSdkChain {
Ok((key, key_bytes))
}

fn account(&mut self) -> Result<&mut BaseAccount, Error> {
fn refresh_account(&mut self) -> Result<(), Error> {
let account = self.block_on(query_account(self, self.key()?.account))?;
debug!(
sequence = %account.sequence,
number = %account.account_number,
"[{}] send_tx: retrieved account",
self.id()
);

self.account = Some(account);
Ok(())
}

fn account(&mut self) -> Result<&BaseAccount, Error> {
if self.account == None {
let account = self.block_on(query_account(self, self.key()?.account))?;
debug!(
sequence = %account.sequence,
number = %account.account_number,
"[{}] send_tx: retrieved account",
self.id()
);
self.account = Some(account);
self.refresh_account()?;
}

Ok(self
.account
.as_mut()
.as_ref()
.expect("account was supposedly just cached"))
}

Expand All @@ -624,9 +723,10 @@ impl CosmosSdkChain {
Ok(self.account()?.sequence)
}

fn incr_account_sequence(&mut self) -> Result<(), Error> {
self.account()?.sequence += 1;
Ok(())
fn incr_account_sequence(&mut self) {
if let Some(account) = &mut self.account {
account.sequence += 1;
}
}

fn signer(&self, sequence: u64) -> Result<SignerInfo, Error> {
Expand Down Expand Up @@ -710,8 +810,6 @@ impl CosmosSdkChain {
&self,
mut tx_sync_results: Vec<TxSyncResult>,
) -> Result<Vec<TxSyncResult>, Error> {
use crate::util::retry::{retry_with_index, RetryResult};

let hashes = tx_sync_results
.iter()
.map(|res| res.response.hash.to_string())
Expand Down Expand Up @@ -2412,6 +2510,18 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool {
}
}

/// Determine whether the given error yielded by `tx_simulate`
/// indicates that the current sequence number cached in Hermes
/// may be out-of-sync with the full node's version of the s.n.
fn mismatching_account_sequence_number(e: &Error) -> bool {
use crate::error::ErrorDetail::*;

match e.detail() {
GrpcStatus(detail) => detail.is_account_sequence_mismatch(),
_ => false,
}
}

struct PrettyFee<'a>(&'a Fee);

impl fmt::Display for PrettyFee<'_> {
Expand Down
16 changes: 12 additions & 4 deletions relayer/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -534,11 +534,19 @@ impl GrpcStatusSubdetail {

/// Check whether this gRPC error matches
/// - status: InvalidArgument
/// - message: account sequence mismatch ...
/// - message: "account sequence mismatch, expected 166791, got 166793: incorrect account sequence: invalid request"
///
/// # Note:
/// This predicate is tested and validated against errors
/// that appear at the `estimate_gas` step. The error
/// predicate to be used at the `broadcast_tx_sync` step
/// is different & relies on parsing the Response error code.
pub fn is_account_sequence_mismatch(&self) -> bool {
if self.status.code() != tonic::Code::InvalidArgument {
return false;
}
// The code changed in SDK 0.44 from `InvalidArgument` to `Unknown`.
// Workaround: Ignore code. We'll match only on the status message.
// if self.status.code() != tonic::Code::InvalidArgument {
// return false;
// }

self.status
.message()
Expand Down

0 comments on commit 205489c

Please sign in to comment.