Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(en): Fix reorg detection in presence of tree data fetcher #2197

Merged
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Handle reorgs in data fetcher
slowli committed Jun 10, 2024
commit eaea477001b602f2a656b48f88927e7851013c7d
2 changes: 2 additions & 0 deletions core/node/node_sync/src/tree_data_fetcher/metrics.rs
Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@ pub(super) enum StepOutcomeLabel {
UpdatedBatch,
NoProgress,
RemoteHashMissing,
PossibleReorg,
TransientError,
}

@@ -91,6 +92,7 @@ impl TreeDataFetcherMetrics {
}
Ok(StepOutcome::NoProgress) => StepOutcomeLabel::NoProgress,
Ok(StepOutcome::RemoteHashMissing) => StepOutcomeLabel::RemoteHashMissing,
Ok(StepOutcome::PossibleReorg) => StepOutcomeLabel::PossibleReorg,
Err(err) if err.is_transient() => StepOutcomeLabel::TransientError,
Err(_) => return, // fatal error; the node will exit soon anyway
};
21 changes: 19 additions & 2 deletions core/node/node_sync/src/tree_data_fetcher/mod.rs
Original file line number Diff line number Diff line change
@@ -80,6 +80,7 @@ enum StepOutcome {
UpdatedBatch(L1BatchNumber),
NoProgress,
RemoteHashMissing,
PossibleReorg,
}

/// Component fetching tree data (i.e., state root hashes for L1 batches) from external sources, such as
@@ -222,17 +223,23 @@ impl TreeDataFetcher {
}
Err(MissingData::Batch) => {
let err = anyhow::anyhow!(
"L1 batch #{l1_batch_to_fetch} is sealed locally, but is not present on the main node, \
"L1 batch #{l1_batch_to_fetch} is sealed locally, but is not present externally, \
which is assumed to store batch info indefinitely"
);
return Err(err.into());
}
Err(MissingData::RootHash) => {
tracing::debug!(
"L1 batch #{l1_batch_to_fetch} does not have root hash computed on the main node"
"L1 batch #{l1_batch_to_fetch} does not have root hash computed externally"
);
return Ok(StepOutcome::RemoteHashMissing);
}
Err(MissingData::PossibleReorg) => {
tracing::debug!(
"L1 batch #{l1_batch_to_fetch} potentially diverges from the external source"
);
return Ok(StepOutcome::PossibleReorg);
}
};

let stage_latency = self.metrics.stage_latency[&ProcessingStage::Persistence].start();
@@ -289,6 +296,16 @@ impl TreeDataFetcher {
self.update_health(last_updated_l1_batch);
true
}
Ok(StepOutcome::PossibleReorg) => {
tracing::info!("Potential chain reorg detected by tree data fetcher; not updating tree data");
// Since we don't 100% trust the reorg logic in the tree data fetcher, we let it continue working,
// so that, if there's a false positive, the whole node isn't brought down.
let health = TreeDataFetcherHealth::Affected {
error: "Potential chain reorg".to_string(),
};
self.health_updater.update(health.into());
true
}
Err(err) if err.is_transient() => {
tracing::warn!(
"Transient error in tree data fetcher, will retry after a delay: {err:?}"
13 changes: 7 additions & 6 deletions core/node/node_sync/src/tree_data_fetcher/provider/mod.rs
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@ use zksync_web3_decl::{
namespaces::ZksNamespaceClient,
};

use super::{metrics::METRICS, TreeDataFetcherError, TreeDataFetcherResult};
use super::{metrics::METRICS, TreeDataFetcherResult};

#[cfg(test)]
mod tests;
@@ -25,6 +25,8 @@ pub(super) enum MissingData {
/// The provider lacks a root hash for a requested L1 batch; the batch itself is present on the provider.
#[error("no root hash for L1 batch")]
RootHash,
#[error("possible chain reorg detected")]
PossibleReorg,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EncodeLabelValue, EncodeLabelSet)]
@@ -83,12 +85,11 @@ impl TreeDataProvider for Box<DynClient<L2>> {
if remote_l2_block_hash != Some(last_l2_block.hash) {
let last_l2_block_number = last_l2_block.number;
let last_l2_block_hash = last_l2_block.hash;
// FIXME: what should be returned here?
let err = anyhow::anyhow!(
tracing::info!(
"Fetched hash of the last L2 block #{last_l2_block_number} in L1 batch #{number} ({remote_l2_block_hash:?}) \
does not match the local one ({last_l2_block_hash:?}); this can be caused by a chain reorg"
);
return Err(TreeDataFetcherError::Internal(err));
return Ok(Err(MissingData::PossibleReorg));
}

Ok(batch_details
@@ -312,9 +313,9 @@ impl TreeDataProvider for L1DataProvider {
_ => {
tracing::warn!(
"Non-unique `BlockCommit` event for L1 batch #{number} queried using {filter:?}, potentially as a result \
of a chain revert: {logs:?}"
of a chain reorg: {logs:?}"
);
Ok(Err(MissingData::RootHash))
Ok(Err(MissingData::PossibleReorg))
}
}
}