From 090bea90181f68a7976f4480560868fa56678e03 Mon Sep 17 00:00:00 2001 From: EmilLuta Date: Sun, 4 Aug 2024 20:01:21 +0200 Subject: [PATCH 1/2] fix(core): Handle retriable GCS response errors Response errors that GCS considers retriable are treated as fatal in our codebase. This causes the code to crash when it could safely retry and continue its execution. This PR makes our code treat GCS response errors that GCS classifies as retriable as retriable. In addition, `is_transient` has been renamed to `is_retriable` to better reflect its usage. The change will reduce the number of job crashes and peak execution times seen across prover subsystems (especially in BWGs). --- .../src/object_store/client.rs | 4 +-- core/lib/object_store/src/file.rs | 2 +- core/lib/object_store/src/gcs.rs | 25 ++++++++++--------- core/lib/object_store/src/raw.rs | 20 +++++++-------- core/lib/object_store/src/retries.rs | 4 +-- core/lib/snapshots_applier/src/lib.rs | 2 +- core/lib/snapshots_applier/src/tests/mod.rs | 4 +-- core/node/block_reverter/src/tests.rs | 2 +- 8 files changed, 32 insertions(+), 31 deletions(-) diff --git a/core/lib/default_da_clients/src/object_store/client.rs b/core/lib/default_da_clients/src/object_store/client.rs index fc17a842a099..6e16d3a4c40a 100644 --- a/core/lib/default_da_clients/src/object_store/client.rs +++ b/core/lib/default_da_clients/src/object_store/client.rs @@ -40,7 +40,7 @@ impl DataAvailabilityClient for ObjectStoreDAClient { .await { return Err(DAError { - is_transient: err.is_transient(), + is_transient: err.is_retriable(), error: anyhow::Error::from(err), }); } @@ -66,7 +66,7 @@ impl DataAvailabilityClient for ObjectStoreDAClient { } return Err(DAError { - is_transient: err.is_transient(), + is_transient: err.is_retriable(), error: anyhow::Error::from(err), }); } diff --git a/core/lib/object_store/src/file.rs b/core/lib/object_store/src/file.rs index 94689c78028f..decba534d23e 100644 --- a/core/lib/object_store/src/file.rs +++ b/core/lib/object_store/src/file.rs @@ -10,7 +10,7 @@ impl From<io::Error> for ObjectStoreError { match err.kind() { io::ErrorKind::NotFound => ObjectStoreError::KeyNotFound(err.into()), kind => ObjectStoreError::Other { - is_transient: matches!(kind, io::ErrorKind::Interrupted | io::ErrorKind::TimedOut), + is_retriable: matches!(kind, io::ErrorKind::Interrupted | io::ErrorKind::TimedOut), source: err.into(), }, } diff --git a/core/lib/object_store/src/gcs.rs b/core/lib/object_store/src/gcs.rs index 451866236243..cad37715edb6 100644 --- a/core/lib/object_store/src/gcs.rs +++ b/core/lib/object_store/src/gcs.rs @@ -85,22 +85,22 @@ impl GoogleCloudStore { impl From<AuthError> for ObjectStoreError { fn from(err: AuthError) -> Self { - let is_transient = matches!( + let is_retriable = matches!( &err, - AuthError::HttpError(err) if is_transient_http_error(err) + AuthError::HttpError(err) if is_retriable_http_error(err) ); Self::Initialization { source: err.into(), - is_transient, + is_retriable, } } } -fn is_transient_http_error(err: &reqwest::Error) -> bool { +fn is_retriable_http_error(err: &reqwest::Error) -> bool { err.is_timeout() || err.is_connect() // Not all request errors are logically transient, but a significant part of them are (e.g., - // `hyper` protocol-level errors), and it's safer to consider an error transient. + // `hyper` protocol-level errors), and it's safer to consider an error retriable. 
|| err.is_request() || has_transient_io_source(err) || err.status() == Some(StatusCode::BAD_GATEWAY) @@ -117,8 +117,8 @@ fn get_source<'a, T: StdError + 'static>(mut err: &'a (dyn StdError + 'static)) } fn has_transient_io_source(err: &(dyn StdError + 'static)) -> bool { - // We treat any I/O errors as transient. This isn't always true, but frequently occurring I/O errors - // (e.g., "connection reset by peer") *are* transient, and treating an error as transient is a safer option, + // We treat any I/O errors as retriable. This isn't always true, but frequently occurring I/O errors + // (e.g., "connection reset by peer") *are* transient, and treating an error as retriable is a safer option, // even if it can lead to unnecessary retries. get_source::<io::Error>(err).is_some() } @@ -136,19 +136,20 @@ impl From<HttpError> for ObjectStoreError { if is_not_found { ObjectStoreError::KeyNotFound(err.into()) } else { - let is_transient = match &err { - HttpError::HttpClient(err) => is_transient_http_error(err), + let is_retriable = match &err { + HttpError::HttpClient(err) => is_retriable_http_error(err), HttpError::TokenSource(err) => { - // Token sources are mostly based on the `reqwest` HTTP client, so transient error detection + // Token sources are mostly based on the `reqwest` HTTP client, so retriable error detection // can reuse the same logic. let err = err.as_ref(); has_transient_io_source(err) - || get_source::<reqwest::Error>(err).is_some_and(is_transient_http_error) + || get_source::<reqwest::Error>(err).is_some_and(is_retriable_http_error) } + HttpError::Response(err) => err.is_retriable(), _ => false, }; ObjectStoreError::Other { - is_transient, + is_retriable, source: err.into(), } } diff --git a/core/lib/object_store/src/raw.rs b/core/lib/object_store/src/raw.rs index da1cd99728d9..3c5a89f160a5 100644 --- a/core/lib/object_store/src/raw.rs +++ b/core/lib/object_store/src/raw.rs @@ -59,7 +59,7 @@ pub enum ObjectStoreError { /// Object store initialization failed. Initialization { source: BoxedError, - is_transient: bool, + is_retriable: bool, }, /// An object with the specified key is not found. KeyNotFound(BoxedError), @@ -68,16 +68,16 @@ /// Other error has occurred when accessing the store (e.g., a network error). Other { source: BoxedError, - is_transient: bool, + is_retriable: bool, }, } impl ObjectStoreError { - /// Gives a best-effort estimate whether this error is transient. - pub fn is_transient(&self) -> bool { + /// Gives a best-effort estimate whether this error is retriable. + pub fn is_retriable(&self) -> bool { match self { - Self::Initialization { is_transient, .. } | Self::Other { is_transient, .. } => { - *is_transient + Self::Initialization { is_retriable, .. } | Self::Other { is_retriable, .. 
} => { + *is_retriable } Self::KeyNotFound(_) | Self::Serialization(_) => false, } @@ -89,9 +89,9 @@ impl fmt::Display for ObjectStoreError { match self { Self::Initialization { source, - is_transient, + is_retriable, } => { - let kind = if *is_transient { "transient" } else { "fatal" }; + let kind = if *is_retriable { "retriable" } else { "fatal" }; write!( formatter, "{kind} error initializing object store: {source}" @@ -101,9 +101,9 @@ impl fmt::Display for ObjectStoreError { Self::Serialization(err) => write!(formatter, "serialization error: {err}"), Self::Other { source, - is_transient, + is_retriable, } => { - let kind = if *is_transient { "transient" } else { "fatal" }; + let kind = if *is_retriable { "retriable" } else { "fatal" }; write!(formatter, "{kind} error accessing object store: {source}") } } diff --git a/core/lib/object_store/src/retries.rs b/core/lib/object_store/src/retries.rs index a1932d799f8e..a0b9e3fb1bf8 100644 --- a/core/lib/object_store/src/retries.rs +++ b/core/lib/object_store/src/retries.rs @@ -39,7 +39,7 @@ impl Request<'_> { let result = loop { match f().await { Ok(result) => break Ok(result), - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { if retries > max_retries { tracing::warn!(%err, "Exhausted {max_retries} retries performing request; returning last error"); break Err(err); @@ -142,7 +142,7 @@ mod test { fn transient_error() -> ObjectStoreError { ObjectStoreError::Other { - is_transient: true, + is_retriable: true, source: "oops".into(), } } diff --git a/core/lib/snapshots_applier/src/lib.rs b/core/lib/snapshots_applier/src/lib.rs index d2231f730b17..12e0e0a6e3c0 100644 --- a/core/lib/snapshots_applier/src/lib.rs +++ b/core/lib/snapshots_applier/src/lib.rs @@ -82,7 +82,7 @@ enum SnapshotsApplierError { impl SnapshotsApplierError { fn object_store(err: ObjectStoreError, context: String) -> Self { - if err.is_transient() { + if err.is_retriable() { Self::Retryable(anyhow::Error::from(err).context(context)) } else { Self::Fatal(anyhow::Error::from(err).context(context)) diff --git a/core/lib/snapshots_applier/src/tests/mod.rs b/core/lib/snapshots_applier/src/tests/mod.rs index 379808b365ca..169b99ec505b 100644 --- a/core/lib/snapshots_applier/src/tests/mod.rs +++ b/core/lib/snapshots_applier/src/tests/mod.rs @@ -61,7 +61,7 @@ async fn snapshots_creator_can_successfully_recover_db( Ok(()) // "recover" after 3 retries } else { Err(ObjectStoreError::Other { - is_transient: true, + is_retriable: true, source: "transient error".into(), }) } @@ -550,7 +550,7 @@ async fn applier_returns_error_after_too_many_object_store_retries() { let (object_store, client) = prepare_clients(&expected_status, &storage_logs).await; let object_store = ObjectStoreWithErrors::new(object_store, |_| { Err(ObjectStoreError::Other { - is_transient: true, + is_retriable: true, source: "service not available".into(), }) }); diff --git a/core/node/block_reverter/src/tests.rs b/core/node/block_reverter/src/tests.rs index 161ac3ed00c4..e23c5e6f7ccf 100644 --- a/core/node/block_reverter/src/tests.rs +++ b/core/node/block_reverter/src/tests.rs @@ -382,7 +382,7 @@ impl ObjectStore for ErroneousStore { .unwrap() .remove(&(bucket, key.to_owned())); Err(ObjectStoreError::Other { - is_transient: false, + is_retriable: false, source: "fatal error".into(), }) } From a16b732c29f5c8f7981aeb67944a4d35de47d9ba Mon Sep 17 00:00:00 2001 From: EmilLuta Date: Wed, 7 Aug 2024 08:24:26 +0200 Subject: [PATCH 2/2] Refactor `is_transient` to `is_retriable` for consistency --- 
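Note on the pattern this rename standardizes (illustrative, not part of the diff): every call site gates its retry loop on a single `is_retriable()` predicate and backs off exponentially, as `retries.rs` and `da_dispatcher` do. A minimal self-contained sketch under assumed names — `StoreError`, `with_retries`, and the `tokio` runtime are stand-ins here, not the exact crate API:

use std::time::Duration;

#[derive(Debug)]
enum StoreError {
    Retriable(String), // e.g., timeouts, connection resets, HTTP 502
    Fatal(String),     // e.g., missing key, (de)serialization failure
}

impl StoreError {
    fn is_retriable(&self) -> bool {
        matches!(self, Self::Retriable(_))
    }
}

async fn with_retries<T, F, Fut>(max_retries: usize, mut f: F) -> Result<T, StoreError>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, StoreError>>,
{
    let mut backoff = Duration::from_millis(100);
    let mut retries = 0;
    loop {
        match f().await {
            Ok(value) => return Ok(value),
            // Only errors flagged as retriable are retried; fatal ones surface immediately.
            Err(err) if err.is_retriable() && retries < max_retries => {
                retries += 1;
                tokio::time::sleep(backoff).await;
                backoff *= 2; // exponential back-off, mirroring `retries.rs`
            }
            Err(err) => return Err(err),
        }
    }
}

Misclassifying a retriable error as fatal crashes the job (the bug fixed by PATCH 1/2), while misclassifying a fatal error as retriable merely wastes a few attempts — which is why the classification helpers err on the side of retrying.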
core/bin/zksync_tee_prover/src/error.rs | 12 ++++++------ core/bin/zksync_tee_prover/src/tee_prover.rs | 6 +++--- core/lib/da_client/src/types.rs | 10 +++++----- .../default_da_clients/src/object_store/client.rs | 6 +++--- core/lib/object_store/src/retries.rs | 6 +++--- core/lib/snapshots_applier/src/lib.rs | 4 ++-- core/lib/web3_decl/src/error.rs | 4 ++-- .../node/commitment_generator/src/validation_task.rs | 2 +- core/node/consensus/src/en.rs | 2 +- core/node/consensus/src/testonly.rs | 2 +- core/node/consistency_checker/src/lib.rs | 8 ++++---- core/node/da_dispatcher/src/da_dispatcher.rs | 2 +- core/node/eth_sender/src/error.rs | 4 ++-- core/node/eth_sender/src/eth_tx_manager.rs | 6 +++--- core/node/metadata_calculator/src/api_server/mod.rs | 4 ++-- core/node/node_sync/src/tree_data_fetcher/metrics.rs | 2 +- core/node/node_sync/src/tree_data_fetcher/mod.rs | 8 ++++---- .../node_sync/src/tree_data_fetcher/provider/mod.rs | 2 +- core/node/node_sync/src/validate_chain_ids_task.rs | 6 +++--- core/node/reorg_detector/src/lib.rs | 12 ++++++------ core/node/reorg_detector/src/tests.rs | 2 +- 21 files changed, 55 insertions(+), 55 deletions(-) diff --git a/core/bin/zksync_tee_prover/src/error.rs b/core/bin/zksync_tee_prover/src/error.rs index bd60a772948a..602ac813ae99 100644 --- a/core/bin/zksync_tee_prover/src/error.rs +++ b/core/bin/zksync_tee_prover/src/error.rs @@ -11,19 +11,19 @@ pub(crate) enum TeeProverError { } impl TeeProverError { - pub fn is_transient(&self) -> bool { + pub fn is_retriable(&self) -> bool { match self { - Self::Request(err) => is_transient_http_error(err), + Self::Request(err) => is_retriable_http_error(err), _ => false, } } } -fn is_transient_http_error(err: &reqwest::Error) -> bool { +fn is_retriable_http_error(err: &reqwest::Error) -> bool { err.is_timeout() || err.is_connect() // Not all request errors are logically transient, but a significant part of them are (e.g., - // `hyper` protocol-level errors), and it's safer to consider an error transient. + // `hyper` protocol-level errors), and it's safer to consider an error retriable. || err.is_request() || has_transient_io_source(err) || err.status() == Some(StatusCode::BAD_GATEWAY) } @@ -31,8 +31,8 @@ fn has_transient_io_source(err: &(dyn StdError + 'static)) -> bool { - // We treat any I/O errors as transient. This isn't always true, but frequently occurring I/O errors - // (e.g., "connection reset by peer") *are* transient, and treating an error as transient is a safer option, + // We treat any I/O errors as retriable. This isn't always true, but frequently occurring I/O errors + // (e.g., "connection reset by peer") *are* transient, and treating an error as retriable is a safer option, // even if it can lead to unnecessary retries. get_source::<io::Error>(err).is_some() } diff --git a/core/bin/zksync_tee_prover/src/tee_prover.rs b/core/bin/zksync_tee_prover/src/tee_prover.rs index b14d07b72db6..bcd1e4a1b6b4 100644 --- a/core/bin/zksync_tee_prover/src/tee_prover.rs +++ b/core/bin/zksync_tee_prover/src/tee_prover.rs @@ -137,10 +137,10 @@ impl TeeProver { /// TEE prover configuration options. #[derive(Debug, Clone)] pub struct TeeProverConfig { - /// Number of retries for transient errors before giving up on recovery (i.e., returning an error + /// Number of retries for retriable errors before giving up on recovery (i.e., returning an error /// from [`Self::run()`]). pub max_retries: usize, - /// Initial back-off interval when retrying recovery on a transient error. 
Each subsequent retry interval + /// Initial back-off interval when retrying recovery on a retriable error. Each subsequent retry interval /// will be multiplied by [`Self.retry_backoff_multiplier`]. pub initial_retry_backoff: Duration, pub retry_backoff_multiplier: f32, @@ -198,7 +198,7 @@ impl Task for TeeProver { } Err(err) => { METRICS.network_errors_counter.inc_by(1); - if !err.is_transient() || retries > self.config.max_retries { + if !err.is_retriable() || retries > self.config.max_retries { return Err(err.into()); } retries += 1; diff --git a/core/lib/da_client/src/types.rs b/core/lib/da_client/src/types.rs index e339111bb51a..2b15cbe905ed 100644 --- a/core/lib/da_client/src/types.rs +++ b/core/lib/da_client/src/types.rs @@ -6,19 +6,19 @@ use serde::Serialize; #[derive(Debug)] pub struct DAError { pub error: anyhow::Error, - pub is_transient: bool, + pub is_retriable: bool, } impl DAError { - pub fn is_transient(&self) -> bool { - self.is_transient + pub fn is_retriable(&self) -> bool { + self.is_retriable } } impl Display for DAError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let kind = if self.is_transient { - "transient" + let kind = if self.is_retriable { + "retriable" } else { "fatal" }; diff --git a/core/lib/default_da_clients/src/object_store/client.rs b/core/lib/default_da_clients/src/object_store/client.rs index 6e16d3a4c40a..f05029a8eb1c 100644 --- a/core/lib/default_da_clients/src/object_store/client.rs +++ b/core/lib/default_da_clients/src/object_store/client.rs @@ -40,7 +40,7 @@ impl DataAvailabilityClient for ObjectStoreDAClient { .await { return Err(DAError { - is_transient: err.is_retriable(), + is_retriable: err.is_retriable(), error: anyhow::Error::from(err), }); } @@ -53,7 +53,7 @@ impl DataAvailabilityClient for ObjectStoreDAClient { async fn get_inclusion_data(&self, key: &str) -> Result, DAError> { let key_u32 = key.parse::<u32>().map_err(|err| DAError { error: anyhow::Error::from(err).context(format!("Failed to parse blob key: {}", key)), - is_transient: false, + is_retriable: false, })?; if let Err(err) = self @@ -66,7 +66,7 @@ impl DataAvailabilityClient for ObjectStoreDAClient { } return Err(DAError { - is_transient: err.is_retriable(), + is_retriable: err.is_retriable(), error: anyhow::Error::from(err), }); } diff --git a/core/lib/object_store/src/retries.rs b/core/lib/object_store/src/retries.rs index a0b9e3fb1bf8..2cccbb17c2bb 100644 --- a/core/lib/object_store/src/retries.rs +++ b/core/lib/object_store/src/retries.rs @@ -140,7 +140,7 @@ mod test { use super::*; - fn transient_error() -> ObjectStoreError { + fn retriable_error() -> ObjectStoreError { ObjectStoreError::Other { is_retriable: true, source: "oops".into(), } } @@ -159,7 +159,7 @@ #[tokio::test] async fn test_retry_failure_exhausted() { let err = Request::New - .retry(&"store", 2, || async { Err::<(), _>(transient_error()) }) + .retry(&"store", 2, || async { Err::<(), _>(retriable_error()) }) .await .unwrap_err(); assert_matches!(err, ObjectStoreError::Other { .. }); @@ -173,7 +173,7 @@ if retries + 1 == n { Ok(42) } else { - Err(transient_error()) + Err(retriable_error()) } }) .await diff --git a/core/lib/snapshots_applier/src/lib.rs b/core/lib/snapshots_applier/src/lib.rs index 12e0e0a6e3c0..b4d24a0b1851 100644 --- a/core/lib/snapshots_applier/src/lib.rs +++ b/core/lib/snapshots_applier/src/lib.rs @@ -209,10 +209,10 @@ pub enum RecoveryCompletionStatus { /// Snapshot applier configuration options. 
#[derive(Debug, Clone)] pub struct SnapshotsApplierConfig { - /// Number of retries for transient errors before giving up on recovery (i.e., returning an error + /// Number of retries for retriable errors before giving up on recovery (i.e., returning an error /// from [`Self::run()`]). pub retry_count: usize, - /// Initial back-off interval when retrying recovery on a transient error. Each subsequent retry interval + /// Initial back-off interval when retrying recovery on a retriable error. Each subsequent retry interval /// will be multiplied by [`Self.retry_backoff_multiplier`]. pub initial_retry_backoff: Duration, pub retry_backoff_multiplier: f32, diff --git a/core/lib/web3_decl/src/error.rs b/core/lib/web3_decl/src/error.rs index 1ea737a947f5..f42fe8de59d5 100644 --- a/core/lib/web3_decl/src/error.rs +++ b/core/lib/web3_decl/src/error.rs @@ -85,8 +85,8 @@ impl EnrichedClientError { self } - /// Whether the error should be considered transient. - pub fn is_transient(&self) -> bool { + /// Whether the error should be considered retriable. + pub fn is_retriable(&self) -> bool { match self.as_ref() { ClientError::Transport(_) | ClientError::RequestTimeout => true, ClientError::Call(err) => { diff --git a/core/node/commitment_generator/src/validation_task.rs b/core/node/commitment_generator/src/validation_task.rs index cf93a4899b89..a28eeabfd0fc 100644 --- a/core/node/commitment_generator/src/validation_task.rs +++ b/core/node/commitment_generator/src/validation_task.rs @@ -73,7 +73,7 @@ impl L1BatchCommitmentModeValidationTask { return Ok(()); } - Err(ContractCallError::EthereumGateway(err)) if err.is_transient() => { + Err(ContractCallError::EthereumGateway(err)) if err.is_retriable() => { tracing::warn!( "Transient error validating commitment mode, will retry after {:?}: {err}", self.retry_interval diff --git a/core/node/consensus/src/en.rs b/core/node/consensus/src/en.rs index d14893042f5b..54639441bd6e 100644 --- a/core/node/consensus/src/en.rs +++ b/core/node/consensus/src/en.rs @@ -192,7 +192,7 @@ impl EN { match res { Ok(Some(block)) => return Ok(block.try_into()?), Ok(None) => {} - Err(err) if err.is_transient() => {} + Err(err) if err.is_retriable() => {} Err(err) => { return Err(anyhow::format_err!("client.fetch_l2_block({}): {err}", n).into()); } diff --git a/core/node/consensus/src/testonly.rs b/core/node/consensus/src/testonly.rs index 8ed9b933d164..73631d1a137f 100644 --- a/core/node/consensus/src/testonly.rs +++ b/core/node/consensus/src/testonly.rs @@ -364,7 +364,7 @@ impl StateKeeper { let res = ctx.wait(client.fetch_l2_block_number()).await?; match res { Ok(_) => return Ok(client), - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { ctx.sleep(time::Duration::seconds(5)).await?; } Err(err) => { diff --git a/core/node/consistency_checker/src/lib.rs b/core/node/consistency_checker/src/lib.rs index ba8085333a4c..29b020b18607 100644 --- a/core/node/consistency_checker/src/lib.rs +++ b/core/node/consistency_checker/src/lib.rs @@ -41,10 +41,10 @@ enum CheckError { } impl CheckError { - fn is_transient(&self) -> bool { + fn is_retriable(&self) -> bool { match self { Self::Web3(err) | Self::ContractCall(ContractCallError::EthereumGateway(err)) => { - err.is_transient() + err.is_retriable() } _ => false, } @@ -535,7 +535,7 @@ impl ConsistencyChecker { self.event_handler.initialize(); while let Err(err) = self.sanity_check_diamond_proxy_addr().await { - if err.is_transient() { + if err.is_retriable() { tracing::warn!( "Transient error checking diamond proxy 
contract; will retry after a delay: {:#}", anyhow::Error::from(err) @@ -635,7 +635,7 @@ impl ConsistencyChecker { } } } - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { tracing::warn!( "Transient error while verifying L1 batch #{batch_number}; will retry after a delay: {:#}", anyhow::Error::from(err) diff --git a/core/node/da_dispatcher/src/da_dispatcher.rs b/core/node/da_dispatcher/src/da_dispatcher.rs index 99e87a1cc45d..ea1858da25d3 100644 --- a/core/node/da_dispatcher/src/da_dispatcher.rs +++ b/core/node/da_dispatcher/src/da_dispatcher.rs @@ -193,7 +193,7 @@ where { return Ok(result); } Err(err) => { - if !err.is_transient() || retries > max_retries { + if !err.is_retriable() || retries > max_retries { return Err(err); } diff --git a/core/node/eth_sender/src/error.rs b/core/node/eth_sender/src/error.rs index ed4fdaaec25a..ecb89c1f324c 100644 --- a/core/node/eth_sender/src/error.rs +++ b/core/node/eth_sender/src/error.rs @@ -12,9 +12,9 @@ pub enum EthSenderError { } impl EthSenderError { - pub fn is_transient(&self) -> bool { + pub fn is_retriable(&self) -> bool { match self { - EthSenderError::EthereumGateway(err) => err.is_transient(), + EthSenderError::EthereumGateway(err) => err.is_retriable(), _ => false, } } diff --git a/core/node/eth_sender/src/eth_tx_manager.rs b/core/node/eth_sender/src/eth_tx_manager.rs index 9f41bfde8b4d..79a9b1dfdb58 100644 --- a/core/node/eth_sender/src/eth_tx_manager.rs +++ b/core/node/eth_sender/src/eth_tx_manager.rs @@ -247,9 +247,9 @@ impl EthTxManager { match self.l1_interface.send_raw_tx(raw_tx, operator_type).await { Ok(_) => Ok(()), Err(error) => { - // In transient errors, server may have received the transaction + // With retriable errors, the server may have received the transaction // we don't want to loose record about it in case that happens - if !error.is_transient() { + if !error.is_retriable() { storage .eth_sender_dal() .remove_tx_history(tx_history_id) @@ -616,7 +616,7 @@ impl EthTxManager { // Web3 API request failures can cause this, // and anything more important is already properly reported. tracing::warn!("eth_sender error {:?}", error); - if error.is_transient() { + if error.is_retriable() { METRICS.l1_transient_errors.inc(); } } diff --git a/core/node/metadata_calculator/src/api_server/mod.rs b/core/node/metadata_calculator/src/api_server/mod.rs index c81e2ba74544..6f46e8aeea81 100644 --- a/core/node/metadata_calculator/src/api_server/mod.rs +++ b/core/node/metadata_calculator/src/api_server/mod.rs @@ -240,8 +240,8 @@ impl CheckHealth for TreeApiHttpClient { // Tree API is not a critical component, so its errors are not considered fatal for the app health. 
Err(err) => Health::from(HealthStatus::Affected).with_details(serde_json::json!({ "error": err.to_string(), - // Transient error detection is a best-effort estimate - "is_transient_error": matches!(err, TreeApiError::NotReady(_)), + // Retriable error detection is a best-effort estimate + "is_retriable_error": matches!(err, TreeApiError::NotReady(_)), })), } } diff --git a/core/node/node_sync/src/tree_data_fetcher/metrics.rs b/core/node/node_sync/src/tree_data_fetcher/metrics.rs index aad5f090e1fc..d47bf944af46 100644 --- a/core/node/node_sync/src/tree_data_fetcher/metrics.rs +++ b/core/node/node_sync/src/tree_data_fetcher/metrics.rs @@ -103,7 +103,7 @@ impl TreeDataFetcherMetrics { Ok(StepOutcome::NoProgress) => StepOutcomeLabel::NoProgress, Ok(StepOutcome::RemoteHashMissing) => StepOutcomeLabel::RemoteHashMissing, Ok(StepOutcome::PossibleReorg) => StepOutcomeLabel::PossibleReorg, - Err(err) if err.is_transient() => StepOutcomeLabel::TransientError, + Err(err) if err.is_retriable() => StepOutcomeLabel::TransientError, Err(_) => return, // fatal error; the node will exit soon anyway }; self.step_outcomes[&label].inc(); diff --git a/core/node/node_sync/src/tree_data_fetcher/mod.rs b/core/node/node_sync/src/tree_data_fetcher/mod.rs index c871ec16b9de..52037dac4edc 100644 --- a/core/node/node_sync/src/tree_data_fetcher/mod.rs +++ b/core/node/node_sync/src/tree_data_fetcher/mod.rs @@ -44,9 +44,9 @@ impl From<EnrichedClientError> for TreeDataFetcherError { } impl TreeDataFetcherError { - fn is_transient(&self) -> bool { + fn is_retriable(&self) -> bool { match self { - Self::Rpc(err) => err.is_transient(), + Self::Rpc(err) => err.is_retriable(), Self::Internal(_) => false, } } @@ -268,7 +268,7 @@ impl TreeDataFetcher { self.health_updater.update(health.into()); } - /// Runs this component until a fatal error occurs or a stop signal is received. Transient errors + /// Runs this component until a fatal error occurs or a stop signal is received. Retriable errors /// (e.g., no network connection) are handled gracefully by retrying after a delay. 
pub async fn run(mut self, mut stop_receiver: watch::Receiver<bool>) -> anyhow::Result<()> { self.metrics.observe_info(&self); @@ -304,7 +304,7 @@ impl TreeDataFetcher { self.health_updater.update(health.into()); true } - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { tracing::warn!( "Transient error in tree data fetcher, will retry after a delay: {err:?}" ); diff --git a/core/node/node_sync/src/tree_data_fetcher/provider/mod.rs b/core/node/node_sync/src/tree_data_fetcher/provider/mod.rs index 867ea2427541..96819101fa2b 100644 --- a/core/node/node_sync/src/tree_data_fetcher/provider/mod.rs +++ b/core/node/node_sync/src/tree_data_fetcher/provider/mod.rs @@ -328,7 +328,7 @@ impl TreeDataProvider for CombinedDataProvider { match l1_result { Err(err) => { - if err.is_transient() { + if err.is_retriable() { tracing::info!( "Transient error calling L1 data provider: {:#}", anyhow::Error::from(err) diff --git a/core/node/node_sync/src/validate_chain_ids_task.rs b/core/node/node_sync/src/validate_chain_ids_task.rs index 0f80bf799b15..3507cbb2b2f9 100644 --- a/core/node/node_sync/src/validate_chain_ids_task.rs +++ b/core/node/node_sync/src/validate_chain_ids_task.rs @@ -87,9 +87,9 @@ impl ValidateChainIdsTask { ); return Ok(()); } - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { tracing::warn!( - "Transient error getting L1 chain ID from main node client, will retry in {:?}: {err}", + "Retriable error getting L1 chain ID from main node client, will retry in {:?}: {err}", Self::BACKOFF_INTERVAL ); tokio::time::sleep(Self::BACKOFF_INTERVAL).await; @@ -123,7 +123,7 @@ impl ValidateChainIdsTask { ); return Ok(()); } - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { tracing::warn!( "Transient error getting L2 chain ID from main node client, will retry in {:?}: {err}", Self::BACKOFF_INTERVAL diff --git a/core/node/reorg_detector/src/lib.rs b/core/node/reorg_detector/src/lib.rs index 5945b201c16c..d1954ca4b74b 100644 --- a/core/node/reorg_detector/src/lib.rs +++ b/core/node/reorg_detector/src/lib.rs @@ -70,9 +70,9 @@ pub enum Error { } impl HashMatchError { - pub fn is_transient(&self) -> bool { + pub fn is_retriable(&self) -> bool { match self { - Self::Rpc(err) => err.is_transient(), + Self::Rpc(err) => err.is_retriable(), Self::MissingData(_) => true, Self::Internal(_) => false, } } } impl Error { - pub fn is_transient(&self) -> bool { - matches!(self, Self::HashMatch(err) if err.is_transient()) + pub fn is_retriable(&self) -> bool { + matches!(self, Self::HashMatch(err) if err.is_retriable()) } } @@ -433,7 +433,7 @@ impl ReorgDetector { /// - `Err(ReorgDetected(_))` if a reorg was detected. /// - `Err(_)` for fatal errors. /// - /// Transient errors are retried indefinitely accounting for a stop signal. + /// Retriable errors are retried indefinitely accounting for a stop signal. 
pub async fn run_once(&mut self, stop_receiver: watch::Receiver<bool>) -> Result<(), Error> { self.run_inner(true, stop_receiver).await } @@ -459,7 +459,7 @@ impl ReorgDetector { tracing::debug!("Last L1 batch on the main node doesn't have a state root hash; waiting until it is computed"); self.sleep_interval / 10 } - Err(err) if err.is_transient() => { + Err(err) if err.is_retriable() => { tracing::warn!("Following transient error occurred: {err}"); tracing::info!("Trying again after a delay"); self.sleep_interval diff --git a/core/node/reorg_detector/src/tests.rs b/core/node/reorg_detector/src/tests.rs index c90a3a0592c7..5465cf8662d6 100644 --- a/core/node/reorg_detector/src/tests.rs +++ b/core/node/reorg_detector/src/tests.rs @@ -704,7 +704,7 @@ async fn detector_does_not_deadlock_if_main_node_is_not_available() { .unwrap(); drop(storage); - // `client` will always return transient errors making the detector to retry its checks indefinitely + // `client` will always return retriable errors, making the detector retry its checks indefinitely let client = MockMainNodeClient::default(); *client.error_kind.lock().unwrap() = Some(RpcErrorKind::Transient); let mut detector = create_mock_detector(client, pool);
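Note (illustrative, not part of the diff): the classification expected after both patches, written as a test against the `ObjectStoreError` variants defined in `raw.rs` above; it assumes the crate-internal types are in scope.

#[test]
fn retriable_classification() {
    // Network-ish failures carry an explicit retriable flag...
    let err = ObjectStoreError::Other {
        is_retriable: true,
        source: "connection reset by peer".into(),
    };
    assert!(err.is_retriable());
    assert!(err.to_string().starts_with("retriable error"));

    // ...while missing keys (and serialization failures) are never retried.
    let err = ObjectStoreError::KeyNotFound("no such blob".into());
    assert!(!err.is_retriable());
}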