diff --git a/consensus/safety-rules/src/safety_rules.rs b/consensus/safety-rules/src/safety_rules.rs index 124bff17f322b..60b0b8cc27a05 100644 --- a/consensus/safety-rules/src/safety_rules.rs +++ b/consensus/safety-rules/src/safety_rules.rs @@ -431,7 +431,7 @@ where v }) .map_err(|err| { - error!(log_cb(SafetyLogSchema::new(log_entry, LogEvent::Error)).error(&err)); + warn!(log_cb(SafetyLogSchema::new(log_entry, LogEvent::Error)).error(&err)); counters::increment_query(log_entry.as_str(), "error"); err }) diff --git a/consensus/src/counters.rs b/consensus/src/counters.rs index 208f3d5e78322..542d4eeb43578 100644 --- a/consensus/src/counters.rs +++ b/consensus/src/counters.rs @@ -897,10 +897,11 @@ pub fn update_counters_for_committed_blocks(blocks_to_commit: &[Arc = Lazy::new(|| { - register_int_counter!( - "aptos_consensus_proof_wrong_epoch", - "Count of the number of epoch proofs received for the wrong epoch", +pub static EPOCH_MANAGER_ISSUES_DETAILS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "aptos_consensus_epoch_manager_issues", + "Count of occurences of different epoch manager processing issues.", + &["kind"] ) .unwrap() }); diff --git a/consensus/src/epoch_manager.rs b/consensus/src/epoch_manager.rs index b294cc28dbd3c..75f79b2249adc 100644 --- a/consensus/src/epoch_manager.rs +++ b/consensus/src/epoch_manager.rs @@ -484,10 +484,17 @@ impl EpochManager

{ end_epoch: different_epoch, }; let msg = ConsensusMsg::EpochRetrievalRequest(Box::new(request)); - self.network_sender.send_to(peer_id, msg).context(format!( - "[EpochManager] Failed to send epoch retrieval to {}", - peer_id - )) + if let Err(err) = self.network_sender.send_to(peer_id, msg) { + warn!( + "[EpochManager] Failed to send epoch retrieval to {}, {:?}", + peer_id, err + ); + counters::EPOCH_MANAGER_ISSUES_DETAILS + .with_label_values(&["failed_to_send_epoch_retrieval"]) + .inc(); + } + + Ok(()) }, Ordering::Equal => { bail!("[EpochManager] Same epoch should not come to process_different_epoch"); @@ -1287,7 +1294,9 @@ impl EpochManager

{ msg_epoch, self.epoch() ); - counters::EPOCH_PROOF_WRONG_EPOCH.inc(); + counters::EPOCH_MANAGER_ISSUES_DETAILS + .with_label_values(&["epoch_proof_wrong_epoch"]) + .inc(); } }, ConsensusMsg::EpochRetrievalRequest(request) => { @@ -1430,7 +1439,11 @@ impl EpochManager

{ if let Some(tx) = &self.buffer_manager_msg_tx { tx.push(peer_id, request) } else { - Err(anyhow::anyhow!("Buffer manager not started")) + counters::EPOCH_MANAGER_ISSUES_DETAILS + .with_label_values(&["buffer_manager_not_started"]) + .inc(); + warn!("Buffer manager not started"); + Ok(()) } }, IncomingRpcRequest::RandGenRequest(_) => Ok(()), diff --git a/consensus/src/liveness/leader_reputation.rs b/consensus/src/liveness/leader_reputation.rs index 29349b8980782..f14ce23b744f0 100644 --- a/consensus/src/liveness/leader_reputation.rs +++ b/consensus/src/liveness/leader_reputation.rs @@ -180,7 +180,8 @@ impl MetadataBackend for AptosDBBackend { self.get_from_db_result(target_epoch, target_round, &events, hit_end) }, Err(e) => { - error!( + // Fails if the requested events were pruned or were never backfilled. + warn!( error = ?e, "[leader reputation] Fail to refresh window", ); (vec![], HashValue::zero()) diff --git a/state-sync/storage-service/server/src/handler.rs b/state-sync/storage-service/server/src/handler.rs index 79941ae3b4794..d1748ae8a72d9 100644 --- a/state-sync/storage-service/server/src/handler.rs +++ b/state-sync/storage-service/server/src/handler.rs @@ -17,7 +17,7 @@ use crate::{ utils, }; use aptos_config::{config::StorageServiceConfig, network_id::PeerNetworkId}; -use aptos_logger::{debug, error, sample, sample::SampleRate, trace, warn}; +use aptos_logger::{debug, sample, sample::SampleRate, trace, warn}; use aptos_network::protocols::wire::handshake::v1::ProtocolId; use aptos_storage_service_types::{ requests::{ @@ -146,7 +146,7 @@ impl Handler { // Periodically log the failure sample!( SampleRate::Duration(Duration::from_secs(ERROR_LOG_FREQUENCY_SECS)), - error!(LogSchema::new(LogEntry::StorageServiceError) + warn!(LogSchema::new(LogEntry::StorageServiceError) .error(&error) .peer_network_id(peer_network_id) .request(&request)