feat(proof-data-handler): exclude batches without object file in GCS
The `/tee/proof_inputs` endpoint no longer returns batches whose
corresponding object file has been missing from Google Cloud Storage
for an extended period.

Since the recent redeployment of `24.25.0` on `mainnet`, we've been
[flooded with warnings][warnings] from the `proof-data-handler` (the
warnings are actually _not_ fatal in this context):

```
Failed request with a fatal error

(...)

Blobs for batch numbers 490520 to 490555 not found in the object store.
Marked as unpicked.
```

The issue was caused [by the code][code] behind the `/tee/proof_inputs`
[endpoint][endpoint_proof_inputs] (which is equivalent to the
`/proof_generation_data` [endpoint][endpoint_proof_generation_data]): it
finds the next batch to send to the [requesting][requesting]
`tee-prover` by looking for the first batch that has a corresponding
object in the Google Cloud Storage object store. As it skips over
batches that lack such objects, [it logs][logging] `Failed request with
a fatal error` for each one (unless the skipped batch was already
successfully proven, in which case no error is logged). This happens on
every [request][request] the `tee-prover` sends, which is why the logs
were so noisy.
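
As a rough illustration, the lookup boils down to the following toy
model. It is a simplified, self-contained sketch, not the actual
`tee_request_processor.rs` API; `fetch_blob` stands in for the object
store read whose `KeyNotFound` result is logged before the batch is
skipped:

```rust
/// Toy model of the old lookup (illustrative only, not the real zksync-era API).
/// `fetch_blob` stands in for the object-store read that logs
/// "Failed request with a fatal error" whenever a batch has no blob.
fn next_provable_batch(
    first_unproven: u32,
    last_known: u32,
    fetch_blob: impl Fn(u32) -> Option<Vec<u8>>,
) -> Option<(u32, Vec<u8>)> {
    (first_unproven..=last_known).find_map(|batch| {
        let blob = fetch_blob(batch);
        if blob.is_none() {
            // One warning per missing batch, repeated on every /tee/proof_inputs request.
            eprintln!("Failed request with a fatal error (batch {batch} has no blob)");
        }
        blob.map(|bytes| (batch, bytes))
    })
}
```

Since nothing ever resolves the permanently missing batches, the same
warnings repeat on every polling cycle.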

One possible solution was to manually flag the problematic batches as
`permanently_ignored`, like Thomas [did before][Thomas] on `mainnet`.
That was a quick and dirty workaround; this change automates it:
batches whose input object has been missing from the object store for
an extended period are now flagged as `permanently_ignored` and
excluded from the `/tee/proof_inputs` response.
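
The core of the automated rule is a time-based cutoff: when a locked
batch still has no input object in the store, its job's `created_at`
timestamp decides whether it goes back to `unpicked` or becomes
`permanently_ignored`. Below is a minimal, self-contained sketch of
that decision; the types and the 10-day threshold are illustrative
assumptions, not the exact values used in the codebase:

```rust
use chrono::{Duration, NaiveDateTime};

/// Simplified stand-ins for the new status and locked-batch types (illustrative only).
#[derive(Debug, PartialEq)]
enum UnlockStatus {
    Unpicked,
    PermanentlyIgnored,
}

struct LockedBatch {
    l1_batch_number: u32,
    /// Creation time of the proof-generation job for this batch.
    created_at: NaiveDateTime,
}

/// Decide how to unlock a batch whose input object is missing from the object store.
fn status_for_missing_blob(batch: &LockedBatch, now: NaiveDateTime) -> UnlockStatus {
    // The 10-day threshold is an assumption for illustration, not the actual value.
    if now - batch.created_at > Duration::days(10) {
        // The blob has been absent for an extended period: stop retrying and
        // exclude the batch from future /tee/proof_inputs responses.
        UnlockStatus::PermanentlyIgnored
    } else {
        // The blob may simply not have been uploaded yet: keep the batch unpicked.
        UnlockStatus::Unpicked
    }
}
```

In the diff, this is wired through `unlock_batch`, which now takes the
target status instead of always resetting the batch to `unpicked`, and
through the new `get_optional` object-store helper, which turns a
missing key into `Ok(None)` instead of a logged error.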

[warnings]: https://grafana.matterlabs.dev/goto/TjlaXQgHg?orgId=1
[code]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/tee_request_processor.rs#L35-L79
[endpoint_proof_inputs]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L96
[endpoint_proof_generation_data]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L67
[requesting]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/bin/zksync_tee_prover/src/tee_prover.rs#L93
[logging]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/lib/object_store/src/retries.rs#L56
[Thomas]: https://matter-labs-workspace.slack.com/archives/C05ANUCGCKV/p1725284962312929
pbeza committed Sep 30, 2024
1 parent 79b6fcf commit 65cc26e
Showing 13 changed files with 200 additions and 34 deletions.
1 change: 0 additions & 1 deletion core/lib/basic_types/src/lib.rs
@@ -30,7 +30,6 @@ pub mod protocol_version;
pub mod prover_dal;
pub mod seed_phrase;
pub mod settlement;
pub mod tee_types;
pub mod url;
pub mod vm;
pub mod web3;

Some generated files are not rendered by default.

@@ -0,0 +1 @@
UPDATE tee_proof_generation_details SET status = 'permanently_ignore' WHERE status = 'permanently_ignored';
@@ -0,0 +1,2 @@
-- there were manually added tee_proof_generation_details with status 'permanently_ignore'
UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore';
16 changes: 16 additions & 0 deletions core/lib/dal/src/models/storage_tee_proof.rs
@@ -1,4 +1,5 @@
use chrono::NaiveDateTime;
use zksync_types::{tee_types::LockedBatch, L1BatchNumber};

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageTeeProof {
@@ -8,3 +9,18 @@ pub struct StorageTeeProof {
pub updated_at: NaiveDateTime,
pub attestation: Option<Vec<u8>>,
}

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageLockedBatch {
pub l1_batch_number: i64,
pub created_at: NaiveDateTime,
}

impl From<StorageLockedBatch> for LockedBatch {
fn from(tx: StorageLockedBatch) -> LockedBatch {
LockedBatch {
l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32),
created_at: tx.created_at,
}
}
}
31 changes: 20 additions & 11 deletions core/lib/dal/src/tee_proof_generation_dal.rs
@@ -8,11 +8,15 @@ use zksync_db_connection::{
instrument::{InstrumentExt, Instrumented},
utils::pg_interval_from_duration,
};
use zksync_types::{tee_types::TeeType, L1BatchNumber};
use zksync_types::{
tee_types::{LockedBatch, TeeType},
L1BatchNumber,
};

use crate::{
models::storage_tee_proof::StorageTeeProof,
tee_verifier_input_producer_dal::TeeVerifierInputProducerJobStatus, Core,
models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof},
tee_verifier_input_producer_dal::TeeVerifierInputProducerJobStatus,
Core,
};

#[derive(Debug)]
@@ -21,13 +21,25 @@ pub struct TeeProofGenerationDal<'a, 'c> {
}

#[derive(Debug, EnumString, Display)]
enum TeeProofGenerationJobStatus {
pub enum TeeProofGenerationJobStatus {
#[strum(serialize = "unpicked")]
Unpicked,
#[strum(serialize = "picked_by_prover")]
PickedByProver,
#[strum(serialize = "generated")]
Generated,
#[strum(serialize = "permanently_ignored")]
PermanentlyIgnored,
}

impl TeeProofGenerationDal<'_, '_> {
@@ -36,10 +42,11 @@ impl TeeProofGenerationDal<'_, '_> {
tee_type: TeeType,
processing_timeout: Duration,
min_batch_number: Option<L1BatchNumber>,
) -> DalResult<Option<L1BatchNumber>> {
) -> DalResult<Option<LockedBatch>> {
let processing_timeout = pg_interval_from_duration(processing_timeout);
let min_batch_number = min_batch_number.map_or(0, |num| i64::from(num.0));
let query = sqlx::query!(
let query = sqlx::query_as!(
StorageLockedBatch,
r#"
UPDATE tee_proof_generation_details
SET
@@ -72,7 +79,8 @@ impl TeeProofGenerationDal<'_, '_> {
SKIP LOCKED
)
RETURNING
tee_proof_generation_details.l1_batch_number
tee_proof_generation_details.l1_batch_number,
tee_proof_generation_details.created_at
"#,
TeeProofGenerationJobStatus::PickedByProver.to_string(),
tee_type.to_string(),
@@ -82,22 +90,23 @@ impl TeeProofGenerationDal<'_, '_> {
min_batch_number
);

let batch_number = Instrumented::new("lock_batch_for_proving")
let locked_batch_details = Instrumented::new("lock_batch_for_proving")
.with_arg("tee_type", &tee_type)
.with_arg("processing_timeout", &processing_timeout)
.with_arg("l1_batch_number", &min_batch_number)
.with(query)
.fetch_optional(self.storage)
.await?
.map(|row| L1BatchNumber(row.l1_batch_number as u32));
.map(Into::into);

Ok(batch_number)
Ok(locked_batch_details)
}

pub async fn unlock_batch(
&mut self,
l1_batch_number: L1BatchNumber,
tee_type: TeeType,
status: TeeProofGenerationJobStatus,
) -> DalResult<()> {
let batch_number = i64::from(l1_batch_number.0);
sqlx::query!(
Expand All @@ -110,7 +119,7 @@ impl TeeProofGenerationDal<'_, '_> {
l1_batch_number = $2
AND tee_type = $3
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
status.to_string(),
batch_number,
tee_type.to_string()
)
21 changes: 21 additions & 0 deletions core/lib/object_store/src/objects.rs
@@ -140,6 +140,27 @@ impl dyn ObjectStore + '_ {
V::deserialize(bytes).map_err(ObjectStoreError::Serialization)
}

/// Fetches the value for the given key, returning an Option.
///
/// # Errors
///
/// Returns an error if the object cannot be accessed or cannot be deserialized.
#[tracing::instrument(name = "ObjectStore::get_optional", skip_all, fields(key))]
pub async fn get_optional<V: StoredObject>(
&self,
key: V::Key<'_>,
) -> Result<Option<V>, ObjectStoreError> {
let key = V::encode_key(key);
tracing::Span::current().record("key", key.as_str());
match self.get_raw(V::BUCKET, &key).await {
Ok(bytes) => V::deserialize(bytes)
.map(Some)
.map_err(ObjectStoreError::Serialization),
Err(ObjectStoreError::KeyNotFound(_)) => Ok(None),
Err(err) => Err(err),
}
}

/// Fetches the value for the given encoded key if it exists.
///
/// # Errors
38 changes: 37 additions & 1 deletion core/lib/object_store/src/retries.rs
@@ -28,6 +28,43 @@ impl Request<'_> {
self,
store: &impl fmt::Debug,
max_retries: u16,
f: F,
) -> Result<T, ObjectStoreError>
where
Fut: Future<Output = Result<T, ObjectStoreError>>,
F: FnMut() -> Fut,
{
self.retry_internal(max_retries, f).await
}

#[tracing::instrument(
name = "object_store::Request::retry_optional",
skip(f), // output request and store as a part of structured logs
fields(retries) // Will be recorded before returning from the function
)]
async fn retry_optional<T, Fut, F>(
self,
store: &impl fmt::Debug,
max_retries: u16,
f: F,
) -> Result<Option<T>, ObjectStoreError>
where
Fut: Future<Output = Result<T, ObjectStoreError>>,
F: FnMut() -> Fut,
{
match self.retry_internal(max_retries, f).await {
Ok(result) => Ok(Some(result)),
Err(ObjectStoreError::KeyNotFound(_)) => Ok(None),
Err(err) => {
tracing::warn!(%err, "Failed request with a fatal error");
Err(err)
}
}
}

async fn retry_internal<T, Fut, F>(
&self,
max_retries: u16,
mut f: F,
) -> Result<T, ObjectStoreError>
where
@@ -53,7 +90,6 @@ impl Request<'_> {
backoff_secs *= 2;
}
Err(err) => {
tracing::warn!(%err, "Failed request with a fatal error");
break Err(err);
}
}
5 changes: 2 additions & 3 deletions core/lib/types/src/api/mod.rs
@@ -4,7 +4,6 @@ use serde_json::Value;
use serde_with::{hex::Hex, serde_as};
use strum::Display;
use zksync_basic_types::{
tee_types::TeeType,
web3::{AccessList, Bytes, Index},
Bloom, L1BatchNumber, H160, H256, H64, U256, U64,
};
Expand All @@ -14,8 +13,8 @@ pub use crate::transaction_request::{
Eip712Meta, SerializationTransactionError, TransactionRequest,
};
use crate::{
debug_flat_call::DebugCallFlat, protocol_version::L1VerifierConfig, Address, L2BlockNumber,
ProtocolVersionId,
debug_flat_call::DebugCallFlat, protocol_version::L1VerifierConfig, tee_types::TeeType,
Address, L2BlockNumber, ProtocolVersionId,
};

pub mod en;
1 change: 1 addition & 0 deletions core/lib/types/src/lib.rs
@@ -47,6 +47,7 @@ pub mod pubdata_da;
pub mod snapshots;
pub mod storage;
pub mod system_contracts;
pub mod tee_types;
pub mod tokens;
pub mod tx;
pub mod zk_evm_types;
@@ -1,6 +1,8 @@
use std::fmt;

use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use zksync_basic_types::L1BatchNumber;

#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
@@ -17,6 +19,17 @@ impl fmt::Display for TeeType {
}
}

/// Representation of a locked batch. Used in DAL to fetch details about the locked batch to
/// determine whether it should be flagged as permanently ignored if it has no corresponding file in
/// the object store for an extended period.
#[derive(Clone, Debug)]
pub struct LockedBatch {
/// Locked batch number.
pub l1_batch_number: L1BatchNumber,
/// The creation time of the job for this batch.
pub created_at: NaiveDateTime,
}

#[cfg(test)]
mod tests {
use serde_json;
4 changes: 2 additions & 2 deletions core/node/proof_data_handler/Cargo.toml
@@ -21,12 +21,12 @@ anyhow.workspace = true
axum.workspace = true
tokio.workspace = true
tracing.workspace = true
chrono.workspace = true

[dev-dependencies]
hyper.workspace = true
chrono.workspace = true
zksync_multivm.workspace = true
serde_json.workspace = true
tower.workspace = true
zksync_basic_types.workspace = true
zksync_contracts.workspace = true
zksync_basic_types.workspace = true