Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(proof-data-handler): exclude batches without object file in GCS #2980

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
9188f8a
feat(proof-data-handler): exclude batches without object file in GCS
pbeza Sep 27, 2024
851e3ed
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
19401af
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
a559c40
Revert unintended code artifact
pbeza Nov 18, 2024
16a0e55
Remove direct dependency on zksync_basic_types
pbeza Nov 18, 2024
5de035b
Use `break` and `return` consistently
pbeza Nov 18, 2024
6e27751
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 18, 2024
8503149
fixup! Remove direct dependency on zksync_basic_types
pbeza Nov 18, 2024
8826898
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
0dd0121
Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
1f70f44
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
8a80b02
fixup! Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
fe2534b
fixup! Make batch_ignored_timeout configurable + refine comments
pbeza Nov 19, 2024
7e9f9a8
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 19, 2024
ada62fe
Retry failed batches after 1 minute instead of 10 minutes
pbeza Nov 20, 2024
c61c1fa
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 20, 2024
733c27d
Merge remote-tracking branch 'origin/main' into tee/flag-old-batches-…
pbeza Nov 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions core/lib/dal/doc/TeeProofGenerationDal.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
title: Status Diagram
---
stateDiagram-v2
[*] --> unpicked : insert_tee_proof_generation_job
unpicked --> picked_by_prover : lock_batch_for_proving
[*] --> picked_by_prover : lock
picked_by_prover --> generated : save_proof_artifacts_metadata
picked_by_prover --> unpicked : unlock_batch
picked_by_prover --> permanently_ignored : unlock_batch
picked_by_prover --> failed : unlock_batch
failed --> picked_by_prover : lock
permanently_ignored --> [*]
generated --> [*]
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
-- There were manually added tee_proof_generation_details entries with status 'permanently_ignore'.

UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore';

-- Entries with the status 'unpicked' were not used at all after the migration to the logic
-- introduced in https://github.com/matter-labs/zksync-era/pull/3017. This was overlooked.

DELETE FROM tee_proof_generation_details WHERE status = 'unpicked';
20 changes: 19 additions & 1 deletion core/lib/dal/src/models/storage_tee_proof.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use chrono::NaiveDateTime;
use chrono::{DateTime, NaiveDateTime, Utc};
use zksync_types::L1BatchNumber;

use crate::tee_proof_generation_dal::LockedBatch;

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageTeeProof {
Expand All @@ -8,3 +11,18 @@ pub struct StorageTeeProof {
pub updated_at: NaiveDateTime,
pub attestation: Option<Vec<u8>>,
}

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageLockedBatch {
pub l1_batch_number: i64,
pub created_at: NaiveDateTime,
}

impl From<StorageLockedBatch> for LockedBatch {
fn from(tx: StorageLockedBatch) -> LockedBatch {
LockedBatch {
l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32),
created_at: DateTime::<Utc>::from_naive_utc_and_offset(tx.created_at, Utc),
}
}
}
64 changes: 44 additions & 20 deletions core/lib/dal/src/tee_proof_generation_dal.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#![doc = include_str!("../doc/TeeProofGenerationDal.md")]
use std::time::Duration;

use chrono::{DateTime, Utc};
use strum::{Display, EnumString};
use zksync_db_connection::{
connection::Connection,
Expand All @@ -10,21 +11,42 @@ use zksync_db_connection::{
};
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::{models::storage_tee_proof::StorageTeeProof, Core};
use crate::{
models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof},
Core,
};

#[derive(Debug)]
pub struct TeeProofGenerationDal<'a, 'c> {
pub(crate) storage: &'a mut Connection<'c, Core>,
}

#[derive(Debug, EnumString, Display)]
enum TeeProofGenerationJobStatus {
#[strum(serialize = "unpicked")]
Unpicked,
#[derive(Debug, Clone, Copy, EnumString, Display)]
pub enum TeeProofGenerationJobStatus {
#[strum(serialize = "picked_by_prover")]
PickedByProver,
#[strum(serialize = "generated")]
Generated,
#[strum(serialize = "failed")]
Failed,
#[strum(serialize = "permanently_ignored")]
PermanentlyIgnored,
}

/// Represents a locked batch picked by a TEE prover. A batch is locked when taken by a TEE prover
/// ([TeeProofGenerationJobStatus::PickedByProver]). It can transition to one of three states:
/// 1. [TeeProofGenerationJobStatus::Generated] when the proof is successfully submitted.
/// 2. [TeeProofGenerationJobStatus::Failed] when the proof generation fails, which can happen if
/// its inputs (GCS blob files) are incomplete or the API is unavailable for an extended period.
/// 3. [TeeProofGenerationJobStatus::PermanentlyIgnored] when the proof generation has been
/// continuously failing for an extended period.
#[derive(Clone, Debug)]
pub struct LockedBatch {
/// Locked batch number.
pub l1_batch_number: L1BatchNumber,
/// The creation time of the job for this batch. It is used to determine if the batch should
/// transition to [TeeProofGenerationJobStatus::PermanentlyIgnored] or [TeeProofGenerationJobStatus::Failed].
pub created_at: DateTime<Utc>,
}

impl TeeProofGenerationDal<'_, '_> {
Expand All @@ -33,10 +55,11 @@ impl TeeProofGenerationDal<'_, '_> {
tee_type: TeeType,
processing_timeout: Duration,
min_batch_number: L1BatchNumber,
) -> DalResult<Option<L1BatchNumber>> {
) -> DalResult<Option<LockedBatch>> {
let processing_timeout = pg_interval_from_duration(processing_timeout);
let min_batch_number = i64::from(min_batch_number.0);
sqlx::query!(
let locked_batch = sqlx::query_as!(
StorageLockedBatch,
r#"
WITH upsert AS (
SELECT
Expand All @@ -57,11 +80,8 @@ impl TeeProofGenerationDal<'_, '_> {
AND (
tee.l1_batch_number IS NULL
OR (
tee.status = $3
OR (
tee.status = $2
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
(tee.status = $2 OR tee.status = $3)
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
)
FETCH FIRST ROW ONLY
Expand All @@ -87,11 +107,12 @@ impl TeeProofGenerationDal<'_, '_> {
updated_at = NOW(),
prover_taken_at = NOW()
RETURNING
l1_batch_number
l1_batch_number,
created_at
"#,
tee_type.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::Failed.to_string(),
processing_timeout,
min_batch_number
)
Expand All @@ -100,14 +121,17 @@ impl TeeProofGenerationDal<'_, '_> {
.with_arg("processing_timeout", &processing_timeout)
.with_arg("l1_batch_number", &min_batch_number)
.fetch_optional(self.storage)
.await
.map(|record| record.map(|record| L1BatchNumber(record.l1_batch_number as u32)))
.await?
.map(Into::into);

Ok(locked_batch)
}

pub async fn unlock_batch(
&mut self,
l1_batch_number: L1BatchNumber,
tee_type: TeeType,
status: TeeProofGenerationJobStatus,
) -> DalResult<()> {
let batch_number = i64::from(l1_batch_number.0);
sqlx::query!(
Expand All @@ -120,7 +144,7 @@ impl TeeProofGenerationDal<'_, '_> {
l1_batch_number = $2
AND tee_type = $3
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
status.to_string(),
batch_number,
tee_type.to_string()
)
Expand Down Expand Up @@ -266,7 +290,7 @@ impl TeeProofGenerationDal<'_, '_> {
"#,
batch_number,
tee_type.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let instrumentation = Instrumented::new("insert_tee_proof_generation_job")
.with_arg("l1_batch_number", &batch_number)
Expand All @@ -281,7 +305,7 @@ impl TeeProofGenerationDal<'_, '_> {
}

/// For testing purposes only.
pub async fn get_oldest_unpicked_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
pub async fn get_oldest_picked_by_prover_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
let query = sqlx::query!(
r#"
SELECT
Expand All @@ -295,7 +319,7 @@ impl TeeProofGenerationDal<'_, '_> {
LIMIT
1
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let batch_number = Instrumented::new("get_oldest_unpicked_batch")
.with(query)
Expand Down
13 changes: 12 additions & 1 deletion core/lib/object_store/src/retries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ impl Request<'_> {
self,
store: &impl fmt::Debug,
max_retries: u16,
f: F,
) -> Result<T, ObjectStoreError>
where
Fut: Future<Output = Result<T, ObjectStoreError>>,
F: FnMut() -> Fut,
{
self.retry_internal(max_retries, f).await
}

async fn retry_internal<T, Fut, F>(
&self,
max_retries: u16,
pbeza marked this conversation as resolved.
Show resolved Hide resolved
mut f: F,
) -> Result<T, ObjectStoreError>
where
Expand All @@ -53,7 +65,6 @@ impl Request<'_> {
backoff_secs *= 2;
}
Err(err) => {
tracing::warn!(%err, "Failed request with a fatal error");
break Err(err);
}
}
Expand Down
2 changes: 1 addition & 1 deletion core/lib/types/src/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use serde_json::Value;
use serde_with::{hex::Hex, serde_as};
use strum::Display;
use zksync_basic_types::{
tee_types::TeeType,
web3::{AccessList, Bytes, Index},
Bloom, L1BatchNumber, H160, H256, H64, U256, U64,
};
Expand All @@ -16,6 +15,7 @@ pub use crate::transaction_request::{
use crate::{
debug_flat_call::{DebugCallFlat, ResultDebugCallFlat},
protocol_version::L1VerifierConfig,
tee_types::TeeType,
Address, L2BlockNumber, ProtocolVersionId,
};

Expand Down
4 changes: 2 additions & 2 deletions core/node/proof_data_handler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ axum.workspace = true
tokio.workspace = true
tower-http = { workspace = true, features = ["compression-zstd", "decompression-zstd"] }
tracing.workspace = true
chrono.workspace = true

[dev-dependencies]
hyper.workspace = true
chrono.workspace = true
zksync_multivm.workspace = true
serde_json.workspace = true
tower.workspace = true
zksync_basic_types.workspace = true
zksync_contracts.workspace = true
zksync_basic_types.workspace = true
pbeza marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading