diff --git a/core/bin/zksync_tee_prover/src/tee_prover.rs b/core/bin/zksync_tee_prover/src/tee_prover.rs index 64a3a9c5749d..7f874533b4b3 100644 --- a/core/bin/zksync_tee_prover/src/tee_prover.rs +++ b/core/bin/zksync_tee_prover/src/tee_prover.rs @@ -201,8 +201,8 @@ impl Task for TeeProver { if !err.is_retriable() || retries > self.config.max_retries { return Err(err.into()); } - retries += 1; tracing::warn!(%err, "Failed TEE prover step function {retries}/{}, retrying in {} milliseconds.", self.config.max_retries, backoff.as_millis()); + retries += 1; backoff = std::cmp::min( backoff.mul_f32(self.config.retry_backoff_multiplier), self.config.max_backoff, diff --git a/core/lib/dal/.sqlx/query-ff33517207e55508935d4647165361094f3444b2206c564d5ea4d11cca3fb8fe.json b/core/lib/dal/.sqlx/query-ff33517207e55508935d4647165361094f3444b2206c564d5ea4d11cca3fb8fe.json new file mode 100644 index 000000000000..995fe51fab76 --- /dev/null +++ b/core/lib/dal/.sqlx/query-ff33517207e55508935d4647165361094f3444b2206c564d5ea4d11cca3fb8fe.json @@ -0,0 +1,15 @@ +{ + "db_name": "PostgreSQL", + "query": "\n UPDATE tee_proof_generation_details\n SET\n status = 'skipped',\n updated_at = NOW()\n WHERE\n l1_batch_number = $1\n AND tee_type = $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Int8", + "Text" + ] + }, + "nullable": [] + }, + "hash": "ff33517207e55508935d4647165361094f3444b2206c564d5ea4d11cca3fb8fe" +} diff --git a/core/lib/dal/doc/TeeProofGenerationDal.md b/core/lib/dal/doc/TeeProofGenerationDal.md index 23474d5cb5c5..96b3809fddf2 100644 --- a/core/lib/dal/doc/TeeProofGenerationDal.md +++ b/core/lib/dal/doc/TeeProofGenerationDal.md @@ -16,4 +16,7 @@ ready_to_be_proven --> picked_by_prover : get_next_batch_to_be_proven picked_by_prover --> generated : save_proof_artifacts_metadata generated --> [*] +picked_by_prover --> skipped : mark_proof_generation_job_as_skipped +skipped --> [*] + ``` diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs 
b/core/lib/dal/src/tee_proof_generation_dal.rs
index 2bd73323eb10..275e8a089f60 100644
--- a/core/lib/dal/src/tee_proof_generation_dal.rs
+++ b/core/lib/dal/src/tee_proof_generation_dal.rs
@@ -150,6 +150,49 @@ impl TeeProofGenerationDal<'_, '_> {
         Ok(())
     }
 
+    /// Sets the status of the TEE proof generation job identified by `batch_number` and
+    /// `tee_type` to `skipped` and refreshes its `updated_at` timestamp.
+    ///
+    /// Returns a constraint error if the update does not affect any row, i.e. when no job
+    /// is stored for the given batch number and TEE type.
+    pub async fn mark_proof_generation_job_as_skipped(
+        &mut self,
+        batch_number: L1BatchNumber,
+        tee_type: TeeType,
+    ) -> DalResult<()> {
+        let l1_batch_number = i64::from(batch_number.0);
+        let query = sqlx::query!(
+            r#"
+            UPDATE tee_proof_generation_details
+            SET
+                status = 'skipped',
+                updated_at = NOW()
+            WHERE
+                l1_batch_number = $1
+                AND tee_type = $2
+            "#,
+            l1_batch_number,
+            tee_type.to_string()
+        );
+        let instrumentation = Instrumented::new("mark_proof_generation_job_as_skipped")
+            .with_arg("l1_batch_number", &l1_batch_number)
+            .with_arg("tee_type", &tee_type);
+        let result = instrumentation
+            .clone()
+            .with(query)
+            .execute(self.storage)
+            .await?;
+        if result.rows_affected() == 0 {
+            let err = instrumentation.constraint_error(anyhow::anyhow!(
+                "Cannot mark proof as skipped because batch number {} does not exist",
+                l1_batch_number
+            ));
+            return Err(err);
+        }
+
+        Ok(())
+    }
+
     pub async fn save_attestation(&mut self, pubkey: &[u8], attestation: &[u8]) -> DalResult<()> {
         let query = sqlx::query!(
             r#"
diff --git a/core/node/proof_data_handler/src/tee_request_processor.rs b/core/node/proof_data_handler/src/tee_request_processor.rs
index d85591dd2c90..139549628c79 100644
--- a/core/node/proof_data_handler/src/tee_request_processor.rs
+++ b/core/node/proof_data_handler/src/tee_request_processor.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;
 use axum::{extract::Path, Json};
 use zksync_config::configs::ProofDataHandlerConfig;
 use zksync_dal::{ConnectionPool, Core, CoreDal};
-use zksync_object_store::ObjectStore;
+use zksync_object_store::{ObjectStore, ObjectStoreError};
 use zksync_prover_interface::{
     api::{
         RegisterTeeAttestationRequest, RegisterTeeAttestationResponse, SubmitProofResponse,
@@ -47,26 +47,67 @@ impl TeeRequestProcessor {
             .await
             .map_err(RequestProcessorError::Dal)?;
 
-        let l1_batch_number_result = connection
-            .tee_proof_generation_dal()
-            .get_next_batch_to_be_proven(request.tee_type, self.config.proof_generation_timeout())
-            .await
-            .map_err(RequestProcessorError::Dal)?;
-
-        let l1_batch_number = match l1_batch_number_result {
-            Some(number) => number,
-            None => return Ok(Json(TeeProofGenerationDataResponse(None))),
-        };
-
-        let tee_verifier_input: TeeVerifierInput = self
-            .blob_store
-            .get(l1_batch_number)
-            .await
-            .map_err(RequestProcessorError::ObjectStore)?;
+        loop {
+            let l1_batch_number = match connection
+                .tee_proof_generation_dal()
+                .get_next_batch_to_be_proven(
+                    request.tee_type,
+                    self.config.proof_generation_timeout(),
+                )
+                .await
+                .map_err(RequestProcessorError::Dal)?
+            {
+                Some(number) => number,
+                None => return Ok(Json(TeeProofGenerationDataResponse(None))),
+            };
+
+            match self.get_blob(l1_batch_number).await {
+                Ok(input) => {
+                    return Ok(Json(TeeProofGenerationDataResponse(Some(Box::new(input)))));
+                }
+                Err(ObjectStoreError::KeyNotFound(_)) => {
+                    tracing::warn!(
+                        "Blob for batch number {} has not been found in the object store. Marking the job as skipped.",
+                        l1_batch_number
+                    );
+                    connection
+                        .tee_proof_generation_dal()
+                        .mark_proof_generation_job_as_skipped(l1_batch_number, request.tee_type)
+                        .await
+                        .map_err(RequestProcessorError::Dal)?;
+                    continue;
+                }
+                Err(err) => return Err(RequestProcessorError::ObjectStore(err)),
+            }
+        }
+    }
 
-        let response = TeeProofGenerationDataResponse(Some(Box::new(tee_verifier_input)));
-
-        Ok(Json(response))
+    /// Fetches the TEE verifier input for `l1_batch_number` from the blob store.
+    ///
+    /// A failed read is retried only when the store reports
+    /// `ObjectStoreError::Other` with `is_retriable` set, for at most
+    /// `MAX_BLOB_STORE_ATTEMPTS` attempts in total. Every other error, e.g.
+    /// `ObjectStoreError::KeyNotFound`, is returned to the caller right away.
+    async fn get_blob(
+        &self,
+        l1_batch_number: L1BatchNumber,
+    ) -> Result<TeeVerifierInput, ObjectStoreError> {
+        const MAX_BLOB_STORE_ATTEMPTS: u32 = 3;
+
+        let mut attempt = 1;
+        loop {
+            match self.blob_store.get(l1_batch_number).await {
+                Ok(input) => return Ok(input),
+                Err(ObjectStoreError::Other {
+                    is_retriable: true, ..
+                }) if attempt < MAX_BLOB_STORE_ATTEMPTS => {
+                    // NOTE(review): the retry is immediate; there is no backoff between attempts.
+                    attempt += 1;
+                }
+                // Either the error is not retriable or the last attempt has failed too.
+                Err(err) => return Err(err),
+            }
+        }
     }
 
     pub(crate) async fn submit_proof(