From cf2cf1dcfbdf86169f87c9f7d6fc998f7b3d63f9 Mon Sep 17 00:00:00 2001 From: Patrick Beza Date: Fri, 27 Sep 2024 17:41:25 +0200 Subject: [PATCH] feat(proof-data-handler): exclude batches without object file in GCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /tee/proof_inputs endpoint no longer returns batches that have no corresponding object file in Google Cloud Storage for an extended period. Since the recent `24.25.0` redeployment on `mainnet`, we've been [flooded with warnings][warnings] for the `proof-data-handler` on `mainnet` (the warnings are actually _not_ fatal in this context): ``` Failed request with a fatal error (...) Blobs for batch numbers 490520 to 490555 not found in the object store. Marked as unpicked. ``` The issue was caused [by the code][code] behind the `/tee/proof_inputs` [endpoint][endpoint_proof_inputs] (which is equivalent to the `/proof_generation_data` [endpoint][endpoint_proof_generation_data]) – it finds the next batch to send to the [requesting][requesting] `tee-prover` by looking for the first batch that has a corresponding object in the Google object store. As it skips over batches that don’t have the objects, [it logs][logging] `Failed request with a fatal error` for each one (unless the skipped batch was successfully proven, in which case it doesn’t log the error). This happens with every [request][request] the `tee-prover` sends, which is why we were getting so much noise in the logs. One possible solution was to manually flag the problematic batches as `permanently_ignored`, like Thomas [did before][Thomas] on `mainnet`. It was a quick and dirty workaround, but now we have a more automated solution. 
[warnings]: https://grafana.matterlabs.dev/goto/TjlaXQgHg?orgId=1 [code]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/tee_request_processor.rs#L35-L79 [endpoint_proof_inputs]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L96 [endpoint_proof_generation_data]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L67 [requesting]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/bin/zksync_tee_prover/src/tee_prover.rs#L93 [logging]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/lib/object_store/src/retries.rs#L56 [Thomas]: https://matter-labs-workspace.slack.com/archives/C05ANUCGCKV/p1725284962312929 --- ...203a62629904bc4956249e690a8ad7a48983.json} | 10 ++- core/lib/dal/doc/TeeProofGenerationDal.md | 8 ++- ...tee_add_permanently_ignored_state.down.sql | 0 ...0_tee_add_permanently_ignored_state.up.sql | 8 +++ core/lib/dal/src/models/storage_tee_proof.rs | 20 +++++- core/lib/dal/src/tee_proof_generation_dal.rs | 64 +++++++++++++------ core/lib/object_store/src/retries.rs | 13 +++- core/lib/types/src/api/mod.rs | 2 +- core/node/proof_data_handler/Cargo.toml | 4 +- .../src/tee_request_processor.rs | 60 +++++++++-------- core/node/proof_data_handler/src/tests.rs | 6 +- 11 files changed, 136 insertions(+), 59 deletions(-) rename core/lib/dal/.sqlx/{query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json => query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json} (50%) create mode 100644 core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql create mode 100644 core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql diff --git 
a/core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json b/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json similarity index 50% rename from core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json rename to core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json index 4b219bfee0a..7ca2c9e7e9f 100644 --- a/core/lib/dal/.sqlx/query-cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320.json +++ b/core/lib/dal/.sqlx/query-e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983.json @@ -1,12 +1,17 @@ { "db_name": "PostgreSQL", - "query": "\n WITH upsert AS (\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n tee.status = $3\n OR (\n tee.status = $2\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n )\n FETCH FIRST ROW ONLY\n )\n \n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n SELECT\n l1_batch_number,\n $1,\n $2,\n NOW(),\n NOW(),\n NOW()\n FROM\n upsert\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $2,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number\n ", + "query": "\n WITH upsert AS (\n SELECT\n p.l1_batch_number\n FROM\n proof_generation_details p\n LEFT JOIN\n tee_proof_generation_details tee\n ON\n p.l1_batch_number = tee.l1_batch_number\n AND tee.tee_type = $1\n WHERE\n (\n p.l1_batch_number >= $5\n AND p.vm_run_data_blob_url IS NOT NULL\n AND p.proof_gen_data_blob_url IS NOT NULL\n )\n AND (\n tee.l1_batch_number IS NULL\n OR (\n 
(tee.status = $2 OR tee.status = $3)\n AND tee.prover_taken_at < NOW() - $4::INTERVAL\n )\n )\n FETCH FIRST ROW ONLY\n )\n \n INSERT INTO\n tee_proof_generation_details (\n l1_batch_number, tee_type, status, created_at, updated_at, prover_taken_at\n )\n SELECT\n l1_batch_number,\n $1,\n $2,\n NOW(),\n NOW(),\n NOW()\n FROM\n upsert\n ON CONFLICT (l1_batch_number, tee_type) DO\n UPDATE\n SET\n status = $2,\n updated_at = NOW(),\n prover_taken_at = NOW()\n RETURNING\n l1_batch_number,\n created_at\n ", "describe": { "columns": [ { "ordinal": 0, "name": "l1_batch_number", "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "created_at", + "type_info": "Timestamp" } ], "parameters": { @@ -19,8 +24,9 @@ ] }, "nullable": [ + false, false ] }, - "hash": "cee7a608bd77815e9582531383481b01395cfd2a3e95fb4593229bd878163320" + "hash": "e46c99b23db91800b27c717100f8203a62629904bc4956249e690a8ad7a48983" } diff --git a/core/lib/dal/doc/TeeProofGenerationDal.md b/core/lib/dal/doc/TeeProofGenerationDal.md index fcfa379816c..d9ae70aeb2f 100644 --- a/core/lib/dal/doc/TeeProofGenerationDal.md +++ b/core/lib/dal/doc/TeeProofGenerationDal.md @@ -11,9 +11,11 @@ title: Status Diagram --- stateDiagram-v2 -[*] --> unpicked : insert_tee_proof_generation_job -unpicked --> picked_by_prover : lock_batch_for_proving +[*] --> picked_by_prover : lock picked_by_prover --> generated : save_proof_artifacts_metadata -picked_by_prover --> unpicked : unlock_batch +picked_by_prover --> permanently_ignored : unlock_batch +picked_by_prover --> failed : unlock_batch +failed --> picked_by_prover : lock +permanently_ignored --> [*] generated --> [*] ``` diff --git a/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.down.sql new file mode 100644 index 00000000000..e69de29bb2d diff --git a/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql 
b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql new file mode 100644 index 00000000000..12a21d1728c --- /dev/null +++ b/core/lib/dal/migrations/20240930110000_tee_add_permanently_ignored_state.up.sql @@ -0,0 +1,8 @@ +-- There were manually added tee_proof_generation_details entries with status 'permanently_ignore'. + +UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore'; + +-- Entries with the status 'unpicked' were not used at all after the migration to the logic +-- introduced in https://github.com/matter-labs/zksync-era/pull/3017. This was overlooked. + +DELETE FROM tee_proof_generation_details WHERE status = 'unpicked'; diff --git a/core/lib/dal/src/models/storage_tee_proof.rs b/core/lib/dal/src/models/storage_tee_proof.rs index 5c93361e7df..6f80c59511f 100644 --- a/core/lib/dal/src/models/storage_tee_proof.rs +++ b/core/lib/dal/src/models/storage_tee_proof.rs @@ -1,4 +1,7 @@ -use chrono::NaiveDateTime; +use chrono::{DateTime, NaiveDateTime, Utc}; +use zksync_types::L1BatchNumber; + +use crate::tee_proof_generation_dal::LockedBatch; #[derive(Debug, Clone, sqlx::FromRow)] pub struct StorageTeeProof { @@ -8,3 +11,18 @@ pub struct StorageTeeProof { pub updated_at: NaiveDateTime, pub attestation: Option>, } + +#[derive(Debug, Clone, sqlx::FromRow)] +pub struct StorageLockedBatch { + pub l1_batch_number: i64, + pub created_at: NaiveDateTime, +} + +impl From for LockedBatch { + fn from(tx: StorageLockedBatch) -> LockedBatch { + LockedBatch { + l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32), + created_at: DateTime::::from_naive_utc_and_offset(tx.created_at, Utc), + } + } +} diff --git a/core/lib/dal/src/tee_proof_generation_dal.rs b/core/lib/dal/src/tee_proof_generation_dal.rs index 755d0276910..8a015862685 100644 --- a/core/lib/dal/src/tee_proof_generation_dal.rs +++ b/core/lib/dal/src/tee_proof_generation_dal.rs @@ -1,6 +1,7 @@ #![doc = 
include_str!("../doc/TeeProofGenerationDal.md")] use std::time::Duration; +use chrono::{DateTime, Utc}; use strum::{Display, EnumString}; use zksync_db_connection::{ connection::Connection, @@ -10,21 +11,42 @@ use zksync_db_connection::{ }; use zksync_types::{tee_types::TeeType, L1BatchNumber}; -use crate::{models::storage_tee_proof::StorageTeeProof, Core}; +use crate::{ + models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof}, + Core, +}; #[derive(Debug)] pub struct TeeProofGenerationDal<'a, 'c> { pub(crate) storage: &'a mut Connection<'c, Core>, } -#[derive(Debug, EnumString, Display)] -enum TeeProofGenerationJobStatus { - #[strum(serialize = "unpicked")] - Unpicked, +#[derive(Debug, Clone, Copy, EnumString, Display)] +pub enum TeeProofGenerationJobStatus { #[strum(serialize = "picked_by_prover")] PickedByProver, #[strum(serialize = "generated")] Generated, + #[strum(serialize = "failed")] + Failed, + #[strum(serialize = "permanently_ignored")] + PermanentlyIgnored, +} + +/// Represents a locked batch picked by a TEE prover. A batch is locked when taken by a TEE prover +/// ([TeeProofGenerationJobStatus::PickedByProver]). It can transition to one of three states: +/// 1. [TeeProofGenerationJobStatus::Generated] when the proof is successfully submitted. +/// 2. [TeeProofGenerationJobStatus::Failed] when the proof generation fails, which can happen if +/// its inputs (GCS blob files) are incomplete or the API is unavailable for an extended period. +/// 3. [TeeProofGenerationJobStatus::PermanentlyIgnored] when the proof generation has been +/// continuously failing for an extended period. +#[derive(Clone, Debug)] +pub struct LockedBatch { + /// Locked batch number. + pub l1_batch_number: L1BatchNumber, + /// The creation time of the job for this batch. It is used to determine if the batch should + /// transition to [TeeProofGenerationJobStatus::PermanentlyIgnored] or [TeeProofGenerationJobStatus::Failed]. 
+ pub created_at: DateTime, } impl TeeProofGenerationDal<'_, '_> { @@ -33,10 +55,11 @@ impl TeeProofGenerationDal<'_, '_> { tee_type: TeeType, processing_timeout: Duration, min_batch_number: L1BatchNumber, - ) -> DalResult> { + ) -> DalResult> { let processing_timeout = pg_interval_from_duration(processing_timeout); let min_batch_number = i64::from(min_batch_number.0); - sqlx::query!( + let locked_batch = sqlx::query_as!( + StorageLockedBatch, r#" WITH upsert AS ( SELECT @@ -57,11 +80,8 @@ impl TeeProofGenerationDal<'_, '_> { AND ( tee.l1_batch_number IS NULL OR ( - tee.status = $3 - OR ( - tee.status = $2 - AND tee.prover_taken_at < NOW() - $4::INTERVAL - ) + (tee.status = $2 OR tee.status = $3) + AND tee.prover_taken_at < NOW() - $4::INTERVAL ) ) FETCH FIRST ROW ONLY @@ -87,11 +107,12 @@ impl TeeProofGenerationDal<'_, '_> { updated_at = NOW(), prover_taken_at = NOW() RETURNING - l1_batch_number + l1_batch_number, + created_at "#, tee_type.to_string(), TeeProofGenerationJobStatus::PickedByProver.to_string(), - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::Failed.to_string(), processing_timeout, min_batch_number ) @@ -100,14 +121,17 @@ impl TeeProofGenerationDal<'_, '_> { .with_arg("processing_timeout", &processing_timeout) .with_arg("l1_batch_number", &min_batch_number) .fetch_optional(self.storage) - .await - .map(|record| record.map(|record| L1BatchNumber(record.l1_batch_number as u32))) + .await? 
+ .map(Into::into); + + Ok(locked_batch) } pub async fn unlock_batch( &mut self, l1_batch_number: L1BatchNumber, tee_type: TeeType, + status: TeeProofGenerationJobStatus, ) -> DalResult<()> { let batch_number = i64::from(l1_batch_number.0); sqlx::query!( @@ -120,7 +144,7 @@ impl TeeProofGenerationDal<'_, '_> { l1_batch_number = $2 AND tee_type = $3 "#, - TeeProofGenerationJobStatus::Unpicked.to_string(), + status.to_string(), batch_number, tee_type.to_string() ) @@ -266,7 +290,7 @@ impl TeeProofGenerationDal<'_, '_> { "#, batch_number, tee_type.to_string(), - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::PickedByProver.to_string(), ); let instrumentation = Instrumented::new("insert_tee_proof_generation_job") .with_arg("l1_batch_number", &batch_number) @@ -281,7 +305,7 @@ impl TeeProofGenerationDal<'_, '_> { } /// For testing purposes only. - pub async fn get_oldest_unpicked_batch(&mut self) -> DalResult> { + pub async fn get_oldest_picked_by_prover_batch(&mut self) -> DalResult> { let query = sqlx::query!( r#" SELECT @@ -295,7 +319,7 @@ impl TeeProofGenerationDal<'_, '_> { LIMIT 1 "#, - TeeProofGenerationJobStatus::Unpicked.to_string(), + TeeProofGenerationJobStatus::PickedByProver.to_string(), ); let batch_number = Instrumented::new("get_oldest_unpicked_batch") .with(query) diff --git a/core/lib/object_store/src/retries.rs b/core/lib/object_store/src/retries.rs index 2cccbb17c2b..431a6e6276b 100644 --- a/core/lib/object_store/src/retries.rs +++ b/core/lib/object_store/src/retries.rs @@ -28,6 +28,18 @@ impl Request<'_> { self, store: &impl fmt::Debug, max_retries: u16, + f: F, + ) -> Result + where + Fut: Future>, + F: FnMut() -> Fut, + { + self.retry_internal(max_retries, f).await + } + + async fn retry_internal( + &self, + max_retries: u16, mut f: F, ) -> Result where @@ -53,7 +65,6 @@ impl Request<'_> { backoff_secs *= 2; } Err(err) => { - tracing::warn!(%err, "Failed request with a fatal error"); break Err(err); } } diff 
--git a/core/lib/types/src/api/mod.rs b/core/lib/types/src/api/mod.rs index 409dc372757..6758124626f 100644 --- a/core/lib/types/src/api/mod.rs +++ b/core/lib/types/src/api/mod.rs @@ -4,7 +4,6 @@ use serde_json::Value; use serde_with::{hex::Hex, serde_as}; use strum::Display; use zksync_basic_types::{ - tee_types::TeeType, web3::{AccessList, Bytes, Index}, Bloom, L1BatchNumber, H160, H256, H64, U256, U64, }; @@ -16,6 +15,7 @@ pub use crate::transaction_request::{ use crate::{ debug_flat_call::{DebugCallFlat, ResultDebugCallFlat}, protocol_version::L1VerifierConfig, + tee_types::TeeType, Address, L2BlockNumber, ProtocolVersionId, }; diff --git a/core/node/proof_data_handler/Cargo.toml b/core/node/proof_data_handler/Cargo.toml index e2ddc972a2f..53ac03c98f4 100644 --- a/core/node/proof_data_handler/Cargo.toml +++ b/core/node/proof_data_handler/Cargo.toml @@ -24,12 +24,12 @@ axum.workspace = true tokio.workspace = true tower-http = { workspace = true, features = ["compression-zstd", "decompression-zstd"] } tracing.workspace = true +chrono.workspace = true [dev-dependencies] hyper.workspace = true -chrono.workspace = true zksync_multivm.workspace = true serde_json.workspace = true tower.workspace = true -zksync_basic_types.workspace = true zksync_contracts.workspace = true +zksync_basic_types.workspace = true diff --git a/core/node/proof_data_handler/src/tee_request_processor.rs b/core/node/proof_data_handler/src/tee_request_processor.rs index b265b94d4d7..2bf709fd163 100644 --- a/core/node/proof_data_handler/src/tee_request_processor.rs +++ b/core/node/proof_data_handler/src/tee_request_processor.rs @@ -1,8 +1,12 @@ use std::sync::Arc; use axum::{extract::Path, Json}; +use chrono::{Duration as ChronoDuration, Utc}; use zksync_config::configs::ProofDataHandlerConfig; -use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_dal::{ + tee_proof_generation_dal::{LockedBatch, TeeProofGenerationJobStatus}, + ConnectionPool, Core, CoreDal, +}; use 
zksync_object_store::{ObjectStore, ObjectStoreError}; use zksync_prover_interface::{ api::{ @@ -47,49 +51,52 @@ impl TeeRequestProcessor { ) -> Result>, RequestProcessorError> { tracing::info!("Received request for proof generation data: {:?}", request); - let mut min_batch_number = self.config.tee_config.first_tee_processed_batch; - let mut missing_range: Option<(L1BatchNumber, L1BatchNumber)> = None; + let batch_ignored_timeout = ChronoDuration::days(10); + let min_batch_number = self.config.tee_config.first_tee_processed_batch; - let result = loop { - let Some(l1_batch_number) = self + loop { + let Some(locked_batch) = self .lock_batch_for_proving(request.tee_type, min_batch_number) .await? else { - // No job available - return Ok(None); + return Ok(None); // no job available }; + let batch_number = locked_batch.l1_batch_number; match self - .tee_verifier_input_for_existing_batch(l1_batch_number) + .tee_verifier_input_for_existing_batch(batch_number) .await { Ok(input) => { break Ok(Some(Json(TeeProofGenerationDataResponse(Box::new(input))))); } Err(RequestProcessorError::ObjectStore(ObjectStoreError::KeyNotFound(_))) => { - missing_range = match missing_range { - Some((start, _)) => Some((start, l1_batch_number)), - None => Some((l1_batch_number, l1_batch_number)), + let duration = Utc::now().signed_duration_since(locked_batch.created_at); + let status = if duration > batch_ignored_timeout { + TeeProofGenerationJobStatus::PermanentlyIgnored + } else { + TeeProofGenerationJobStatus::Failed }; - self.unlock_batch(l1_batch_number, request.tee_type).await?; - min_batch_number = l1_batch_number + 1; + self.unlock_batch(batch_number, request.tee_type, status) + .await?; + tracing::info!( + "Assigned status {} to batch {} created at {}", + status, + batch_number, + locked_batch.created_at + ); } Err(err) => { - self.unlock_batch(l1_batch_number, request.tee_type).await?; + self.unlock_batch( + batch_number, + request.tee_type, + TeeProofGenerationJobStatus::Failed, + 
) + .await?; break Err(err); } } - }; - - if let Some((start, end)) = missing_range { - tracing::warn!( - "Blobs for batch numbers {} to {} not found in the object store. Marked as unpicked.", - start, - end - ); } - - result } #[tracing::instrument(skip(self))] @@ -157,7 +164,7 @@ impl TeeRequestProcessor { &self, tee_type: TeeType, min_batch_number: L1BatchNumber, - ) -> Result, RequestProcessorError> { + ) -> Result, RequestProcessorError> { self.pool .connection_tagged("tee_request_processor") .await? @@ -175,12 +182,13 @@ impl TeeRequestProcessor { &self, l1_batch_number: L1BatchNumber, tee_type: TeeType, + status: TeeProofGenerationJobStatus, ) -> Result<(), RequestProcessorError> { self.pool .connection_tagged("tee_request_processor") .await? .tee_proof_generation_dal() - .unlock_batch(l1_batch_number, tee_type) + .unlock_batch(l1_batch_number, tee_type, status) .await?; Ok(()) } diff --git a/core/node/proof_data_handler/src/tests.rs b/core/node/proof_data_handler/src/tests.rs index 87c6bff8a1f..2eb17b55dbb 100644 --- a/core/node/proof_data_handler/src/tests.rs +++ b/core/node/proof_data_handler/src/tests.rs @@ -119,7 +119,7 @@ async fn submit_tee_proof() { let mut proof_db_conn = db_conn_pool.connection().await.unwrap(); let oldest_batch_number = proof_db_conn .tee_proof_generation_dal() - .get_oldest_unpicked_batch() + .get_oldest_picked_by_prover_batch() .await .unwrap(); @@ -156,7 +156,7 @@ async fn mock_tee_batch_status( // there should not be any batches awaiting proof in the db yet - let oldest_batch_number = proof_dal.get_oldest_unpicked_batch().await.unwrap(); + let oldest_batch_number = proof_dal.get_oldest_picked_by_prover_batch().await.unwrap(); assert!(oldest_batch_number.is_none()); // mock SQL table with relevant information about the status of TEE proof generation @@ -169,7 +169,7 @@ async fn mock_tee_batch_status( // now, there should be one batch in the db awaiting proof let oldest_batch_number = proof_dal - .get_oldest_unpicked_batch() + 
.get_oldest_picked_by_prover_batch() .await .unwrap() .unwrap();