feat(proof-data-handler): exclude batches without object file in GCS
The `/tee/proof_inputs` endpoint no longer returns batches that have had
no corresponding object file in Google Cloud Storage (GCS) for an
extended period.

Since the recent `24.25.0` redeployment on `mainnet`, we've been
[flooded with warnings][warnings] from the `proof-data-handler`
(the warnings are actually _not_ fatal in this context):

```
Failed request with a fatal error

(...)

Blobs for batch numbers 490520 to 490555 not found in the object store.
Marked as unpicked.
```

The issue was caused [by the code][code] behind the `/tee/proof_inputs`
[endpoint][endpoint_proof_inputs] (which is equivalent to the
`/proof_generation_data` [endpoint][endpoint_proof_generation_data]):
it finds the next batch to send to the [requesting][requesting]
`tee-prover` by looking for the first batch that has a corresponding
object in GCS. As it skips over batches that lack such objects,
[it logs][logging] `Failed request with a fatal error` for each of them
(unless the skipped batch was already successfully proven, in which
case no error is logged). This happens on every [request][request] the
`tee-prover` sends, which is why the logs were flooded with this noise.
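
To make the behaviour concrete, here is a minimal, self-contained sketch
of the pre-fix selection loop described above (the type, trait, and
function names are illustrative assumptions, not the actual
`tee_request_processor.rs` API):

```rust
// Hypothetical sketch, not the real zksync-era code: walk candidate batches
// in order and hand out the first one whose input blob exists in the store.
struct BatchNumber(u32);

trait BlobStore {
    /// Returns true if the proof-input blob for `batch` is present in GCS.
    fn has_input_blob(&self, batch: &BatchNumber) -> bool;
}

fn next_provable_batch(
    store: &impl BlobStore,
    candidates: impl IntoIterator<Item = BatchNumber>,
) -> Option<BatchNumber> {
    for batch in candidates {
        if store.has_input_blob(&batch) {
            return Some(batch); // first batch with a blob goes to the `tee-prover`
        }
        // Pre-fix behaviour: every skipped, still-unproven batch produced a
        // `Failed request with a fatal error` log line on each request,
        // which is the source of the log noise quoted above.
        eprintln!("skipping batch {} without an input blob", batch.0);
    }
    None
}
```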

One possible solution was to manually flag the problematic batches as
`permanently_ignored`, as Thomas [did before][Thomas] on `mainnet`.
That was a quick-and-dirty workaround; this change automates it by
unlocking such batches as `failed` and, after an extended period, as
`permanently_ignored`.

[warnings]: https://grafana.matterlabs.dev/goto/TjlaXQgHg?orgId=1
[code]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/tee_request_processor.rs#L35-L79
[endpoint_proof_inputs]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L96
[endpoint_proof_generation_data]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/node/proof_data_handler/src/lib.rs#L67
[requesting]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/bin/zksync_tee_prover/src/tee_prover.rs#L93
[logging]: https://github.com/matter-labs/zksync-era/blob/3f406c7d0c0e76d798c2d838abde57ca692822c0/core/lib/object_store/src/retries.rs#L56
[Thomas]: https://matter-labs-workspace.slack.com/archives/C05ANUCGCKV/p1725284962312929
pbeza committed Oct 31, 2024
1 parent f1328c0 commit cf2cf1d
Showing 11 changed files with 136 additions and 59 deletions.


8 changes: 5 additions & 3 deletions core/lib/dal/doc/TeeProofGenerationDal.md
@@ -11,9 +11,11 @@
title: Status Diagram
---
stateDiagram-v2
[*] --> unpicked : insert_tee_proof_generation_job
unpicked --> picked_by_prover : lock_batch_for_proving
[*] --> picked_by_prover : lock
picked_by_prover --> generated : save_proof_artifacts_metadata
picked_by_prover --> unpicked : unlock_batch
picked_by_prover --> permanently_ignored : unlock_batch
picked_by_prover --> failed : unlock_batch
failed --> picked_by_prover : lock
permanently_ignored --> [*]
generated --> [*]
```
Empty file.
@@ -0,0 +1,8 @@
-- There were manually added tee_proof_generation_details entries with status 'permanently_ignore'.

UPDATE tee_proof_generation_details SET status = 'permanently_ignored' WHERE status = 'permanently_ignore';

-- Entries with the status 'unpicked' were not used at all after the migration to the logic
-- introduced in https://github.com/matter-labs/zksync-era/pull/3017. This was overlooked.

DELETE FROM tee_proof_generation_details WHERE status = 'unpicked';
20 changes: 19 additions & 1 deletion core/lib/dal/src/models/storage_tee_proof.rs
@@ -1,4 +1,7 @@
use chrono::NaiveDateTime;
use chrono::{DateTime, NaiveDateTime, Utc};
use zksync_types::L1BatchNumber;

use crate::tee_proof_generation_dal::LockedBatch;

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageTeeProof {
@@ -8,3 +11,18 @@ pub struct StorageTeeProof {
    pub updated_at: NaiveDateTime,
    pub attestation: Option<Vec<u8>>,
}

#[derive(Debug, Clone, sqlx::FromRow)]
pub struct StorageLockedBatch {
    pub l1_batch_number: i64,
    pub created_at: NaiveDateTime,
}

impl From<StorageLockedBatch> for LockedBatch {
    fn from(tx: StorageLockedBatch) -> LockedBatch {
        LockedBatch {
            l1_batch_number: L1BatchNumber::from(tx.l1_batch_number as u32),
            created_at: DateTime::<Utc>::from_naive_utc_and_offset(tx.created_at, Utc),
        }
    }
}
64 changes: 44 additions & 20 deletions core/lib/dal/src/tee_proof_generation_dal.rs
@@ -1,6 +1,7 @@
#![doc = include_str!("../doc/TeeProofGenerationDal.md")]
use std::time::Duration;

use chrono::{DateTime, Utc};
use strum::{Display, EnumString};
use zksync_db_connection::{
connection::Connection,
@@ -10,21 +11,42 @@ use zksync_db_connection::{
};
use zksync_types::{tee_types::TeeType, L1BatchNumber};

use crate::{models::storage_tee_proof::StorageTeeProof, Core};
use crate::{
    models::storage_tee_proof::{StorageLockedBatch, StorageTeeProof},
    Core,
};

#[derive(Debug)]
pub struct TeeProofGenerationDal<'a, 'c> {
    pub(crate) storage: &'a mut Connection<'c, Core>,
}

#[derive(Debug, EnumString, Display)]
enum TeeProofGenerationJobStatus {
    #[strum(serialize = "unpicked")]
    Unpicked,
#[derive(Debug, Clone, Copy, EnumString, Display)]
pub enum TeeProofGenerationJobStatus {
    #[strum(serialize = "picked_by_prover")]
    PickedByProver,
    #[strum(serialize = "generated")]
    Generated,
    #[strum(serialize = "failed")]
    Failed,
    #[strum(serialize = "permanently_ignored")]
    PermanentlyIgnored,
}

/// Represents a locked batch picked by a TEE prover. A batch is locked when taken by a TEE prover
/// ([TeeProofGenerationJobStatus::PickedByProver]). It can transition to one of three states:
/// 1. [TeeProofGenerationJobStatus::Generated] when the proof is successfully submitted.
/// 2. [TeeProofGenerationJobStatus::Failed] when the proof generation fails, which can happen if
/// its inputs (GCS blob files) are incomplete or the API is unavailable for an extended period.
/// 3. [TeeProofGenerationJobStatus::PermanentlyIgnored] when the proof generation has been
/// continuously failing for an extended period.
#[derive(Clone, Debug)]
pub struct LockedBatch {
    /// Locked batch number.
    pub l1_batch_number: L1BatchNumber,
    /// The creation time of the job for this batch. It is used to determine if the batch should
    /// transition to [TeeProofGenerationJobStatus::PermanentlyIgnored] or [TeeProofGenerationJobStatus::Failed].
    pub created_at: DateTime<Utc>,
}

impl TeeProofGenerationDal<'_, '_> {
@@ -33,10 +55,11 @@ impl TeeProofGenerationDal<'_, '_> {
tee_type: TeeType,
processing_timeout: Duration,
min_batch_number: L1BatchNumber,
) -> DalResult<Option<L1BatchNumber>> {
) -> DalResult<Option<LockedBatch>> {
let processing_timeout = pg_interval_from_duration(processing_timeout);
let min_batch_number = i64::from(min_batch_number.0);
sqlx::query!(
let locked_batch = sqlx::query_as!(
StorageLockedBatch,
r#"
WITH upsert AS (
SELECT
@@ -57,11 +80,8 @@ impl TeeProofGenerationDal<'_, '_> {
AND (
tee.l1_batch_number IS NULL
OR (
tee.status = $3
OR (
tee.status = $2
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
(tee.status = $2 OR tee.status = $3)
AND tee.prover_taken_at < NOW() - $4::INTERVAL
)
)
FETCH FIRST ROW ONLY
@@ -87,11 +107,12 @@ impl TeeProofGenerationDal<'_, '_> {
updated_at = NOW(),
prover_taken_at = NOW()
RETURNING
l1_batch_number
l1_batch_number,
created_at
"#,
tee_type.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::Failed.to_string(),
processing_timeout,
min_batch_number
)
@@ -100,14 +121,17 @@ impl TeeProofGenerationDal<'_, '_> {
.with_arg("processing_timeout", &processing_timeout)
.with_arg("l1_batch_number", &min_batch_number)
.fetch_optional(self.storage)
.await
.map(|record| record.map(|record| L1BatchNumber(record.l1_batch_number as u32)))
.await?
.map(Into::into);

Ok(locked_batch)
}

pub async fn unlock_batch(
&mut self,
l1_batch_number: L1BatchNumber,
tee_type: TeeType,
status: TeeProofGenerationJobStatus,
) -> DalResult<()> {
let batch_number = i64::from(l1_batch_number.0);
sqlx::query!(
@@ -120,7 +144,7 @@ impl TeeProofGenerationDal<'_, '_> {
l1_batch_number = $2
AND tee_type = $3
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
status.to_string(),
batch_number,
tee_type.to_string()
)
@@ -266,7 +290,7 @@ impl TeeProofGenerationDal<'_, '_> {
"#,
batch_number,
tee_type.to_string(),
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let instrumentation = Instrumented::new("insert_tee_proof_generation_job")
.with_arg("l1_batch_number", &batch_number)
@@ -281,7 +305,7 @@ impl TeeProofGenerationDal<'_, '_> {
}

/// For testing purposes only.
pub async fn get_oldest_unpicked_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
pub async fn get_oldest_picked_by_prover_batch(&mut self) -> DalResult<Option<L1BatchNumber>> {
let query = sqlx::query!(
r#"
SELECT
Expand All @@ -295,7 +319,7 @@ impl TeeProofGenerationDal<'_, '_> {
LIMIT
1
"#,
TeeProofGenerationJobStatus::Unpicked.to_string(),
TeeProofGenerationJobStatus::PickedByProver.to_string(),
);
let batch_number = Instrumented::new("get_oldest_unpicked_batch")
.with(query)
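
As a side note, the transition rule documented on `LockedBatch` above can be
illustrated with a small, self-contained sketch (the enum stand-in, the
threshold parameter, and the helper function are assumptions made for this
example and are not part of the diff):

```rust
use chrono::{DateTime, Duration, Utc};

// Local stand-in for TeeProofGenerationJobStatus, kept self-contained.
#[derive(Debug, PartialEq)]
enum JobStatus {
    Failed,
    PermanentlyIgnored,
}

/// Chooses the unlock status for a locked batch: jobs that have been failing
/// for longer than `permanently_ignore_after` (measured from `created_at`)
/// are permanently ignored; otherwise the batch is marked failed and can be
/// picked up by a prover again later.
fn unlock_status(created_at: DateTime<Utc>, permanently_ignore_after: Duration) -> JobStatus {
    if Utc::now() - created_at > permanently_ignore_after {
        JobStatus::PermanentlyIgnored
    } else {
        JobStatus::Failed
    }
}

fn main() {
    let locked_two_days_ago = Utc::now() - Duration::days(2);
    assert_eq!(
        unlock_status(locked_two_days_ago, Duration::days(1)),
        JobStatus::PermanentlyIgnored
    );
}
```
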
13 changes: 12 additions & 1 deletion core/lib/object_store/src/retries.rs
@@ -28,6 +28,18 @@ impl Request<'_> {
        self,
        store: &impl fmt::Debug,
        max_retries: u16,
        f: F,
    ) -> Result<T, ObjectStoreError>
    where
        Fut: Future<Output = Result<T, ObjectStoreError>>,
        F: FnMut() -> Fut,
    {
        self.retry_internal(max_retries, f).await
    }

    async fn retry_internal<T, Fut, F>(
        &self,
        max_retries: u16,
        mut f: F,
    ) -> Result<T, ObjectStoreError>
    where
@@ -53,7 +65,6 @@ impl Request<'_> {
                    backoff_secs *= 2;
                }
                Err(err) => {
                    tracing::warn!(%err, "Failed request with a fatal error");
                    break Err(err);
                }
            }
2 changes: 1 addition & 1 deletion core/lib/types/src/api/mod.rs
@@ -4,7 +4,6 @@ use serde_json::Value;
use serde_with::{hex::Hex, serde_as};
use strum::Display;
use zksync_basic_types::{
tee_types::TeeType,
web3::{AccessList, Bytes, Index},
Bloom, L1BatchNumber, H160, H256, H64, U256, U64,
};
@@ -16,6 +15,7 @@ pub use crate::transaction_request::{
use crate::{
debug_flat_call::{DebugCallFlat, ResultDebugCallFlat},
protocol_version::L1VerifierConfig,
tee_types::TeeType,
Address, L2BlockNumber, ProtocolVersionId,
};

4 changes: 2 additions & 2 deletions core/node/proof_data_handler/Cargo.toml
@@ -24,12 +24,12 @@ axum.workspace = true
tokio.workspace = true
tower-http = { workspace = true, features = ["compression-zstd", "decompression-zstd"] }
tracing.workspace = true
chrono.workspace = true

[dev-dependencies]
hyper.workspace = true
chrono.workspace = true
zksync_multivm.workspace = true
serde_json.workspace = true
tower.workspace = true
zksync_basic_types.workspace = true
zksync_contracts.workspace = true
zksync_basic_types.workspace = true