From 41268c2b4758b9e04fc31ed0bf0478fc3a41884c Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 12:18:22 +0200 Subject: [PATCH 01/19] Sketch new fields for `snapshots` table --- ...add_lifecycle_fields_to_snapshots.down.sql | 2 + ...0_add_lifecycle_fields_to_snapshots.up.sql | 2 + core/lib/dal/sqlx-data.json | 195 +++++++++++------- core/lib/dal/src/snapshots_dal.rs | 126 +++++++---- core/lib/object_store/src/objects.rs | 5 +- core/lib/types/src/snapshots.rs | 69 ++++++- 6 files changed, 274 insertions(+), 125 deletions(-) create mode 100644 core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql create mode 100644 core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql diff --git a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql new file mode 100644 index 000000000000..0a6accf6acea --- /dev/null +++ b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE snapshots + DROP COLUMN storage_logs_chunk_count; diff --git a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql new file mode 100644 index 000000000000..03e1c05678be --- /dev/null +++ b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql @@ -0,0 +1,2 @@ +ALTER TABLE snapshots + ADD COLUMN storage_logs_chunk_count BIGINT NOT NULL; diff --git a/core/lib/dal/sqlx-data.json b/core/lib/dal/sqlx-data.json index 2d4061a640cf..7c33dfd6789a 100644 --- a/core/lib/dal/sqlx-data.json +++ b/core/lib/dal/sqlx-data.json @@ -358,20 +358,6 @@ }, "query": "INSERT INTO eth_txs_history (eth_tx_id, base_fee_per_gas, priority_fee_per_gas, tx_hash, signed_raw_tx, created_at, updated_at, confirmed_at) VALUES ($1, 0, 0, $2, '\\x00', now(), now(), $3) RETURNING id" }, - "07bb6aa5f4ffe0b753cca8ac92c65bd7618db908250e5bf9e835f54b1dd04755": { - "describe": { - "columns": [], - "nullable": [], - "parameters": { - "Left": [ - "Int8", - "TextArray", - "Text" - ] - } - }, - "query": "INSERT INTO snapshots (l1_batch_number, storage_logs_filepaths, factory_deps_filepath, created_at, updated_at) VALUES ($1, $2, $3, NOW(), NOW())" - }, "09768b376996b96add16a02d1a59231cb9b525cd5bd19d22a76149962d4c91c2": { "describe": { "columns": [], @@ -696,38 +682,6 @@ }, "query": "SELECT l1_address FROM tokens WHERE market_volume > $1" }, - "1658e6fce121904c1353e51663fc307b01e02bc412ee46ac17e0f5acacd0b5c4": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - }, - { - "name": "factory_deps_filepath", - "ordinal": 1, - "type_info": "Text" - }, - { - "name": "storage_logs_filepaths", - "ordinal": 2, - "type_info": "TextArray" - } - ], - "nullable": [ - false, - false, - false - ], - "parameters": { - "Left": [ - "Int8" - ] - } - }, - "query": "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots WHERE l1_batch_number = $1" - }, "166dcd8d504ba3f52a9e44a05305ed00954ab9b5302be4bb5ab05dfd2272afca": { "describe": { "columns": [ @@ -1099,6 +1053,44 @@ }, "query": "DELETE FROM storage_logs WHERE miniblock_number > $1" }, + "18e79d285742498df1733aa7cb698f71fb66de588b82558f12df7f50478ba09c": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + }, + { + "name": "storage_logs_chunk_count", + "ordinal": 1, + "type_info": "Int8" + }, + { + "name": "factory_deps_filepath", + "ordinal": 2, + "type_info": "Text" + }, + { + "name": "storage_logs_filepaths", + "ordinal": 3, + "type_info": "TextArray" + } + ], + "nullable": [ + false, + false, + false, + false + ], + "parameters": { + "Left": [ + "Int8" + ] + } + }, + "query": "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths FROM snapshots WHERE l1_batch_number = $1" + }, "191fb8c0549267b515aaa7acc199675be1ea113e9137195468bb8ce64a099ae8": { "describe": { "columns": [ @@ -4515,6 +4507,20 @@ }, "query": "SELECT bytecode_hash, bytecode FROM factory_deps INNER JOIN miniblocks ON miniblocks.number = factory_deps.miniblock_number WHERE miniblocks.l1_batch_number = $1" }, + "5574fcf4de1401eaca9901c2a7b70234e02ad2cae3456c986236ebf9a0caff71": { + "describe": { + "columns": [], + "nullable": [], + "parameters": { + "Left": [ + "Int8", + "Int8", + "Text" + ] + } + }, + "query": "INSERT INTO snapshots ( l1_batch_number, storage_logs_chunk_count, storage_logs_filepaths, factory_deps_filepath, created_at, updated_at ) VALUES ($1, $2, ARRAY[]::text[], $3, NOW(), NOW())" + }, "55debba852ef32f3b5ba6ffcb745f7b59d6888a21cb8792f8f9027e3b164a245": { "describe": { "columns": [ @@ -7603,6 +7609,19 @@ }, "query": "DELETE FROM storage WHERE hashed_key = ANY($1)" }, + "974cacecb574f0b0dc2a360986b8b9b88d10336b793ebc7387eac8b2d7a88044": { + "describe": { + "columns": [], + "nullable": [], + "parameters": { + "Left": [ + "Int8", + "Text" + ] + } + }, + "query": "UPDATE snapshots SET storage_logs_filepaths = storage_logs_filepaths || $2::text, updated_at = NOW() WHERE l1_batch_number = $1" + }, "97d81c27885fda4390ebc9789c6169cb94a449f583f7819ec74286fb0d9f81d5": { "describe": { "columns": [ @@ -8477,6 +8496,42 @@ }, "query": "DELETE FROM basic_witness_input_producer_jobs" }, + "a504be176fb07418afb0af16956fae00d56fa00ce02702cbfab90b7a419396b2": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + }, + { + "name": "storage_logs_chunk_count", + "ordinal": 1, + "type_info": "Int8" + }, + { + "name": "factory_deps_filepath", + "ordinal": 2, + "type_info": "Text" + }, + { + "name": "storage_logs_filepaths", + "ordinal": 3, + "type_info": "TextArray" + } + ], + "nullable": [ + false, + false, + false, + false + ], + "parameters": { + "Left": [] + } + }, + "query": "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths FROM snapshots ORDER BY l1_batch_number DESC LIMIT 1" + }, "a5115658f3a53462a9570fd6676f1931604d1c17a9a2b5f1475519006aaf03ba": { "describe": { "columns": [], @@ -9276,36 +9331,6 @@ }, "query": "INSERT INTO prover_protocol_versions\n (id, timestamp, recursion_scheduler_level_vk_hash, recursion_node_level_vk_hash,\n recursion_leaf_level_vk_hash, recursion_circuits_set_vks_hash, verifier_address, created_at)\n VALUES ($1, $2, $3, $4, $5, $6, $7, now())\n " }, - "b707b6247c76a50bda3be8076aafb77de60cfc5a0cc61c7dd60e4330eabc28d7": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - }, - { - "name": "factory_deps_filepath", - "ordinal": 1, - "type_info": "Text" - }, - { - "name": "storage_logs_filepaths", - "ordinal": 2, - "type_info": "TextArray" - } - ], - "nullable": [ - false, - false, - false - ], - "parameters": { - "Left": [] - } - }, - "query": "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots" - }, "b944df7af612ec911170a43be846eb2f6e27163b0d3983672de2b8d5d60af640": { "describe": { "columns": [ @@ -10584,6 +10609,24 @@ }, "query": "DELETE FROM transactions WHERE in_mempool = TRUE AND initiator_address = ANY($1)" }, + "e9517bcf2f0b01ffd82dc8c1d7da84b3759ef4739e0e89e509b16a43c61147fb": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + } + ], + "nullable": [ + false + ], + "parameters": { + "Left": [] + } + }, + "query": "SELECT l1_batch_number FROM snapshots ORDER BY l1_batch_number" + }, "e9b03a0d79eb40a67eab9bdaac8447fc17922bea89bcc6a89eb8eadf147835fe": { "describe": { "columns": [], diff --git a/core/lib/dal/src/snapshots_dal.rs b/core/lib/dal/src/snapshots_dal.rs index 0e031b31d142..d76b080214be 100644 --- a/core/lib/dal/src/snapshots_dal.rs +++ b/core/lib/dal/src/snapshots_dal.rs @@ -14,14 +14,17 @@ impl SnapshotsDal<'_, '_> { pub async fn add_snapshot( &mut self, l1_batch_number: L1BatchNumber, - storage_logs_filepaths: &[String], + storage_logs_chunk_count: u64, factory_deps_filepaths: &str, - ) -> Result<(), sqlx::Error> { + ) -> sqlx::Result<()> { sqlx::query!( - "INSERT INTO snapshots (l1_batch_number, storage_logs_filepaths, factory_deps_filepath, created_at, updated_at) \ - VALUES ($1, $2, $3, NOW(), NOW())", + "INSERT INTO snapshots ( \ + l1_batch_number, storage_logs_chunk_count, storage_logs_filepaths, factory_deps_filepath, \ + created_at, updated_at \ + ) \ + VALUES ($1, $2, ARRAY[]::text[], $3, NOW(), NOW())", l1_batch_number.0 as i32, - storage_logs_filepaths, + storage_logs_chunk_count as i64, factory_deps_filepaths, ) .instrument("add_snapshot") @@ -31,40 +34,81 @@ impl SnapshotsDal<'_, '_> { Ok(()) } - pub async fn get_all_snapshots(&mut self) -> Result { - let records: Vec = sqlx::query!( - "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots" + pub async fn add_storage_logs_filepath_for_snapshot( + &mut self, + l1_batch_number: L1BatchNumber, + storage_logs_filepath: &str, + ) -> sqlx::Result<()> { + sqlx::query!( + "UPDATE snapshots SET \ + storage_logs_filepaths = storage_logs_filepaths || $2::text, \ + updated_at = NOW() \ + WHERE l1_batch_number = $1", + l1_batch_number.0 as i32, + storage_logs_filepath, ) - .instrument("get_all_snapshots") - .report_latency() - .fetch_all(self.storage.conn()) - .await? - .into_iter() - .map(|r| L1BatchNumber(r.l1_batch_number as u32)) - .collect(); + .execute(self.storage.conn()) + .await?; + + Ok(()) + } + + pub async fn get_all_snapshots(&mut self) -> sqlx::Result { + let records = + sqlx::query!("SELECT l1_batch_number FROM snapshots ORDER BY l1_batch_number") + .instrument("get_all_snapshots") + .report_latency() + .fetch_all(self.storage.conn()) + .await? + .into_iter() + .map(|row| L1BatchNumber(row.l1_batch_number as u32)) + .collect(); + Ok(AllSnapshots { snapshots_l1_batch_numbers: records, }) } + pub async fn get_newest_snapshot_metadata(&mut self) -> sqlx::Result> { + let row = sqlx::query!( + "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths \ + FROM snapshots \ + ORDER BY l1_batch_number DESC LIMIT 1" + ) + .instrument("get_newest_snapshot_metadata") + .report_latency() + .fetch_optional(self.storage.conn()) + .await?; + + Ok(row.map(|row| SnapshotMetadata { + l1_batch_number: L1BatchNumber(row.l1_batch_number as u32), + storage_logs_chunk_count: row.storage_logs_chunk_count as u64, + factory_deps_filepath: row.factory_deps_filepath, + storage_logs_filepaths: row.storage_logs_filepaths, + })) + } + pub async fn get_snapshot_metadata( &mut self, l1_batch_number: L1BatchNumber, - ) -> Result, sqlx::Error> { - let record: Option = sqlx::query!( - "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots WHERE l1_batch_number = $1", + ) -> sqlx::Result> { + let row = sqlx::query!( + "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths \ + FROM snapshots \ + WHERE l1_batch_number = $1", l1_batch_number.0 as i32 ) .instrument("get_snapshot_metadata") .report_latency() .fetch_optional(self.storage.conn()) - .await? - .map(|r| SnapshotMetadata { - l1_batch_number: L1BatchNumber(r.l1_batch_number as u32), - factory_deps_filepath: r.factory_deps_filepath, - storage_logs_filepaths: r.storage_logs_filepaths, - }); - Ok(record) + .await?; + + Ok(row.map(|row| SnapshotMetadata { + l1_batch_number: L1BatchNumber(row.l1_batch_number as u32), + storage_logs_chunk_count: row.storage_logs_chunk_count as u64, + factory_deps_filepath: row.factory_deps_filepath, + storage_logs_filepaths: row.storage_logs_filepaths, + })) } } @@ -80,7 +124,7 @@ mod tests { let mut conn = pool.access_storage().await.unwrap(); let mut dal = conn.snapshots_dal(); let l1_batch_number = L1BatchNumber(100); - dal.add_snapshot(l1_batch_number, &[], "gs:///bucket/factory_deps.bin") + dal.add_snapshot(l1_batch_number, 100, "gs:///bucket/factory_deps.bin") .await .expect("Failed to add snapshot"); @@ -89,20 +133,14 @@ mod tests { .await .expect("Failed to retrieve snapshots"); assert_eq!(1, snapshots.snapshots_l1_batch_numbers.len()); - assert_eq!( - snapshots.snapshots_l1_batch_numbers[0], - l1_batch_number as L1BatchNumber - ); + assert_eq!(snapshots.snapshots_l1_batch_numbers[0], l1_batch_number); let snapshot_metadata = dal .get_snapshot_metadata(l1_batch_number) .await .expect("Failed to retrieve snapshot") .unwrap(); - assert_eq!( - snapshot_metadata.l1_batch_number, - l1_batch_number as L1BatchNumber - ); + assert_eq!(snapshot_metadata.l1_batch_number, l1_batch_number); } #[tokio::test] @@ -111,16 +149,16 @@ mod tests { let mut conn = pool.access_storage().await.unwrap(); let mut dal = conn.snapshots_dal(); let l1_batch_number = L1BatchNumber(100); - dal.add_snapshot( - l1_batch_number, - &[ - "gs:///bucket/test_file1.bin".to_string(), - "gs:///bucket/test_file2.bin".to_string(), - ], - "gs:///bucket/factory_deps.bin", - ) - .await - .expect("Failed to add snapshot"); + dal.add_snapshot(l1_batch_number, 100, "gs:///bucket/factory_deps.bin") + .await + .expect("Failed to add snapshot"); + + let storage_log_filepaths = ["gs:///bucket/test_file1.bin", "gs:///bucket/test_file2.bin"]; + for storage_log_filepath in storage_log_filepaths { + dal.add_storage_logs_filepath_for_snapshot(l1_batch_number, storage_log_filepath) + .await + .unwrap(); + } let files = dal .get_snapshot_metadata(l1_batch_number) diff --git a/core/lib/object_store/src/objects.rs b/core/lib/object_store/src/objects.rs index 89241c7edf7c..3f6469b9827d 100644 --- a/core/lib/object_store/src/objects.rs +++ b/core/lib/object_store/src/objects.rs @@ -101,10 +101,7 @@ impl StoredObject for SnapshotStorageLogsChunk { type Key<'a> = SnapshotStorageLogsStorageKey; fn encode_key(key: Self::Key<'_>) -> String { - format!( - "snapshot_l1_batch_{}_storage_logs_part_{:0>4}.json.gzip", - key.l1_batch_number, key.chunk_id - ) + key.to_string() } //TODO use better language agnostic serialization format like protobuf diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index a2c834c96339..8705cda6d1d8 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -1,3 +1,6 @@ +use std::{fmt, str::FromStr}; + +use anyhow::Context as _; use serde::{Deserialize, Serialize}; use zksync_basic_types::{L1BatchNumber, MiniblockNumber}; @@ -14,10 +17,17 @@ pub struct AllSnapshots { #[serde(rename_all = "camelCase")] pub struct SnapshotMetadata { pub l1_batch_number: L1BatchNumber, + pub storage_logs_chunk_count: u64, pub factory_deps_filepath: String, pub storage_logs_filepaths: Vec, } +impl SnapshotMetadata { + pub fn is_complete(&self) -> bool { + self.storage_logs_filepaths.len() as u64 == self.storage_logs_chunk_count + } +} + //contains all data not contained in factory_deps/storage_logs files to perform restore process #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] @@ -38,13 +48,49 @@ pub struct SnapshotStorageLogsChunkMetadata { pub filepath: String, } -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SnapshotStorageLogsStorageKey { pub l1_batch_number: L1BatchNumber, pub chunk_id: u64, } +impl fmt::Display for SnapshotStorageLogsStorageKey { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + formatter, + "snapshot_l1_batch_{}_storage_logs_part_{:0>4}.json.gzip", + self.l1_batch_number, self.chunk_id + ) + } +} + +impl FromStr for SnapshotStorageLogsStorageKey { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let s = s + .strip_prefix("snapshot_l1_batch_") + .context("missing `snapshot_l1_batch_` prefix")?; + let underscore_pos = s.find('_').context("missing '_' char")?; + let (l1_batch_str, s) = s.split_at(underscore_pos); + let l1_batch_number = L1BatchNumber(l1_batch_str.parse().context("invalid L1 batch")?); + + let s = s + .strip_prefix("_storage_logs_part_") + .context("missing `_storage_logs_part_`")?; + let s = s + .strip_suffix(".json.gzip") + .context("missing `.json.gzip` suffix")?; + let chunk_id = s.parse().context("invalid chunk ID")?; + + Ok(Self { + l1_batch_number, + chunk_id, + }) + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SnapshotStorageLogsChunk { @@ -71,3 +117,24 @@ pub struct SnapshotFactoryDependencies { pub struct SnapshotFactoryDependency { pub bytecode: Bytes, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn displaying_and_parsing_storage_logs_key() { + let key = SnapshotStorageLogsStorageKey { + l1_batch_number: L1BatchNumber(12345), + chunk_id: 42, + }; + let key_str = key.to_string(); + assert_eq!( + key_str, + "snapshot_l1_batch_12345_storage_logs_part_0042.json.gzip" + ); + + let parsed_key: SnapshotStorageLogsStorageKey = key_str.parse().unwrap(); + assert_eq!(parsed_key, key); + } +} From 79b3a89c69fe969319e474eaea74ad1d3c493790 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 12:20:21 +0200 Subject: [PATCH 02/19] Sketch recovery for snapshots creator --- core/bin/snapshots_creator/src/main.rs | 214 +++++++++++++++------- core/bin/snapshots_creator/src/metrics.rs | 2 +- core/bin/snapshots_creator/src/tests.rs | 2 + prover/Cargo.lock | 55 +++++- 4 files changed, 199 insertions(+), 74 deletions(-) diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index 69aee643abdf..4ca6bdbde9c1 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -1,5 +1,7 @@ //! Snapshot creator utility. Intended to run on a schedule, with each run creating a new snapshot. +use std::collections::HashSet; + use anyhow::Context as _; use prometheus_exporter::PrometheusExporterConfig; use tokio::sync::{watch, Semaphore}; @@ -9,7 +11,8 @@ use zksync_env_config::{object_store::SnapshotsObjectStoreConfig, FromEnv}; use zksync_object_store::{ObjectStore, ObjectStoreFactory}; use zksync_types::{ snapshots::{ - SnapshotFactoryDependencies, SnapshotStorageLogsChunk, SnapshotStorageLogsStorageKey, + SnapshotFactoryDependencies, SnapshotMetadata, SnapshotStorageLogsChunk, + SnapshotStorageLogsStorageKey, }, L1BatchNumber, MiniblockNumber, }; @@ -43,15 +46,17 @@ async fn maybe_enable_prometheus_metrics( Ok(()) } +#[allow(clippy::too_many_arguments)] // FIXME async fn process_storage_logs_single_chunk( blob_store: &dyn ObjectStore, + master_pool: &ConnectionPool, pool: &ConnectionPool, semaphore: &Semaphore, miniblock_number: MiniblockNumber, l1_batch_number: L1BatchNumber, chunk_id: u64, chunks_count: u64, -) -> anyhow::Result { +) -> anyhow::Result<()> { let _permit = semaphore.acquire().await?; let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunks_count); let mut conn = pool.access_storage_tagged("snapshots_creator").await?; @@ -84,12 +89,20 @@ async fn process_storage_logs_single_chunk( let output_filepath = format!("{output_filepath_prefix}/{filename}"); let latency = latency.observe(); + let mut master_conn = master_pool + .access_storage_tagged("snapshots_creator") + .await?; + master_conn + .snapshots_dal() + .add_storage_logs_filepath_for_snapshot(l1_batch_number, &output_filepath) + .await?; + let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; tracing::info!( "Saved chunk {chunk_id} (overall progress {}/{chunks_count}) in {latency:?} to location: {output_filepath}", - chunks_count - tasks_left + chunks_count - tasks_left as u64 ); - Ok(output_filepath) + Ok(()) } async fn process_factory_deps( @@ -129,6 +142,60 @@ async fn process_factory_deps( Ok(output_filepath) } +#[derive(Debug)] +struct SnapshotProgress { + l1_batch_number: L1BatchNumber, + needs_persisting_factory_deps: bool, + chunk_count: u64, + remaining_chunk_ids: Vec, +} + +impl SnapshotProgress { + fn new(l1_batch_number: L1BatchNumber, chunk_count: u64) -> Self { + Self { + l1_batch_number, + needs_persisting_factory_deps: true, + chunk_count, + remaining_chunk_ids: (0..chunk_count).collect(), + } + } + + fn from_existing_snapshot( + snapshot: &SnapshotMetadata, + blob_store: &dyn ObjectStore, + ) -> anyhow::Result { + let output_filepath_prefix = blob_store.get_storage_prefix::(); + let existing_chunk_ids = snapshot.storage_logs_filepaths.iter().map(|path| { + let object_key = path + .strip_prefix(&output_filepath_prefix) + .with_context(|| format!("Path `{path}` has unexpected prefix"))?; + let object_key = object_key + .parse::() + .with_context(|| format!("Object key `{object_key}` cannot be parsed"))?; + anyhow::ensure!( + object_key.l1_batch_number == snapshot.l1_batch_number, + "Mismatch" + ); + anyhow::Ok(object_key.chunk_id) + }); + let existing_chunk_ids: anyhow::Result> = existing_chunk_ids.collect(); + let existing_chunk_ids = existing_chunk_ids?; + + let all_chunk_ids = (0..snapshot.storage_logs_chunk_count).collect::>(); + let remaining_chunk_ids = all_chunk_ids + .difference(&existing_chunk_ids) + .copied() + .collect(); + + Ok(Self { + l1_batch_number: snapshot.l1_batch_number, + needs_persisting_factory_deps: false, + chunk_count: snapshot.storage_logs_chunk_count, + remaining_chunk_ids, + }) + } +} + async fn run( blob_store: Box, replica_pool: ConnectionPool, @@ -141,96 +208,113 @@ async fn run( let mut conn = replica_pool .access_storage_tagged("snapshots_creator") .await?; - - // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch - let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; - assert_ne!( - sealed_l1_batch_number, - L1BatchNumber(0), - "Cannot create snapshot when only the genesis L1 batch is present in Postgres" - ); - let l1_batch_number = sealed_l1_batch_number - 1; - let mut master_conn = master_pool .access_storage_tagged("snapshots_creator") .await?; - if master_conn + + let latest_snapshot = master_conn .snapshots_dal() - .get_snapshot_metadata(l1_batch_number) - .await? - .is_some() - { - tracing::info!("Snapshot for L1 batch number {l1_batch_number} already exists, exiting"); - return Ok(()); - } + .get_newest_snapshot_metadata() + .await?; + let pending_snapshot = latest_snapshot + .as_ref() + .filter(|snapshot| !snapshot.is_complete()); + let progress = if let Some(snapshot) = pending_snapshot { + SnapshotProgress::from_existing_snapshot(snapshot, &*blob_store)? + } else { + // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch + let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; + assert_ne!( + sealed_l1_batch_number, + L1BatchNumber(0), + "Cannot create snapshot when only the genesis L1 batch is present in Postgres" + ); + let l1_batch_number = sealed_l1_batch_number - 1; + + let latest_snapshot_l1_batch_number = latest_snapshot + .as_ref() + .map(|snapshot| snapshot.l1_batch_number); + if latest_snapshot_l1_batch_number == Some(l1_batch_number) { + tracing::info!( + "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" + ); + return Ok(()); + } + + let distinct_storage_logs_keys_count = conn + .snapshots_creator_dal() + .get_distinct_storage_logs_keys_count(l1_batch_number) + .await?; + let chunk_size = config.storage_logs_chunk_size; + // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. + let chunk_count = + ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + + tracing::info!( + "Selected storage logs chunking for L1 batch {l1_batch_number}: \ + {chunk_count} chunks of expected size {chunk_size}" + ); + SnapshotProgress::new(l1_batch_number, chunk_count) + }; drop(master_conn); let (_, last_miniblock_number_in_batch) = conn .blocks_dal() - .get_miniblock_range_of_l1_batch(l1_batch_number) + .get_miniblock_range_of_l1_batch(progress.l1_batch_number) .await? .context("Error fetching last miniblock number")?; - let distinct_storage_logs_keys_count = conn - .snapshots_creator_dal() - .get_distinct_storage_logs_keys_count(l1_batch_number) - .await?; drop(conn); - let chunk_size = config.storage_logs_chunk_size; - // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. - let chunks_count = ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); - - METRICS.storage_logs_chunks_count.set(chunks_count); - + METRICS.storage_logs_chunks_count.set(progress.chunk_count); tracing::info!( "Creating snapshot for storage logs up to miniblock {last_miniblock_number_in_batch}, \ - L1 batch {l1_batch_number}" + L1 batch {}", + progress.l1_batch_number ); - tracing::info!("Starting to generate {chunks_count} chunks of expected size {chunk_size}"); - let factory_deps_output_file = process_factory_deps( - &*blob_store, - &replica_pool, - last_miniblock_number_in_batch, - l1_batch_number, - ) - .await?; + if progress.needs_persisting_factory_deps { + let factory_deps_output_file = process_factory_deps( + &*blob_store, + &replica_pool, + last_miniblock_number_in_batch, + progress.l1_batch_number, + ) + .await?; + + let mut master_conn = master_pool + .access_storage_tagged("snapshots_creator") + .await?; + master_conn + .snapshots_dal() + .add_snapshot( + progress.l1_batch_number, + progress.chunk_count, + &factory_deps_output_file, + ) + .await?; + } METRICS .storage_logs_chunks_left_to_process - .set(chunks_count); - + .set(progress.remaining_chunk_ids.len()); let semaphore = Semaphore::new(config.concurrent_queries_count as usize); - let tasks = (0..chunks_count).map(|chunk_id| { + let tasks = progress.remaining_chunk_ids.into_iter().map(|chunk_id| { process_storage_logs_single_chunk( &*blob_store, + &master_pool, &replica_pool, &semaphore, last_miniblock_number_in_batch, - l1_batch_number, + progress.l1_batch_number, chunk_id, - chunks_count, + progress.chunk_count, ) }); - let mut storage_logs_output_files = futures::future::try_join_all(tasks).await?; - // Sanity check: the number of files should equal the number of chunks. - assert_eq!(storage_logs_output_files.len(), chunks_count as usize); - storage_logs_output_files.sort(); - - tracing::info!("Finished generating snapshot, storing progress in Postgres"); - let mut master_conn = master_pool - .access_storage_tagged("snapshots_creator") - .await?; - master_conn - .snapshots_dal() - .add_snapshot( - l1_batch_number, - &storage_logs_output_files, - &factory_deps_output_file, - ) - .await?; + futures::future::try_join_all(tasks).await?; - METRICS.snapshot_l1_batch.set(l1_batch_number.0 as u64); + METRICS + .snapshot_l1_batch + .set(progress.l1_batch_number.0.into()); let elapsed = latency.observe(); tracing::info!("snapshot_generation_duration: {elapsed:?}"); diff --git a/core/bin/snapshots_creator/src/metrics.rs b/core/bin/snapshots_creator/src/metrics.rs index 194ed8f1e680..5eb1984712e5 100644 --- a/core/bin/snapshots_creator/src/metrics.rs +++ b/core/bin/snapshots_creator/src/metrics.rs @@ -24,7 +24,7 @@ pub(crate) struct SnapshotsCreatorMetrics { /// Number of chunks in the most recently generated snapshot. Set when a snapshot generation starts. pub storage_logs_chunks_count: Gauge, /// Number of chunks left to process for the snapshot being currently generated. - pub storage_logs_chunks_left_to_process: Gauge, + pub storage_logs_chunks_left_to_process: Gauge, /// Total latency of snapshot generation. #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] pub snapshot_generation_duration: Histogram, diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index fb07a8d335a4..d2eb183cdcfd 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -1,5 +1,7 @@ //! Lower-level tests for the snapshot creator component. +// FIXME: test snapshot creator recovery + use std::collections::{HashMap, HashSet}; use rand::{thread_rng, Rng}; diff --git a/prover/Cargo.lock b/prover/Cargo.lock index d6ea72d2c8b6..00e1b170d842 100644 --- a/prover/Cargo.lock +++ b/prover/Cargo.lock @@ -1058,6 +1058,15 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403" +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "criterion" version = "0.5.1" @@ -1879,6 +1888,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -3556,7 +3575,7 @@ dependencies = [ [[package]] name = "pairing_ce" version = "0.28.5" -source = "git+https://github.com/matter-labs/pairing.git?rev=f55393f#f55393fd366596eac792d78525d26e9c4d6ed1ca" +source = "git+https://github.com/matter-labs/pairing.git?rev=f55393fd366596eac792d78525d26e9c4d6ed1ca#f55393fd366596eac792d78525d26e9c4d6ed1ca" dependencies = [ "byteorder", "cfg-if 1.0.0", @@ -6962,7 +6981,7 @@ dependencies = [ [[package]] name = "zksync_concurrency" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "anyhow", "once_cell", @@ -6989,14 +7008,14 @@ dependencies = [ [[package]] name = "zksync_consensus_crypto" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "anyhow", "blst", "ed25519-dalek", "ff_ce", "hex", - "pairing_ce 0.28.5 (git+https://github.com/matter-labs/pairing.git?rev=f55393f)", + "pairing_ce 0.28.5 (git+https://github.com/matter-labs/pairing.git?rev=f55393fd366596eac792d78525d26e9c4d6ed1ca)", "rand 0.4.6", "rand 0.8.5", "sha3 0.10.8", @@ -7007,7 +7026,7 @@ dependencies = [ [[package]] name = "zksync_consensus_roles" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "anyhow", "bit-vec", @@ -7024,10 +7043,27 @@ dependencies = [ "zksync_protobuf_build", ] +[[package]] +name = "zksync_consensus_storage" +version = "0.1.0" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" +dependencies = [ + "anyhow", + "async-trait", + "prost", + "rand 0.8.5", + "thiserror", + "tracing", + "zksync_concurrency", + "zksync_consensus_roles", + "zksync_protobuf", + "zksync_protobuf_build", +] + [[package]] name = "zksync_consensus_utils" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "thiserror", "zksync_concurrency", @@ -7083,6 +7119,7 @@ dependencies = [ "url", "vise", "zksync_consensus_roles", + "zksync_consensus_storage", "zksync_contracts", "zksync_health_check", "zksync_protobuf", @@ -7168,9 +7205,11 @@ dependencies = [ "anyhow", "async-trait", "bincode", + "flate2", "google-cloud-auth", "google-cloud-storage", "http", + "serde_json", "tokio", "tracing", "vise", @@ -7209,7 +7248,7 @@ dependencies = [ [[package]] name = "zksync_protobuf" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "anyhow", "bit-vec", @@ -7227,7 +7266,7 @@ dependencies = [ [[package]] name = "zksync_protobuf_build" version = "0.1.0" -source = "git+https://github.com/matter-labs/era-consensus.git?rev=da015d4c94b19962bc11622b6cc256e214256555#da015d4c94b19962bc11622b6cc256e214256555" +source = "git+https://github.com/matter-labs/era-consensus.git?rev=49b1a98f80d0e9f74fdceadece4283e745c71599#49b1a98f80d0e9f74fdceadece4283e745c71599" dependencies = [ "anyhow", "heck 0.4.1", From a9168678ae0ad6d222beaf346c7ddc79148710f6 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 13:36:49 +0200 Subject: [PATCH 03/19] Test recovery for snapshots creator --- core/bin/snapshots_creator/src/chunking.rs | 6 +- core/bin/snapshots_creator/src/main.rs | 41 ++++- core/bin/snapshots_creator/src/tests.rs | 203 +++++++++++++++++++-- 3 files changed, 229 insertions(+), 21 deletions(-) diff --git a/core/bin/snapshots_creator/src/chunking.rs b/core/bin/snapshots_creator/src/chunking.rs index 11768febd44f..047a6a23d24e 100644 --- a/core/bin/snapshots_creator/src/chunking.rs +++ b/core/bin/snapshots_creator/src/chunking.rs @@ -5,10 +5,10 @@ use zksync_utils::u256_to_h256; pub(crate) fn get_chunk_hashed_keys_range( chunk_id: u64, - chunks_count: u64, + chunk_count: u64, ) -> ops::RangeInclusive { - assert!(chunks_count > 0); - let mut stride = U256::MAX / chunks_count; + assert!(chunk_count > 0); + let mut stride = U256::MAX / chunk_count; let stride_minus_one = if stride < U256::MAX { stride += U256::one(); stride - 1 diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index 4ca6bdbde9c1..e63a7c63ee0d 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -18,6 +18,8 @@ use zksync_types::{ }; use zksync_utils::ceil_div; +#[cfg(test)] +use crate::tests::HandleEvent; use crate::{ chunking::get_chunk_hashed_keys_range, metrics::{FactoryDepsStage, StorageChunkStage, METRICS}, @@ -55,10 +57,16 @@ async fn process_storage_logs_single_chunk( miniblock_number: MiniblockNumber, l1_batch_number: L1BatchNumber, chunk_id: u64, - chunks_count: u64, + chunk_count: u64, + #[cfg(test)] event_listener: &dyn HandleEvent, ) -> anyhow::Result<()> { let _permit = semaphore.acquire().await?; - let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunks_count); + #[cfg(test)] + if event_listener.on_chunk_started().should_exit() { + return Ok(()); + } + + let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); let mut conn = pool.access_storage_tagged("snapshots_creator").await?; let latency = @@ -97,10 +105,13 @@ async fn process_storage_logs_single_chunk( .add_storage_logs_filepath_for_snapshot(l1_batch_number, &output_filepath) .await?; + #[cfg(test)] + event_listener.on_chunk_saved(); + let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; tracing::info!( - "Saved chunk {chunk_id} (overall progress {}/{chunks_count}) in {latency:?} to location: {output_filepath}", - chunks_count - tasks_left as u64 + "Saved chunk {chunk_id} (overall progress {}/{chunk_count}) in {latency:?} to location: {output_filepath}", + chunk_count - tasks_left as u64 ); Ok(()) } @@ -166,9 +177,13 @@ impl SnapshotProgress { ) -> anyhow::Result { let output_filepath_prefix = blob_store.get_storage_prefix::(); let existing_chunk_ids = snapshot.storage_logs_filepaths.iter().map(|path| { + // FIXME: this parsing looks unnecessarily fragile let object_key = path .strip_prefix(&output_filepath_prefix) .with_context(|| format!("Path `{path}` has unexpected prefix"))?; + let object_key = object_key + .strip_prefix('/') + .with_context(|| format!("Path `{path}` has unexpected prefix"))?; let object_key = object_key .parse::() .with_context(|| format!("Object key `{object_key}` cannot be parsed"))?; @@ -197,13 +212,14 @@ impl SnapshotProgress { } async fn run( + config: SnapshotsCreatorConfig, blob_store: Box, replica_pool: ConnectionPool, master_pool: ConnectionPool, min_chunk_count: u64, + #[cfg(test)] event_listener: &dyn HandleEvent, ) -> anyhow::Result<()> { let latency = METRICS.snapshot_generation_duration.start(); - let config = SnapshotsCreatorConfig::from_env().context("SnapshotsCreatorConfig::from_env")?; let mut conn = replica_pool .access_storage_tagged("snapshots_creator") @@ -308,6 +324,8 @@ async fn run( progress.l1_batch_number, chunk_id, progress.chunk_count, + #[cfg(test)] + event_listener, ) }); futures::future::try_join_all(tasks).await?; @@ -372,7 +390,18 @@ async fn main() -> anyhow::Result<()> { .build() .await?; - run(blob_store, replica_pool, master_pool, MIN_CHUNK_COUNT).await?; + let config = SnapshotsCreatorConfig::from_env().context("SnapshotsCreatorConfig::from_env")?; + run( + config, + blob_store, + replica_pool, + master_pool, + MIN_CHUNK_COUNT, + #[cfg(test)] + &(), + ) + .await?; + tracing::info!("Finished running snapshot creator!"); stop_sender.send(true).ok(); Ok(()) diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index d2eb183cdcfd..add8fe39446d 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -1,8 +1,9 @@ //! Lower-level tests for the snapshot creator component. -// FIXME: test snapshot creator recovery - -use std::collections::{HashMap, HashSet}; +use std::{ + collections::{HashMap, HashSet}, + sync::atomic::{AtomicUsize, Ordering}, +}; use rand::{thread_rng, Rng}; use zksync_dal::StorageProcessor; @@ -14,6 +15,42 @@ use zksync_types::{ use super::*; +const TEST_CONFIG: SnapshotsCreatorConfig = SnapshotsCreatorConfig { + storage_logs_chunk_size: 1_000_000, + concurrent_queries_count: 10, +}; +const SEQUENTIAL_TEST_CONFIG: SnapshotsCreatorConfig = SnapshotsCreatorConfig { + storage_logs_chunk_size: 1_000_000, + concurrent_queries_count: 1, +}; + +#[derive(Debug)] +pub(crate) struct TestBehavior { + should_exit: bool, +} + +impl TestBehavior { + fn new(should_exit: bool) -> Self { + Self { should_exit } + } + + pub fn should_exit(&self) -> bool { + self.should_exit + } +} + +pub(crate) trait HandleEvent { + fn on_chunk_started(&self) -> TestBehavior { + TestBehavior::new(false) + } + + fn on_chunk_saved(&self) { + // Do nothing + } +} + +impl HandleEvent for () {} + fn gen_storage_logs(rng: &mut impl Rng, count: usize) -> Vec { (0..count) .map(|_| { @@ -161,9 +198,16 @@ async fn persisting_snapshot_metadata() { let mut conn = pool.access_storage().await.unwrap(); prepare_postgres(&mut rng, &mut conn, 10).await; - run(object_store, pool.clone(), pool.clone(), MIN_CHUNK_COUNT) - .await - .unwrap(); + run( + TEST_CONFIG, + object_store, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &(), + ) + .await + .unwrap(); // Check snapshot metadata in Postgres. let snapshots = conn.snapshots_dal().get_all_snapshots().await.unwrap(); @@ -201,9 +245,16 @@ async fn persisting_snapshot_factory_deps() { let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; - run(object_store, pool.clone(), pool.clone(), MIN_CHUNK_COUNT) - .await - .unwrap(); + run( + TEST_CONFIG, + object_store, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &(), + ) + .await + .unwrap(); let snapshot_l1_batch_number = L1BatchNumber(8); let object_store = object_store_factory.create_store().await; @@ -224,12 +275,34 @@ async fn persisting_snapshot_logs() { let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; - run(object_store, pool.clone(), pool.clone(), MIN_CHUNK_COUNT) - .await - .unwrap(); + run( + TEST_CONFIG, + object_store, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &(), + ) + .await + .unwrap(); let snapshot_l1_batch_number = L1BatchNumber(8); let object_store = object_store_factory.create_store().await; + assert_storage_logs( + &mut conn, + &*object_store, + snapshot_l1_batch_number, + &expected_outputs, + ) + .await; +} + +async fn assert_storage_logs( + conn: &mut StorageProcessor<'_>, + object_store: &dyn ObjectStore, + snapshot_l1_batch_number: L1BatchNumber, + expected_outputs: &ExpectedOutputs, +) { let mut actual_logs = HashSet::new(); for chunk_id in 0..MIN_CHUNK_COUNT { let key = SnapshotStorageLogsStorageKey { @@ -241,3 +314,109 @@ async fn persisting_snapshot_logs() { } assert_eq!(actual_logs, expected_outputs.storage_logs); } + +#[derive(Debug)] +struct TestEventListener { + stop_after_chunk_count: usize, + processed_chunk_count: AtomicUsize, +} + +impl TestEventListener { + fn new(stop_after_chunk_count: usize) -> Self { + Self { + stop_after_chunk_count, + processed_chunk_count: AtomicUsize::new(0), + } + } +} + +impl HandleEvent for TestEventListener { + fn on_chunk_started(&self) -> TestBehavior { + let should_stop = + self.processed_chunk_count.load(Ordering::SeqCst) >= self.stop_after_chunk_count; + TestBehavior::new(should_stop) + } + + fn on_chunk_saved(&self) { + self.processed_chunk_count.fetch_add(1, Ordering::SeqCst); + } +} + +#[tokio::test] +async fn recovery_workflow() { + let pool = ConnectionPool::test_pool().await; + let mut rng = thread_rng(); + let object_store_factory = ObjectStoreFactory::mock(); + let object_store = object_store_factory.create_store().await; + + // Insert some data to Postgres. + let mut conn = pool.access_storage().await.unwrap(); + let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; + + run( + SEQUENTIAL_TEST_CONFIG, + object_store, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &TestEventListener::new(0), + ) + .await + .unwrap(); + + let snapshot_l1_batch_number = L1BatchNumber(8); + let snapshot_metadata = conn + .snapshots_dal() + .get_snapshot_metadata(snapshot_l1_batch_number) + .await + .unwrap() + .expect("No snapshot metadata"); + assert!(snapshot_metadata.storage_logs_filepaths.is_empty()); + + let object_store = object_store_factory.create_store().await; + let SnapshotFactoryDependencies { factory_deps } = + object_store.get(snapshot_l1_batch_number).await.unwrap(); + let actual_deps: HashSet<_> = factory_deps.into_iter().map(|dep| dep.bytecode).collect(); + assert_eq!(actual_deps, expected_outputs.deps); + + // Process 2 storage log chunks, then stop. + run( + SEQUENTIAL_TEST_CONFIG, + object_store, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &TestEventListener::new(2), + ) + .await + .unwrap(); + + let snapshot_metadata = conn + .snapshots_dal() + .get_snapshot_metadata(snapshot_l1_batch_number) + .await + .unwrap() + .expect("No snapshot metadata"); + assert_eq!(snapshot_metadata.storage_logs_filepaths.len(), 2); + + // Process the remaining chunks. + run( + SEQUENTIAL_TEST_CONFIG, + object_store_factory.create_store().await, + pool.clone(), + pool.clone(), + MIN_CHUNK_COUNT, + &(), + ) + .await + .unwrap(); + + let object_store = object_store_factory.create_store().await; + assert_storage_logs( + &mut conn, + &*object_store, + snapshot_l1_batch_number, + &expected_outputs, + ) + .await; +} From b2dddb2417b13a62426feb8ad980427107b95300 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 13:53:39 +0200 Subject: [PATCH 04/19] Better structure snapshots creator --- core/bin/snapshots_creator/src/main.rs | 455 ++++++++++++------------ core/bin/snapshots_creator/src/tests.rs | 181 ++++------ 2 files changed, 308 insertions(+), 328 deletions(-) diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index e63a7c63ee0d..22fb102e11a5 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -48,111 +48,7 @@ async fn maybe_enable_prometheus_metrics( Ok(()) } -#[allow(clippy::too_many_arguments)] // FIXME -async fn process_storage_logs_single_chunk( - blob_store: &dyn ObjectStore, - master_pool: &ConnectionPool, - pool: &ConnectionPool, - semaphore: &Semaphore, - miniblock_number: MiniblockNumber, - l1_batch_number: L1BatchNumber, - chunk_id: u64, - chunk_count: u64, - #[cfg(test)] event_listener: &dyn HandleEvent, -) -> anyhow::Result<()> { - let _permit = semaphore.acquire().await?; - #[cfg(test)] - if event_listener.on_chunk_started().should_exit() { - return Ok(()); - } - - let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); - let mut conn = pool.access_storage_tagged("snapshots_creator").await?; - - let latency = - METRICS.storage_logs_processing_duration[&StorageChunkStage::LoadFromPostgres].start(); - let logs = conn - .snapshots_creator_dal() - .get_storage_logs_chunk(miniblock_number, hashed_keys_range) - .await - .context("Error fetching storage logs count")?; - drop(conn); - let latency = latency.observe(); - tracing::info!( - "Loaded chunk {chunk_id} ({} logs) from Postgres in {latency:?}", - logs.len() - ); - - let latency = METRICS.storage_logs_processing_duration[&StorageChunkStage::SaveToGcs].start(); - let storage_logs_chunk = SnapshotStorageLogsChunk { storage_logs: logs }; - let key = SnapshotStorageLogsStorageKey { - l1_batch_number, - chunk_id, - }; - let filename = blob_store - .put(key, &storage_logs_chunk) - .await - .context("Error storing storage logs chunk in blob store")?; - let output_filepath_prefix = blob_store.get_storage_prefix::(); - let output_filepath = format!("{output_filepath_prefix}/{filename}"); - let latency = latency.observe(); - - let mut master_conn = master_pool - .access_storage_tagged("snapshots_creator") - .await?; - master_conn - .snapshots_dal() - .add_storage_logs_filepath_for_snapshot(l1_batch_number, &output_filepath) - .await?; - - #[cfg(test)] - event_listener.on_chunk_saved(); - - let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; - tracing::info!( - "Saved chunk {chunk_id} (overall progress {}/{chunk_count}) in {latency:?} to location: {output_filepath}", - chunk_count - tasks_left as u64 - ); - Ok(()) -} - -async fn process_factory_deps( - blob_store: &dyn ObjectStore, - pool: &ConnectionPool, - miniblock_number: MiniblockNumber, - l1_batch_number: L1BatchNumber, -) -> anyhow::Result { - let mut conn = pool.access_storage_tagged("snapshots_creator").await?; - - tracing::info!("Loading factory deps from Postgres..."); - let latency = - METRICS.factory_deps_processing_duration[&FactoryDepsStage::LoadFromPostgres].start(); - let factory_deps = conn - .snapshots_creator_dal() - .get_all_factory_deps(miniblock_number) - .await?; - drop(conn); - let latency = latency.observe(); - tracing::info!("Loaded {} factory deps in {latency:?}", factory_deps.len()); - - tracing::info!("Saving factory deps to GCS..."); - let latency = METRICS.factory_deps_processing_duration[&FactoryDepsStage::SaveToGcs].start(); - let factory_deps = SnapshotFactoryDependencies { factory_deps }; - let filename = blob_store - .put(l1_batch_number, &factory_deps) - .await - .context("Error storing factory deps in blob store")?; - let output_filepath_prefix = blob_store.get_storage_prefix::(); - let output_filepath = format!("{output_filepath_prefix}/{filename}"); - let latency = latency.observe(); - tracing::info!( - "Saved {} factory deps in {latency:?} to location: {output_filepath}", - factory_deps.factory_deps.len() - ); - - Ok(output_filepath) -} - +/// Encapsulates progress of creating a particular storage snapshot. #[derive(Debug)] struct SnapshotProgress { l1_batch_number: L1BatchNumber, @@ -211,137 +107,250 @@ impl SnapshotProgress { } } -async fn run( - config: SnapshotsCreatorConfig, +/// Creator of a single storage snapshot. +#[derive(Debug)] +struct SnapshotCreator { blob_store: Box, - replica_pool: ConnectionPool, master_pool: ConnectionPool, - min_chunk_count: u64, - #[cfg(test)] event_listener: &dyn HandleEvent, -) -> anyhow::Result<()> { - let latency = METRICS.snapshot_generation_duration.start(); - - let mut conn = replica_pool - .access_storage_tagged("snapshots_creator") - .await?; - let mut master_conn = master_pool - .access_storage_tagged("snapshots_creator") - .await?; - - let latest_snapshot = master_conn - .snapshots_dal() - .get_newest_snapshot_metadata() - .await?; - let pending_snapshot = latest_snapshot - .as_ref() - .filter(|snapshot| !snapshot.is_complete()); - let progress = if let Some(snapshot) = pending_snapshot { - SnapshotProgress::from_existing_snapshot(snapshot, &*blob_store)? - } else { - // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch - let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; - assert_ne!( - sealed_l1_batch_number, - L1BatchNumber(0), - "Cannot create snapshot when only the genesis L1 batch is present in Postgres" - ); - let l1_batch_number = sealed_l1_batch_number - 1; + replica_pool: ConnectionPool, + #[cfg(test)] + event_listener: Box, +} - let latest_snapshot_l1_batch_number = latest_snapshot - .as_ref() - .map(|snapshot| snapshot.l1_batch_number); - if latest_snapshot_l1_batch_number == Some(l1_batch_number) { - tracing::info!( - "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" - ); +impl SnapshotCreator { + async fn process_storage_logs_single_chunk( + &self, + semaphore: &Semaphore, + miniblock_number: MiniblockNumber, + l1_batch_number: L1BatchNumber, + chunk_id: u64, + chunk_count: u64, + ) -> anyhow::Result<()> { + let _permit = semaphore.acquire().await?; + #[cfg(test)] + if self.event_listener.on_chunk_started().should_exit() { return Ok(()); } - let distinct_storage_logs_keys_count = conn - .snapshots_creator_dal() - .get_distinct_storage_logs_keys_count(l1_batch_number) + let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") .await?; - let chunk_size = config.storage_logs_chunk_size; - // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. - let chunk_count = - ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + let latency = + METRICS.storage_logs_processing_duration[&StorageChunkStage::LoadFromPostgres].start(); + let logs = conn + .snapshots_creator_dal() + .get_storage_logs_chunk(miniblock_number, hashed_keys_range) + .await + .context("Error fetching storage logs count")?; + drop(conn); + let latency = latency.observe(); tracing::info!( - "Selected storage logs chunking for L1 batch {l1_batch_number}: \ - {chunk_count} chunks of expected size {chunk_size}" + "Loaded chunk {chunk_id} ({} logs) from Postgres in {latency:?}", + logs.len() ); - SnapshotProgress::new(l1_batch_number, chunk_count) - }; - drop(master_conn); - - let (_, last_miniblock_number_in_batch) = conn - .blocks_dal() - .get_miniblock_range_of_l1_batch(progress.l1_batch_number) - .await? - .context("Error fetching last miniblock number")?; - drop(conn); - - METRICS.storage_logs_chunks_count.set(progress.chunk_count); - tracing::info!( - "Creating snapshot for storage logs up to miniblock {last_miniblock_number_in_batch}, \ - L1 batch {}", - progress.l1_batch_number - ); - - if progress.needs_persisting_factory_deps { - let factory_deps_output_file = process_factory_deps( - &*blob_store, - &replica_pool, - last_miniblock_number_in_batch, - progress.l1_batch_number, - ) - .await?; - let mut master_conn = master_pool + let latency = + METRICS.storage_logs_processing_duration[&StorageChunkStage::SaveToGcs].start(); + let storage_logs_chunk = SnapshotStorageLogsChunk { storage_logs: logs }; + let key = SnapshotStorageLogsStorageKey { + l1_batch_number, + chunk_id, + }; + let filename = self + .blob_store + .put(key, &storage_logs_chunk) + .await + .context("Error storing storage logs chunk in blob store")?; + let output_filepath_prefix = self + .blob_store + .get_storage_prefix::(); + let output_filepath = format!("{output_filepath_prefix}/{filename}"); + let latency = latency.observe(); + + let mut master_conn = self + .master_pool .access_storage_tagged("snapshots_creator") .await?; master_conn .snapshots_dal() - .add_snapshot( + .add_storage_logs_filepath_for_snapshot(l1_batch_number, &output_filepath) + .await?; + #[cfg(test)] + self.event_listener.on_chunk_saved(); + + let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; + tracing::info!( + "Saved chunk {chunk_id} (overall progress {}/{chunk_count}) in {latency:?} to location: {output_filepath}", + chunk_count - tasks_left as u64 + ); + Ok(()) + } + + async fn process_factory_deps( + &self, + miniblock_number: MiniblockNumber, + l1_batch_number: L1BatchNumber, + ) -> anyhow::Result { + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") + .await?; + + tracing::info!("Loading factory deps from Postgres..."); + let latency = + METRICS.factory_deps_processing_duration[&FactoryDepsStage::LoadFromPostgres].start(); + let factory_deps = conn + .snapshots_creator_dal() + .get_all_factory_deps(miniblock_number) + .await?; + drop(conn); + let latency = latency.observe(); + tracing::info!("Loaded {} factory deps in {latency:?}", factory_deps.len()); + + tracing::info!("Saving factory deps to GCS..."); + let latency = + METRICS.factory_deps_processing_duration[&FactoryDepsStage::SaveToGcs].start(); + let factory_deps = SnapshotFactoryDependencies { factory_deps }; + let filename = self + .blob_store + .put(l1_batch_number, &factory_deps) + .await + .context("Error storing factory deps in blob store")?; + let output_filepath_prefix = self + .blob_store + .get_storage_prefix::(); + let output_filepath = format!("{output_filepath_prefix}/{filename}"); + let latency = latency.observe(); + tracing::info!( + "Saved {} factory deps in {latency:?} to location: {output_filepath}", + factory_deps.factory_deps.len() + ); + + Ok(output_filepath) + } + + async fn run(self, config: SnapshotsCreatorConfig, min_chunk_count: u64) -> anyhow::Result<()> { + let latency = METRICS.snapshot_generation_duration.start(); + + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") + .await?; + let mut master_conn = self + .master_pool + .access_storage_tagged("snapshots_creator") + .await?; + + let latest_snapshot = master_conn + .snapshots_dal() + .get_newest_snapshot_metadata() + .await?; + let pending_snapshot = latest_snapshot + .as_ref() + .filter(|snapshot| !snapshot.is_complete()); + let progress = if let Some(snapshot) = pending_snapshot { + SnapshotProgress::from_existing_snapshot(snapshot, &*self.blob_store)? + } else { + // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch + let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; + assert_ne!( + sealed_l1_batch_number, + L1BatchNumber(0), + "Cannot create snapshot when only the genesis L1 batch is present in Postgres" + ); + let l1_batch_number = sealed_l1_batch_number - 1; + + let latest_snapshot_l1_batch_number = latest_snapshot + .as_ref() + .map(|snapshot| snapshot.l1_batch_number); + if latest_snapshot_l1_batch_number == Some(l1_batch_number) { + tracing::info!( + "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" + ); + return Ok(()); + } + + let distinct_storage_logs_keys_count = conn + .snapshots_creator_dal() + .get_distinct_storage_logs_keys_count(l1_batch_number) + .await?; + let chunk_size = config.storage_logs_chunk_size; + // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. + let chunk_count = + ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + + tracing::info!( + "Selected storage logs chunking for L1 batch {l1_batch_number}: \ + {chunk_count} chunks of expected size {chunk_size}" + ); + SnapshotProgress::new(l1_batch_number, chunk_count) + }; + drop(master_conn); + + let (_, last_miniblock_number_in_batch) = conn + .blocks_dal() + .get_miniblock_range_of_l1_batch(progress.l1_batch_number) + .await? + .context("Error fetching last miniblock number")?; + drop(conn); + + METRICS.storage_logs_chunks_count.set(progress.chunk_count); + tracing::info!( + "Creating snapshot for storage logs up to miniblock {last_miniblock_number_in_batch}, \ + L1 batch {}", + progress.l1_batch_number + ); + + if progress.needs_persisting_factory_deps { + let factory_deps_output_file = self + .process_factory_deps(last_miniblock_number_in_batch, progress.l1_batch_number) + .await?; + + let mut master_conn = self + .master_pool + .access_storage_tagged("snapshots_creator") + .await?; + master_conn + .snapshots_dal() + .add_snapshot( + progress.l1_batch_number, + progress.chunk_count, + &factory_deps_output_file, + ) + .await?; + } + + METRICS + .storage_logs_chunks_left_to_process + .set(progress.remaining_chunk_ids.len()); + let semaphore = Semaphore::new(config.concurrent_queries_count as usize); + let tasks = progress.remaining_chunk_ids.into_iter().map(|chunk_id| { + self.process_storage_logs_single_chunk( + &semaphore, + last_miniblock_number_in_batch, progress.l1_batch_number, + chunk_id, progress.chunk_count, - &factory_deps_output_file, ) - .await?; - } + }); + futures::future::try_join_all(tasks).await?; - METRICS - .storage_logs_chunks_left_to_process - .set(progress.remaining_chunk_ids.len()); - let semaphore = Semaphore::new(config.concurrent_queries_count as usize); - let tasks = progress.remaining_chunk_ids.into_iter().map(|chunk_id| { - process_storage_logs_single_chunk( - &*blob_store, - &master_pool, - &replica_pool, - &semaphore, - last_miniblock_number_in_batch, - progress.l1_batch_number, - chunk_id, - progress.chunk_count, - #[cfg(test)] - event_listener, - ) - }); - futures::future::try_join_all(tasks).await?; - - METRICS - .snapshot_l1_batch - .set(progress.l1_batch_number.0.into()); - - let elapsed = latency.observe(); - tracing::info!("snapshot_generation_duration: {elapsed:?}"); - tracing::info!("snapshot_l1_batch: {}", METRICS.snapshot_l1_batch.get()); - tracing::info!( - "storage_logs_chunks_count: {}", - METRICS.storage_logs_chunks_count.get() - ); - Ok(()) + METRICS + .snapshot_l1_batch + .set(progress.l1_batch_number.0.into()); + + let elapsed = latency.observe(); + tracing::info!("snapshot_generation_duration: {elapsed:?}"); + tracing::info!("snapshot_l1_batch: {}", METRICS.snapshot_l1_batch.get()); + tracing::info!( + "storage_logs_chunks_count: {}", + METRICS.storage_logs_chunks_count.get() + ); + Ok(()) + } } /// Minimum number of storage log chunks to produce. @@ -391,16 +400,14 @@ async fn main() -> anyhow::Result<()> { .await?; let config = SnapshotsCreatorConfig::from_env().context("SnapshotsCreatorConfig::from_env")?; - run( - config, + let creator = SnapshotCreator { blob_store, - replica_pool, master_pool, - MIN_CHUNK_COUNT, + replica_pool, #[cfg(test)] - &(), - ) - .await?; + event_listener: Box::new(()), + }; + creator.run(config, MIN_CHUNK_COUNT).await?; tracing::info!("Finished running snapshot creator!"); stop_sender.send(true).ok(); diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index add8fe39446d..80e4bb19b48f 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -2,6 +2,7 @@ use std::{ collections::{HashMap, HashSet}, + fmt, sync::atomic::{AtomicUsize, Ordering}, }; @@ -24,6 +25,51 @@ const SEQUENTIAL_TEST_CONFIG: SnapshotsCreatorConfig = SnapshotsCreatorConfig { concurrent_queries_count: 1, }; +#[derive(Debug)] +struct TestEventListener { + stop_after_chunk_count: usize, + processed_chunk_count: AtomicUsize, +} + +impl TestEventListener { + fn new(stop_after_chunk_count: usize) -> Self { + Self { + stop_after_chunk_count, + processed_chunk_count: AtomicUsize::new(0), + } + } +} + +impl HandleEvent for TestEventListener { + fn on_chunk_started(&self) -> TestBehavior { + let should_stop = + self.processed_chunk_count.load(Ordering::SeqCst) >= self.stop_after_chunk_count; + TestBehavior::new(should_stop) + } + + fn on_chunk_saved(&self) { + self.processed_chunk_count.fetch_add(1, Ordering::SeqCst); + } +} + +impl SnapshotCreator { + fn for_tests(blob_store: Box, pool: ConnectionPool) -> Self { + Self { + blob_store, + master_pool: pool.clone(), + replica_pool: pool, + event_listener: Box::new(()), + } + } + + fn stop_after_chunk_count(self, stop_after_chunk_count: usize) -> Self { + Self { + event_listener: Box::new(TestEventListener::new(stop_after_chunk_count)), + ..self + } + } +} + #[derive(Debug)] pub(crate) struct TestBehavior { should_exit: bool, @@ -39,7 +85,7 @@ impl TestBehavior { } } -pub(crate) trait HandleEvent { +pub(crate) trait HandleEvent: fmt::Debug { fn on_chunk_started(&self) -> TestBehavior { TestBehavior::new(false) } @@ -198,16 +244,10 @@ async fn persisting_snapshot_metadata() { let mut conn = pool.access_storage().await.unwrap(); prepare_postgres(&mut rng, &mut conn, 10).await; - run( - TEST_CONFIG, - object_store, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &(), - ) - .await - .unwrap(); + SnapshotCreator::for_tests(object_store, pool.clone()) + .run(TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); // Check snapshot metadata in Postgres. let snapshots = conn.snapshots_dal().get_all_snapshots().await.unwrap(); @@ -245,16 +285,10 @@ async fn persisting_snapshot_factory_deps() { let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; - run( - TEST_CONFIG, - object_store, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &(), - ) - .await - .unwrap(); + SnapshotCreator::for_tests(object_store, pool.clone()) + .run(TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); let snapshot_l1_batch_number = L1BatchNumber(8); let object_store = object_store_factory.create_store().await; @@ -275,30 +309,17 @@ async fn persisting_snapshot_logs() { let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; - run( - TEST_CONFIG, - object_store, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &(), - ) - .await - .unwrap(); + SnapshotCreator::for_tests(object_store, pool.clone()) + .run(TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); let snapshot_l1_batch_number = L1BatchNumber(8); let object_store = object_store_factory.create_store().await; - assert_storage_logs( - &mut conn, - &*object_store, - snapshot_l1_batch_number, - &expected_outputs, - ) - .await; + assert_storage_logs(&*object_store, snapshot_l1_batch_number, &expected_outputs).await; } async fn assert_storage_logs( - conn: &mut StorageProcessor<'_>, object_store: &dyn ObjectStore, snapshot_l1_batch_number: L1BatchNumber, expected_outputs: &ExpectedOutputs, @@ -315,33 +336,6 @@ async fn assert_storage_logs( assert_eq!(actual_logs, expected_outputs.storage_logs); } -#[derive(Debug)] -struct TestEventListener { - stop_after_chunk_count: usize, - processed_chunk_count: AtomicUsize, -} - -impl TestEventListener { - fn new(stop_after_chunk_count: usize) -> Self { - Self { - stop_after_chunk_count, - processed_chunk_count: AtomicUsize::new(0), - } - } -} - -impl HandleEvent for TestEventListener { - fn on_chunk_started(&self) -> TestBehavior { - let should_stop = - self.processed_chunk_count.load(Ordering::SeqCst) >= self.stop_after_chunk_count; - TestBehavior::new(should_stop) - } - - fn on_chunk_saved(&self) { - self.processed_chunk_count.fetch_add(1, Ordering::SeqCst); - } -} - #[tokio::test] async fn recovery_workflow() { let pool = ConnectionPool::test_pool().await; @@ -353,16 +347,11 @@ async fn recovery_workflow() { let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; - run( - SEQUENTIAL_TEST_CONFIG, - object_store, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &TestEventListener::new(0), - ) - .await - .unwrap(); + SnapshotCreator::for_tests(object_store, pool.clone()) + .stop_after_chunk_count(0) + .run(SEQUENTIAL_TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); let snapshot_l1_batch_number = L1BatchNumber(8); let snapshot_metadata = conn @@ -376,20 +365,15 @@ async fn recovery_workflow() { let object_store = object_store_factory.create_store().await; let SnapshotFactoryDependencies { factory_deps } = object_store.get(snapshot_l1_batch_number).await.unwrap(); - let actual_deps: HashSet<_> = factory_deps.into_iter().map(|dep| dep.bytecode).collect(); + let actual_deps: HashSet<_> = factory_deps.into_iter().collect(); assert_eq!(actual_deps, expected_outputs.deps); // Process 2 storage log chunks, then stop. - run( - SEQUENTIAL_TEST_CONFIG, - object_store, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &TestEventListener::new(2), - ) - .await - .unwrap(); + SnapshotCreator::for_tests(object_store, pool.clone()) + .stop_after_chunk_count(2) + .run(SEQUENTIAL_TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); let snapshot_metadata = conn .snapshots_dal() @@ -400,23 +384,12 @@ async fn recovery_workflow() { assert_eq!(snapshot_metadata.storage_logs_filepaths.len(), 2); // Process the remaining chunks. - run( - SEQUENTIAL_TEST_CONFIG, - object_store_factory.create_store().await, - pool.clone(), - pool.clone(), - MIN_CHUNK_COUNT, - &(), - ) - .await - .unwrap(); + let object_store = object_store_factory.create_store().await; + SnapshotCreator::for_tests(object_store, pool.clone()) + .run(SEQUENTIAL_TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); let object_store = object_store_factory.create_store().await; - assert_storage_logs( - &mut conn, - &*object_store, - snapshot_l1_batch_number, - &expected_outputs, - ) - .await; + assert_storage_logs(&*object_store, snapshot_l1_batch_number, &expected_outputs).await; } From 39eeab298af13d3dbe75ec4512bc9aef12d9cdff Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 13:59:43 +0200 Subject: [PATCH 05/19] Test snapshots creator some more --- core/bin/snapshots_creator/src/tests.rs | 50 ++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index 80e4bb19b48f..30a2fb6b7cd8 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -280,8 +280,6 @@ async fn persisting_snapshot_factory_deps() { let mut rng = thread_rng(); let object_store_factory = ObjectStoreFactory::mock(); let object_store = object_store_factory.create_store().await; - - // Insert some data to Postgres. let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; @@ -304,8 +302,6 @@ async fn persisting_snapshot_logs() { let mut rng = thread_rng(); let object_store_factory = ObjectStoreFactory::mock(); let object_store = object_store_factory.create_store().await; - - // Insert some data to Postgres. let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; @@ -342,8 +338,6 @@ async fn recovery_workflow() { let mut rng = thread_rng(); let object_store_factory = ObjectStoreFactory::mock(); let object_store = object_store_factory.create_store().await; - - // Insert some data to Postgres. let mut conn = pool.access_storage().await.unwrap(); let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; @@ -393,3 +387,47 @@ async fn recovery_workflow() { let object_store = object_store_factory.create_store().await; assert_storage_logs(&*object_store, snapshot_l1_batch_number, &expected_outputs).await; } + +#[tokio::test] +async fn recovery_workflow_with_varying_chunk_size() { + let pool = ConnectionPool::test_pool().await; + let mut rng = thread_rng(); + let object_store_factory = ObjectStoreFactory::mock(); + let object_store = object_store_factory.create_store().await; + let mut conn = pool.access_storage().await.unwrap(); + let expected_outputs = prepare_postgres(&mut rng, &mut conn, 10).await; + + SnapshotCreator::for_tests(object_store, pool.clone()) + .stop_after_chunk_count(2) + .run(SEQUENTIAL_TEST_CONFIG, MIN_CHUNK_COUNT) + .await + .unwrap(); + + let snapshot_l1_batch_number = L1BatchNumber(8); + let snapshot_metadata = conn + .snapshots_dal() + .get_snapshot_metadata(snapshot_l1_batch_number) + .await + .unwrap() + .expect("No snapshot metadata"); + assert_eq!(snapshot_metadata.storage_logs_filepaths.len(), 2); + + let config_with_other_size = SnapshotsCreatorConfig { + storage_logs_chunk_size: 1, // << should be ignored + ..SEQUENTIAL_TEST_CONFIG + }; + let object_store = object_store_factory.create_store().await; + SnapshotCreator::for_tests(object_store, pool.clone()) + .run(config_with_other_size, MIN_CHUNK_COUNT) + .await + .unwrap(); + + let object_store = object_store_factory.create_store().await; + assert_storage_logs( + &mut conn, + &*object_store, + snapshot_l1_batch_number, + &expected_outputs, + ) + .await; +} From 38d5029830d2a0d836891d4b1063d7a2dc6c1613 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Thu, 14 Dec 2023 14:38:51 +0200 Subject: [PATCH 06/19] Structure snapshots creator some more --- core/bin/snapshots_creator/src/main.rs | 93 +++++++++++++++---------- core/bin/snapshots_creator/src/tests.rs | 8 +-- 2 files changed, 57 insertions(+), 44 deletions(-) diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index 22fb102e11a5..d1a6b9d5639c 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -6,7 +6,7 @@ use anyhow::Context as _; use prometheus_exporter::PrometheusExporterConfig; use tokio::sync::{watch, Semaphore}; use zksync_config::{configs::PrometheusConfig, PostgresConfig, SnapshotsCreatorConfig}; -use zksync_dal::ConnectionPool; +use zksync_dal::{ConnectionPool, StorageProcessor}; use zksync_env_config::{object_store::SnapshotsObjectStoreConfig, FromEnv}; use zksync_object_store::{ObjectStore, ObjectStoreFactory}; use zksync_types::{ @@ -232,6 +232,47 @@ impl SnapshotCreator { Ok(output_filepath) } + /// Returns `None` iff the created snapshot would coincide with `latest_snapshot`. + async fn initialize_snapshot_progress( + config: &SnapshotsCreatorConfig, + min_chunk_count: u64, + latest_snapshot: Option<&SnapshotMetadata>, + conn: &mut StorageProcessor<'_>, + ) -> anyhow::Result> { + // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch + let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; + assert_ne!( + sealed_l1_batch_number, + L1BatchNumber(0), + "Cannot create snapshot when only the genesis L1 batch is present in Postgres" + ); + let l1_batch_number = sealed_l1_batch_number - 1; + + let latest_snapshot_l1_batch_number = + latest_snapshot.map(|snapshot| snapshot.l1_batch_number); + if latest_snapshot_l1_batch_number == Some(l1_batch_number) { + tracing::info!( + "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" + ); + return Ok(None); + } + + let distinct_storage_logs_keys_count = conn + .snapshots_creator_dal() + .get_distinct_storage_logs_keys_count(l1_batch_number) + .await?; + let chunk_size = config.storage_logs_chunk_size; + // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. + let chunk_count = + ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + + tracing::info!( + "Selected storage logs chunking for L1 batch {l1_batch_number}: \ + {chunk_count} chunks of expected size {chunk_size}" + ); + Ok(Some(SnapshotProgress::new(l1_batch_number, chunk_count))) + } + async fn run(self, config: SnapshotsCreatorConfig, min_chunk_count: u64) -> anyhow::Result<()> { let latency = METRICS.snapshot_generation_duration.start(); @@ -243,52 +284,31 @@ impl SnapshotCreator { .master_pool .access_storage_tagged("snapshots_creator") .await?; - let latest_snapshot = master_conn .snapshots_dal() .get_newest_snapshot_metadata() .await?; + drop(master_conn); + let pending_snapshot = latest_snapshot .as_ref() .filter(|snapshot| !snapshot.is_complete()); let progress = if let Some(snapshot) = pending_snapshot { SnapshotProgress::from_existing_snapshot(snapshot, &*self.blob_store)? } else { - // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch - let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; - assert_ne!( - sealed_l1_batch_number, - L1BatchNumber(0), - "Cannot create snapshot when only the genesis L1 batch is present in Postgres" - ); - let l1_batch_number = sealed_l1_batch_number - 1; - - let latest_snapshot_l1_batch_number = latest_snapshot - .as_ref() - .map(|snapshot| snapshot.l1_batch_number); - if latest_snapshot_l1_batch_number == Some(l1_batch_number) { - tracing::info!( - "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" - ); - return Ok(()); - } - - let distinct_storage_logs_keys_count = conn - .snapshots_creator_dal() - .get_distinct_storage_logs_keys_count(l1_batch_number) - .await?; - let chunk_size = config.storage_logs_chunk_size; - // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. - let chunk_count = - ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + let progress = Self::initialize_snapshot_progress( + &config, + min_chunk_count, + latest_snapshot.as_ref(), + &mut conn, + ) + .await?; - tracing::info!( - "Selected storage logs chunking for L1 batch {l1_batch_number}: \ - {chunk_count} chunks of expected size {chunk_size}" - ); - SnapshotProgress::new(l1_batch_number, chunk_count) + match progress { + Some(progress) => progress, + None => return Ok(()), + } }; - drop(master_conn); let (_, last_miniblock_number_in_batch) = conn .blocks_dal() @@ -399,7 +419,6 @@ async fn main() -> anyhow::Result<()> { .build() .await?; - let config = SnapshotsCreatorConfig::from_env().context("SnapshotsCreatorConfig::from_env")?; let creator = SnapshotCreator { blob_store, master_pool, @@ -407,7 +426,7 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] event_listener: Box::new(()), }; - creator.run(config, MIN_CHUNK_COUNT).await?; + creator.run(creator_config, MIN_CHUNK_COUNT).await?; tracing::info!("Finished running snapshot creator!"); stop_sender.send(true).ok(); diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index 30a2fb6b7cd8..8edd14d99ed6 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -423,11 +423,5 @@ async fn recovery_workflow_with_varying_chunk_size() { .unwrap(); let object_store = object_store_factory.create_store().await; - assert_storage_logs( - &mut conn, - &*object_store, - snapshot_l1_batch_number, - &expected_outputs, - ) - .await; + assert_storage_logs(&*object_store, snapshot_l1_batch_number, &expected_outputs).await; } From 03e09b6225577189de9b490951e722780c6ce022 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 10:10:09 +0200 Subject: [PATCH 07/19] Simplify `snapshots` schema --- ...add_lifecycle_fields_to_snapshots.down.sql | 2 - ...0_add_lifecycle_fields_to_snapshots.up.sql | 2 - core/lib/dal/sqlx-data.json | 191 +++++++++--------- core/lib/dal/src/snapshots_dal.rs | 91 ++++++--- core/lib/types/src/snapshots.rs | 10 +- 5 files changed, 156 insertions(+), 140 deletions(-) delete mode 100644 core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql delete mode 100644 core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql diff --git a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql deleted file mode 100644 index 0a6accf6acea..000000000000 --- a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.down.sql +++ /dev/null @@ -1,2 +0,0 @@ -ALTER TABLE snapshots - DROP COLUMN storage_logs_chunk_count; diff --git a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql b/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql deleted file mode 100644 index 03e1c05678be..000000000000 --- a/core/lib/dal/migrations/20231214084950_add_lifecycle_fields_to_snapshots.up.sql +++ /dev/null @@ -1,2 +0,0 @@ -ALTER TABLE snapshots - ADD COLUMN storage_logs_chunk_count BIGINT NOT NULL; diff --git a/core/lib/dal/sqlx-data.json b/core/lib/dal/sqlx-data.json index 7c33dfd6789a..0de7ab05a7c1 100644 --- a/core/lib/dal/sqlx-data.json +++ b/core/lib/dal/sqlx-data.json @@ -682,6 +682,38 @@ }, "query": "SELECT l1_address FROM tokens WHERE market_volume > $1" }, + "1658e6fce121904c1353e51663fc307b01e02bc412ee46ac17e0f5acacd0b5c4": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + }, + { + "name": "factory_deps_filepath", + "ordinal": 1, + "type_info": "Text" + }, + { + "name": "storage_logs_filepaths", + "ordinal": 2, + "type_info": "TextArray" + } + ], + "nullable": [ + false, + false, + false + ], + "parameters": { + "Left": [ + "Int8" + ] + } + }, + "query": "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots WHERE l1_batch_number = $1" + }, "166dcd8d504ba3f52a9e44a05305ed00954ab9b5302be4bb5ab05dfd2272afca": { "describe": { "columns": [ @@ -1053,44 +1085,6 @@ }, "query": "DELETE FROM storage_logs WHERE miniblock_number > $1" }, - "18e79d285742498df1733aa7cb698f71fb66de588b82558f12df7f50478ba09c": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - }, - { - "name": "storage_logs_chunk_count", - "ordinal": 1, - "type_info": "Int8" - }, - { - "name": "factory_deps_filepath", - "ordinal": 2, - "type_info": "Text" - }, - { - "name": "storage_logs_filepaths", - "ordinal": 3, - "type_info": "TextArray" - } - ], - "nullable": [ - false, - false, - false, - false - ], - "parameters": { - "Left": [ - "Int8" - ] - } - }, - "query": "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths FROM snapshots WHERE l1_batch_number = $1" - }, "191fb8c0549267b515aaa7acc199675be1ea113e9137195468bb8ce64a099ae8": { "describe": { "columns": [ @@ -1466,6 +1460,20 @@ }, "query": "\n UPDATE witness_inputs_fri SET status =$1, updated_at = now()\n WHERE l1_batch_number = $2\n " }, + "1c83bc63bea0549ab4c9364be9240b85c659698b8a2cfae81981c9da264eaedb": { + "describe": { + "columns": [], + "nullable": [], + "parameters": { + "Left": [ + "Int8", + "Int4", + "Text" + ] + } + }, + "query": "INSERT INTO snapshots ( l1_batch_number, storage_logs_filepaths, factory_deps_filepath, created_at, updated_at ) VALUES ($1, array_fill(''::text, ARRAY[$2::integer]), $3, NOW(), NOW())" + }, "1d1f5198cbb0b9cd70019a9b386212de294075c00ebac4dbd39fda5397dbb07c": { "describe": { "columns": [ @@ -3764,6 +3772,36 @@ }, "query": "SELECT index FROM initial_writes WHERE hashed_key = $1" }, + "3ef1a6683c5d6811cd69a2fba371561904dc10a326bbe9a11c2450ff7dd28886": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + }, + { + "name": "factory_deps_filepath", + "ordinal": 1, + "type_info": "Text" + }, + { + "name": "storage_logs_filepaths", + "ordinal": 2, + "type_info": "TextArray" + } + ], + "nullable": [ + false, + false, + false + ], + "parameters": { + "Left": [] + } + }, + "query": "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths FROM snapshots ORDER BY l1_batch_number DESC LIMIT 1" + }, "3f6332706376ef4cadda96498872429b6ed28eca5402b03b1aa3b77b8262bccd": { "describe": { "columns": [], @@ -4507,20 +4545,6 @@ }, "query": "SELECT bytecode_hash, bytecode FROM factory_deps INNER JOIN miniblocks ON miniblocks.number = factory_deps.miniblock_number WHERE miniblocks.l1_batch_number = $1" }, - "5574fcf4de1401eaca9901c2a7b70234e02ad2cae3456c986236ebf9a0caff71": { - "describe": { - "columns": [], - "nullable": [], - "parameters": { - "Left": [ - "Int8", - "Int8", - "Text" - ] - } - }, - "query": "INSERT INTO snapshots ( l1_batch_number, storage_logs_chunk_count, storage_logs_filepaths, factory_deps_filepath, created_at, updated_at ) VALUES ($1, $2, ARRAY[]::text[], $3, NOW(), NOW())" - }, "55debba852ef32f3b5ba6ffcb745f7b59d6888a21cb8792f8f9027e3b164a245": { "describe": { "columns": [ @@ -7609,19 +7633,6 @@ }, "query": "DELETE FROM storage WHERE hashed_key = ANY($1)" }, - "974cacecb574f0b0dc2a360986b8b9b88d10336b793ebc7387eac8b2d7a88044": { - "describe": { - "columns": [], - "nullable": [], - "parameters": { - "Left": [ - "Int8", - "Text" - ] - } - }, - "query": "UPDATE snapshots SET storage_logs_filepaths = storage_logs_filepaths || $2::text, updated_at = NOW() WHERE l1_batch_number = $1" - }, "97d81c27885fda4390ebc9789c6169cb94a449f583f7819ec74286fb0d9f81d5": { "describe": { "columns": [ @@ -8496,42 +8507,6 @@ }, "query": "DELETE FROM basic_witness_input_producer_jobs" }, - "a504be176fb07418afb0af16956fae00d56fa00ce02702cbfab90b7a419396b2": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - }, - { - "name": "storage_logs_chunk_count", - "ordinal": 1, - "type_info": "Int8" - }, - { - "name": "factory_deps_filepath", - "ordinal": 2, - "type_info": "Text" - }, - { - "name": "storage_logs_filepaths", - "ordinal": 3, - "type_info": "TextArray" - } - ], - "nullable": [ - false, - false, - false, - false - ], - "parameters": { - "Left": [] - } - }, - "query": "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths FROM snapshots ORDER BY l1_batch_number DESC LIMIT 1" - }, "a5115658f3a53462a9570fd6676f1931604d1c17a9a2b5f1475519006aaf03ba": { "describe": { "columns": [], @@ -11941,6 +11916,20 @@ }, "query": "SELECT * FROM eth_txs WHERE id = $1" }, + "fae0f84f6777df80b4d2ad2a18c7f6355c416c63b55de675264898ab57812f5e": { + "describe": { + "columns": [], + "nullable": [], + "parameters": { + "Left": [ + "Int8", + "Int4", + "Text" + ] + } + }, + "query": "UPDATE snapshots SET storage_logs_filepaths[$2] = $3, updated_at = NOW() WHERE l1_batch_number = $1" + }, "fcca1961f34082f7186de607b922fd608166c5af98031e4dcc8a056b89696dbe": { "describe": { "columns": [], diff --git a/core/lib/dal/src/snapshots_dal.rs b/core/lib/dal/src/snapshots_dal.rs index d76b080214be..6beebb56034f 100644 --- a/core/lib/dal/src/snapshots_dal.rs +++ b/core/lib/dal/src/snapshots_dal.rs @@ -5,6 +5,27 @@ use zksync_types::{ use crate::{instrument::InstrumentExt, StorageProcessor}; +#[derive(Debug, sqlx::FromRow)] +struct StorageSnapshotMetadata { + l1_batch_number: i64, + storage_logs_filepaths: Vec, + factory_deps_filepath: String, +} + +impl From for SnapshotMetadata { + fn from(row: StorageSnapshotMetadata) -> Self { + Self { + l1_batch_number: L1BatchNumber(row.l1_batch_number as u32), + storage_logs_filepaths: row + .storage_logs_filepaths + .into_iter() + .map(|path| (!path.is_empty()).then_some(path)) + .collect(), + factory_deps_filepath: row.factory_deps_filepath, + } + } +} + #[derive(Debug)] pub struct SnapshotsDal<'a, 'c> { pub(crate) storage: &'a mut StorageProcessor<'c>, @@ -19,12 +40,12 @@ impl SnapshotsDal<'_, '_> { ) -> sqlx::Result<()> { sqlx::query!( "INSERT INTO snapshots ( \ - l1_batch_number, storage_logs_chunk_count, storage_logs_filepaths, factory_deps_filepath, \ + l1_batch_number, storage_logs_filepaths, factory_deps_filepath, \ created_at, updated_at \ ) \ - VALUES ($1, $2, ARRAY[]::text[], $3, NOW(), NOW())", + VALUES ($1, array_fill(''::text, ARRAY[$2::integer]), $3, NOW(), NOW())", l1_batch_number.0 as i32, - storage_logs_chunk_count as i64, + storage_logs_chunk_count as i32, factory_deps_filepaths, ) .instrument("add_snapshot") @@ -37,14 +58,16 @@ impl SnapshotsDal<'_, '_> { pub async fn add_storage_logs_filepath_for_snapshot( &mut self, l1_batch_number: L1BatchNumber, + chunk_id: u64, storage_logs_filepath: &str, ) -> sqlx::Result<()> { sqlx::query!( "UPDATE snapshots SET \ - storage_logs_filepaths = storage_logs_filepaths || $2::text, \ + storage_logs_filepaths[$2] = $3, \ updated_at = NOW() \ WHERE l1_batch_number = $1", l1_batch_number.0 as i32, + chunk_id as i32 + 1, storage_logs_filepath, ) .execute(self.storage.conn()) @@ -70,8 +93,9 @@ impl SnapshotsDal<'_, '_> { } pub async fn get_newest_snapshot_metadata(&mut self) -> sqlx::Result> { - let row = sqlx::query!( - "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths \ + let row = sqlx::query_as!( + StorageSnapshotMetadata, + "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths \ FROM snapshots \ ORDER BY l1_batch_number DESC LIMIT 1" ) @@ -80,20 +104,16 @@ impl SnapshotsDal<'_, '_> { .fetch_optional(self.storage.conn()) .await?; - Ok(row.map(|row| SnapshotMetadata { - l1_batch_number: L1BatchNumber(row.l1_batch_number as u32), - storage_logs_chunk_count: row.storage_logs_chunk_count as u64, - factory_deps_filepath: row.factory_deps_filepath, - storage_logs_filepaths: row.storage_logs_filepaths, - })) + Ok(row.map(Into::into)) } pub async fn get_snapshot_metadata( &mut self, l1_batch_number: L1BatchNumber, ) -> sqlx::Result> { - let row = sqlx::query!( - "SELECT l1_batch_number, storage_logs_chunk_count, factory_deps_filepath, storage_logs_filepaths \ + let row = sqlx::query_as!( + StorageSnapshotMetadata, + "SELECT l1_batch_number, factory_deps_filepath, storage_logs_filepaths \ FROM snapshots \ WHERE l1_batch_number = $1", l1_batch_number.0 as i32 @@ -103,12 +123,7 @@ impl SnapshotsDal<'_, '_> { .fetch_optional(self.storage.conn()) .await?; - Ok(row.map(|row| SnapshotMetadata { - l1_batch_number: L1BatchNumber(row.l1_batch_number as u32), - storage_logs_chunk_count: row.storage_logs_chunk_count as u64, - factory_deps_filepath: row.factory_deps_filepath, - storage_logs_filepaths: row.storage_logs_filepaths, - })) + Ok(row.map(Into::into)) } } @@ -124,7 +139,7 @@ mod tests { let mut conn = pool.access_storage().await.unwrap(); let mut dal = conn.snapshots_dal(); let l1_batch_number = L1BatchNumber(100); - dal.add_snapshot(l1_batch_number, 100, "gs:///bucket/factory_deps.bin") + dal.add_snapshot(l1_batch_number, 2, "gs:///bucket/factory_deps.bin") .await .expect("Failed to add snapshot"); @@ -149,16 +164,29 @@ mod tests { let mut conn = pool.access_storage().await.unwrap(); let mut dal = conn.snapshots_dal(); let l1_batch_number = L1BatchNumber(100); - dal.add_snapshot(l1_batch_number, 100, "gs:///bucket/factory_deps.bin") + dal.add_snapshot(l1_batch_number, 2, "gs:///bucket/factory_deps.bin") .await .expect("Failed to add snapshot"); let storage_log_filepaths = ["gs:///bucket/test_file1.bin", "gs:///bucket/test_file2.bin"]; - for storage_log_filepath in storage_log_filepaths { - dal.add_storage_logs_filepath_for_snapshot(l1_batch_number, storage_log_filepath) - .await - .unwrap(); - } + dal.add_storage_logs_filepath_for_snapshot(l1_batch_number, 1, storage_log_filepaths[1]) + .await + .unwrap(); + + let files = dal + .get_snapshot_metadata(l1_batch_number) + .await + .expect("Failed to retrieve snapshot") + .unwrap() + .storage_logs_filepaths; + assert_eq!( + files, + [None, Some("gs:///bucket/test_file2.bin".to_string())] + ); + + dal.add_storage_logs_filepath_for_snapshot(l1_batch_number, 0, storage_log_filepaths[0]) + .await + .unwrap(); let files = dal .get_snapshot_metadata(l1_batch_number) @@ -166,7 +194,12 @@ mod tests { .expect("Failed to retrieve snapshot") .unwrap() .storage_logs_filepaths; - assert!(files.contains(&"gs:///bucket/test_file1.bin".to_string())); - assert!(files.contains(&"gs:///bucket/test_file2.bin".to_string())); + assert_eq!( + files, + [ + Some("gs:///bucket/test_file1.bin".to_string()), + Some("gs:///bucket/test_file2.bin".to_string()) + ] + ); } } diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index 8705cda6d1d8..f6039f65b502 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -12,19 +12,17 @@ pub struct AllSnapshots { pub snapshots_l1_batch_numbers: Vec, } -// used in dal to fetch certain snapshot data -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] +/// Storage snapshot metadata. Used in DAL to fetch certain snapshot data. +#[derive(Debug, Clone)] pub struct SnapshotMetadata { pub l1_batch_number: L1BatchNumber, - pub storage_logs_chunk_count: u64, pub factory_deps_filepath: String, - pub storage_logs_filepaths: Vec, + pub storage_logs_filepaths: Vec>, } impl SnapshotMetadata { pub fn is_complete(&self) -> bool { - self.storage_logs_filepaths.len() as u64 == self.storage_logs_chunk_count + self.storage_logs_filepaths.iter().all(Option::is_some) } } From 819ed1f286c9b4eeec9e42344524e140a68533b9 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 10:10:35 +0200 Subject: [PATCH 08/19] Fix `SnapshotMetadata` usage in API --- .../src/api_server/web3/namespaces/snapshots.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs index 02dd3b18b22d..61bf7630988f 100644 --- a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs @@ -46,6 +46,7 @@ impl SnapshotsNamespace { response } + // FIXME: test pub async fn get_snapshot_by_l1_batch_number_impl( &self, l1_batch_number: L1BatchNumber, @@ -64,13 +65,15 @@ impl SnapshotsNamespace { .await .map_err(|err| internal_error(method_name, err))?; if let Some(snapshot_metadata) = snapshot_metadata { - let snapshot_files = snapshot_metadata.storage_logs_filepaths.clone(); + let snapshot_files = snapshot_metadata.storage_logs_filepaths; let chunks = snapshot_files - .iter() + .into_iter() .enumerate() - .map(|(chunk_id, filepath)| SnapshotStorageLogsChunkMetadata { - chunk_id: chunk_id as u64, - filepath: filepath.clone(), + .filter_map(|(chunk_id, filepath)| { + Some(SnapshotStorageLogsChunkMetadata { + chunk_id: chunk_id as u64, + filepath: filepath?, + }) }) .collect(); let l1_batch_with_metadata = storage_processor From 69a627d9c05a1da96af9e854a6238feb73ca52b7 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 10:18:46 +0200 Subject: [PATCH 09/19] Fix `SnapshotMetadata` usage in creator --- core/bin/snapshots_creator/src/main.rs | 47 ++++++------------------- core/bin/snapshots_creator/src/tests.rs | 29 ++++++++++++--- core/lib/types/src/snapshots.rs | 5 +++ 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index d1a6b9d5639c..4f61632b8a5a 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -1,7 +1,5 @@ //! Snapshot creator utility. Intended to run on a schedule, with each run creating a new snapshot. -use std::collections::HashSet; - use anyhow::Context as _; use prometheus_exporter::PrometheusExporterConfig; use tokio::sync::{watch, Semaphore}; @@ -67,43 +65,20 @@ impl SnapshotProgress { } } - fn from_existing_snapshot( - snapshot: &SnapshotMetadata, - blob_store: &dyn ObjectStore, - ) -> anyhow::Result { - let output_filepath_prefix = blob_store.get_storage_prefix::(); - let existing_chunk_ids = snapshot.storage_logs_filepaths.iter().map(|path| { - // FIXME: this parsing looks unnecessarily fragile - let object_key = path - .strip_prefix(&output_filepath_prefix) - .with_context(|| format!("Path `{path}` has unexpected prefix"))?; - let object_key = object_key - .strip_prefix('/') - .with_context(|| format!("Path `{path}` has unexpected prefix"))?; - let object_key = object_key - .parse::() - .with_context(|| format!("Object key `{object_key}` cannot be parsed"))?; - anyhow::ensure!( - object_key.l1_batch_number == snapshot.l1_batch_number, - "Mismatch" - ); - anyhow::Ok(object_key.chunk_id) - }); - let existing_chunk_ids: anyhow::Result> = existing_chunk_ids.collect(); - let existing_chunk_ids = existing_chunk_ids?; - - let all_chunk_ids = (0..snapshot.storage_logs_chunk_count).collect::>(); - let remaining_chunk_ids = all_chunk_ids - .difference(&existing_chunk_ids) - .copied() + fn from_existing_snapshot(snapshot: &SnapshotMetadata) -> Self { + let remaining_chunk_ids = snapshot + .storage_logs_filepaths + .iter() + .enumerate() + .filter_map(|(chunk_id, path)| path.is_none().then_some(chunk_id as u64)) .collect(); - Ok(Self { + Self { l1_batch_number: snapshot.l1_batch_number, needs_persisting_factory_deps: false, - chunk_count: snapshot.storage_logs_chunk_count, + chunk_count: snapshot.storage_logs_filepaths.len() as u64, remaining_chunk_ids, - }) + } } } @@ -176,7 +151,7 @@ impl SnapshotCreator { .await?; master_conn .snapshots_dal() - .add_storage_logs_filepath_for_snapshot(l1_batch_number, &output_filepath) + .add_storage_logs_filepath_for_snapshot(l1_batch_number, chunk_id, &output_filepath) .await?; #[cfg(test)] self.event_listener.on_chunk_saved(); @@ -294,7 +269,7 @@ impl SnapshotCreator { .as_ref() .filter(|snapshot| !snapshot.is_complete()); let progress = if let Some(snapshot) = pending_snapshot { - SnapshotProgress::from_existing_snapshot(snapshot, &*self.blob_store)? + SnapshotProgress::from_existing_snapshot(snapshot) } else { let progress = Self::initialize_snapshot_progress( &config, diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index 8edd14d99ed6..e69b01c445e8 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -269,7 +269,11 @@ async fn persisting_snapshot_metadata() { MIN_CHUNK_COUNT as usize ); for path in &snapshot_metadata.storage_logs_filepaths { - let path = path.strip_prefix("storage_logs_snapshots/").unwrap(); + let path = path + .as_ref() + .unwrap() + .strip_prefix("storage_logs_snapshots/") + .unwrap(); assert!(path.ends_with(".json.gzip")); } } @@ -354,7 +358,10 @@ async fn recovery_workflow() { .await .unwrap() .expect("No snapshot metadata"); - assert!(snapshot_metadata.storage_logs_filepaths.is_empty()); + assert!(snapshot_metadata + .storage_logs_filepaths + .iter() + .all(Option::is_none)); let object_store = object_store_factory.create_store().await; let SnapshotFactoryDependencies { factory_deps } = @@ -375,7 +382,14 @@ async fn recovery_workflow() { .await .unwrap() .expect("No snapshot metadata"); - assert_eq!(snapshot_metadata.storage_logs_filepaths.len(), 2); + assert_eq!( + snapshot_metadata + .storage_logs_filepaths + .iter() + .flatten() + .count(), + 2 + ); // Process the remaining chunks. let object_store = object_store_factory.create_store().await; @@ -410,7 +424,14 @@ async fn recovery_workflow_with_varying_chunk_size() { .await .unwrap() .expect("No snapshot metadata"); - assert_eq!(snapshot_metadata.storage_logs_filepaths.len(), 2); + assert_eq!( + snapshot_metadata + .storage_logs_filepaths + .iter() + .flatten() + .count(), + 2 + ); let config_with_other_size = SnapshotsCreatorConfig { storage_logs_chunk_size: 1, // << should be ignored diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index f6039f65b502..a0874df4882e 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -15,12 +15,17 @@ pub struct AllSnapshots { /// Storage snapshot metadata. Used in DAL to fetch certain snapshot data. #[derive(Debug, Clone)] pub struct SnapshotMetadata { + /// L1 batch for the snapshot. The data in the snapshot captures node storage at the end of this batch. pub l1_batch_number: L1BatchNumber, + /// Path to the factory dependencies blob. pub factory_deps_filepath: String, + /// Paths to the storage log blobs. Ordered by the chunk ID. If a certain chunk is not produced yet, + /// the corresponding path is `None`. pub storage_logs_filepaths: Vec>, } impl SnapshotMetadata { + /// Checks whether a snapshot is complete (contains all information to restore from). pub fn is_complete(&self) -> bool { self.storage_logs_filepaths.iter().all(Option::is_some) } From e92f9804794a8be91a5ef5bd466ba4fd49a421a4 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 11:16:49 +0200 Subject: [PATCH 10/19] Test snapshot APIs --- .../api_server/web3/namespaces/snapshots.rs | 1 - .../src/api_server/web3/tests/mod.rs | 14 ++- .../src/api_server/web3/tests/snapshots.rs | 110 ++++++++++++++++++ .../src/api_server/web3/tests/ws.rs | 3 +- 4 files changed, 122 insertions(+), 6 deletions(-) create mode 100644 core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs diff --git a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs index 61bf7630988f..58a41ff2bc30 100644 --- a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs @@ -46,7 +46,6 @@ impl SnapshotsNamespace { response } - // FIXME: test pub async fn get_snapshot_by_l1_batch_number_impl( &self, l1_batch_number: L1BatchNumber, diff --git a/core/lib/zksync_core/src/api_server/web3/tests/mod.rs b/core/lib/zksync_core/src/api_server/web3/tests/mod.rs index 2f7fad8c6e10..97cd104c5828 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/mod.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/mod.rs @@ -29,6 +29,7 @@ use crate::{ state_keeper::tests::create_l2_transaction, }; +mod snapshots; mod ws; const TEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -121,6 +122,9 @@ async fn spawn_server( .await; let (pub_sub_events_sender, pub_sub_events_receiver) = mpsc::unbounded_channel(); + let mut namespaces = Namespace::DEFAULT.to_vec(); + namespaces.push(Namespace::Snapshots); + let server_builder = match transport { ApiTransportLabel::Http => ApiBuilder::jsonrpsee_backend(api_config, pool).http(0), ApiTransportLabel::Ws => ApiBuilder::jsonrpc_backend(api_config, pool) @@ -132,7 +136,7 @@ async fn spawn_server( .with_threads(1) .with_tx_sender(tx_sender, vm_barrier) .with_pub_sub_events(pub_sub_events_sender) - .enable_api_namespaces(Namespace::DEFAULT.to_vec()) + .enable_api_namespaces(namespaces) .build(stop_receiver) .await .expect("Failed spawning JSON-RPC server"); @@ -197,8 +201,9 @@ fn create_miniblock(number: u32) -> MiniblockHeader { } } -async fn store_block(pool: &ConnectionPool) -> anyhow::Result<(MiniblockHeader, H256)> { - let mut storage = pool.access_storage().await?; +async fn store_miniblock( + storage: &mut StorageProcessor<'_>, +) -> anyhow::Result<(MiniblockHeader, H256)> { let new_tx = create_l2_transaction(1, 2); let new_tx_hash = new_tx.hash(); let tx_submission_result = storage @@ -305,7 +310,8 @@ impl HttpTest for BasicFilterChanges { let block_filter_id = client.new_block_filter().await?; let tx_filter_id = client.new_pending_transaction_filter().await?; - let (new_miniblock, new_tx_hash) = store_block(pool).await?; + let (new_miniblock, new_tx_hash) = + store_miniblock(&mut pool.access_storage().await?).await?; let block_filter_changes = client.get_filter_changes(block_filter_id).await?; assert_matches!( diff --git a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs new file mode 100644 index 000000000000..474358e81193 --- /dev/null +++ b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs @@ -0,0 +1,110 @@ +//! Tests for the `snapshots` Web3 namespace. + +use std::collections::HashSet; + +use zksync_types::block::{BlockGasCount, L1BatchHeader}; +use zksync_web3_decl::namespaces::SnapshotsNamespaceClient; + +use super::*; +use crate::state_keeper::tests::create_l1_batch_metadata; + +async fn seal_l1_batch( + storage: &mut StorageProcessor<'_>, + number: L1BatchNumber, +) -> anyhow::Result<()> { + let header = L1BatchHeader::new( + number, + number.0.into(), + Address::repeat_byte(1), + BaseSystemContractsHashes::default(), + ProtocolVersionId::latest(), + ); + storage + .blocks_dal() + .insert_l1_batch(&header, &[], BlockGasCount::default(), &[], &[]) + .await?; + storage + .blocks_dal() + .mark_miniblocks_as_executed_in_l1_batch(number) + .await?; + let metadata = create_l1_batch_metadata(number.0); + storage + .blocks_dal() + .save_l1_batch_metadata(number, &metadata, H256::zero(), false) + .await?; + Ok(()) +} + +#[derive(Debug)] +struct SnapshotBasics { + chunk_ids: HashSet, +} + +impl SnapshotBasics { + const CHUNK_COUNT: u64 = 5; + + fn new(chunk_ids: impl IntoIterator) -> Self { + let chunk_ids: HashSet<_> = chunk_ids.into_iter().collect(); + assert!(chunk_ids.iter().all(|&id| id < Self::CHUNK_COUNT)); + Self { chunk_ids } + } +} + +#[async_trait] +impl HttpTest for SnapshotBasics { + async fn test(&self, client: &HttpClient, pool: &ConnectionPool) -> anyhow::Result<()> { + let mut storage = pool.access_storage().await.unwrap(); + store_miniblock(&mut storage).await?; + seal_l1_batch(&mut storage, L1BatchNumber(1)).await?; + storage + .snapshots_dal() + .add_snapshot(L1BatchNumber(1), Self::CHUNK_COUNT, "file:///factory_deps") + .await?; + + for &chunk_id in &self.chunk_ids { + let path = format!("file:///storage_logs/chunk{chunk_id}"); + storage + .snapshots_dal() + .add_storage_logs_filepath_for_snapshot(L1BatchNumber(1), chunk_id, &path) + .await?; + } + + let all_snapshots = client.get_all_snapshots().await?; + assert_eq!(all_snapshots.snapshots_l1_batch_numbers, [L1BatchNumber(1)]); + let snapshot_header = client + .get_snapshot_by_l1_batch_number(L1BatchNumber(1)) + .await? + .context("no snapshot for L1 batch #1")?; + assert_eq!(snapshot_header.l1_batch_number, L1BatchNumber(1)); + assert_eq!(snapshot_header.miniblock_number, MiniblockNumber(1)); + assert_eq!( + snapshot_header.factory_deps_filepath, + "file:///factory_deps" + ); + + assert_eq!( + snapshot_header.storage_logs_chunks.len(), + self.chunk_ids.len() + ); + for chunk in &snapshot_header.storage_logs_chunks { + assert!(self.chunk_ids.contains(&chunk.chunk_id)); + assert!(chunk.filepath.starts_with("file:///storage_logs/")); + } + Ok(()) + } +} + +#[tokio::test] +async fn snapshot_without_chunks() { + test_http_server(SnapshotBasics::new([])).await; +} + +#[tokio::test] +async fn snapshot_with_some_chunks() { + test_http_server(SnapshotBasics::new([0, 2, 4])).await; +} + +#[tokio::test] +async fn snapshot_with_all_chunks() { + test_http_server(SnapshotBasics::new(0..SnapshotBasics::CHUNK_COUNT)).await; +} diff --git a/core/lib/zksync_core/src/api_server/web3/tests/ws.rs b/core/lib/zksync_core/src/api_server/web3/tests/ws.rs index 58fcebeda0d6..9c854740b502 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/ws.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/ws.rs @@ -161,7 +161,8 @@ impl WsTest for BasicSubscriptions { .await?; wait_for_subscription(&mut pub_sub_events, SubscriptionType::Txs).await; - let (new_miniblock, new_tx_hash) = store_block(pool).await?; + let (new_miniblock, new_tx_hash) = + store_miniblock(&mut pool.access_storage().await?).await?; let received_tx_hash = tokio::time::timeout(TEST_TIMEOUT, txs_subscription.next()) .await From 4800e266b4238a1e950adc1ae2d370c16e5dbfb2 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 11:25:53 +0200 Subject: [PATCH 11/19] Refactor `get_snapshot_by_l1_batch_number_impl` --- .../api_server/web3/namespaces/snapshots.rs | 83 ++++++++++--------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs index 58a41ff2bc30..170c368ec13f 100644 --- a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs @@ -58,47 +58,54 @@ impl SnapshotsNamespace { .access_storage_tagged("api") .await .map_err(|err| internal_error(method_name, err))?; - let mut snapshots_dal = storage_processor.snapshots_dal(); - let snapshot_metadata = snapshots_dal + let snapshot_metadata = storage_processor + .snapshots_dal() .get_snapshot_metadata(l1_batch_number) .await .map_err(|err| internal_error(method_name, err))?; - if let Some(snapshot_metadata) = snapshot_metadata { - let snapshot_files = snapshot_metadata.storage_logs_filepaths; - let chunks = snapshot_files - .into_iter() - .enumerate() - .filter_map(|(chunk_id, filepath)| { - Some(SnapshotStorageLogsChunkMetadata { - chunk_id: chunk_id as u64, - filepath: filepath?, - }) - }) - .collect(); - let l1_batch_with_metadata = storage_processor - .blocks_dal() - .get_l1_batch_metadata(l1_batch_number) - .await - .map_err(|err| internal_error(method_name, err))? - .unwrap(); - let miniblock_number = storage_processor - .blocks_dal() - .get_miniblock_range_of_l1_batch(l1_batch_number) - .await - .map_err(|err| internal_error(method_name, err))? - .unwrap() - .1; - method_latency.observe(); - Ok(Some(SnapshotHeader { - l1_batch_number: snapshot_metadata.l1_batch_number, - miniblock_number, - last_l1_batch_with_metadata: l1_batch_with_metadata, - storage_logs_chunks: chunks, - factory_deps_filepath: snapshot_metadata.factory_deps_filepath, - })) - } else { + + let Some(snapshot_metadata) = snapshot_metadata else { method_latency.observe(); - Ok(None) - } + return Ok(None); + }; + + let snapshot_files = snapshot_metadata.storage_logs_filepaths; + let chunks = snapshot_files + .into_iter() + .enumerate() + .filter_map(|(chunk_id, filepath)| { + Some(SnapshotStorageLogsChunkMetadata { + chunk_id: chunk_id as u64, + filepath: filepath?, + }) + }) + .collect(); + let l1_batch_with_metadata = storage_processor + .blocks_dal() + .get_l1_batch_metadata(l1_batch_number) + .await + .map_err(|err| internal_error(method_name, err))? + .ok_or_else(|| { + let err = format!("missing metadata for L1 batch #{l1_batch_number}"); + internal_error(method_name, err) + })?; + let (_, miniblock_number) = storage_processor + .blocks_dal() + .get_miniblock_range_of_l1_batch(l1_batch_number) + .await + .map_err(|err| internal_error(method_name, err))? + .ok_or_else(|| { + let err = format!("missing miniblocks for L1 batch #{l1_batch_number}"); + internal_error(method_name, err) + })?; + + method_latency.observe(); + Ok(Some(SnapshotHeader { + l1_batch_number: snapshot_metadata.l1_batch_number, + miniblock_number, + last_l1_batch_with_metadata: l1_batch_with_metadata, + storage_logs_chunks: chunks, + factory_deps_filepath: snapshot_metadata.factory_deps_filepath, + })) } } From 76fdf15b430779a9c1773e72cc104a9e30c9f990 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 11:34:34 +0200 Subject: [PATCH 12/19] Extract `SnapshotCreator` to module --- core/bin/snapshots_creator/src/creator.rs | 328 ++++++++++++++++++++++ core/bin/snapshots_creator/src/main.rs | 324 +-------------------- core/bin/snapshots_creator/src/tests.rs | 9 +- 3 files changed, 340 insertions(+), 321 deletions(-) create mode 100644 core/bin/snapshots_creator/src/creator.rs diff --git a/core/bin/snapshots_creator/src/creator.rs b/core/bin/snapshots_creator/src/creator.rs new file mode 100644 index 000000000000..71b3b7795e79 --- /dev/null +++ b/core/bin/snapshots_creator/src/creator.rs @@ -0,0 +1,328 @@ +//! [`SnapshotCreator`] and tightly related types. + +use anyhow::Context as _; +use tokio::sync::Semaphore; +use zksync_config::SnapshotsCreatorConfig; +use zksync_dal::{ConnectionPool, StorageProcessor}; +use zksync_object_store::ObjectStore; +use zksync_types::{ + snapshots::{ + SnapshotFactoryDependencies, SnapshotMetadata, SnapshotStorageLogsChunk, + SnapshotStorageLogsStorageKey, + }, + L1BatchNumber, MiniblockNumber, +}; +use zksync_utils::ceil_div; + +#[cfg(test)] +use crate::tests::HandleEvent; +use crate::{ + chunking::get_chunk_hashed_keys_range, + metrics::{FactoryDepsStage, StorageChunkStage, METRICS}, +}; + +/// Encapsulates progress of creating a particular storage snapshot. +#[derive(Debug)] +struct SnapshotProgress { + l1_batch_number: L1BatchNumber, + needs_persisting_factory_deps: bool, + chunk_count: u64, + remaining_chunk_ids: Vec, +} + +impl SnapshotProgress { + fn new(l1_batch_number: L1BatchNumber, chunk_count: u64) -> Self { + Self { + l1_batch_number, + needs_persisting_factory_deps: true, + chunk_count, + remaining_chunk_ids: (0..chunk_count).collect(), + } + } + + fn from_existing_snapshot(snapshot: &SnapshotMetadata) -> Self { + let remaining_chunk_ids = snapshot + .storage_logs_filepaths + .iter() + .enumerate() + .filter_map(|(chunk_id, path)| path.is_none().then_some(chunk_id as u64)) + .collect(); + + Self { + l1_batch_number: snapshot.l1_batch_number, + needs_persisting_factory_deps: false, + chunk_count: snapshot.storage_logs_filepaths.len() as u64, + remaining_chunk_ids, + } + } +} + +/// Creator of a single storage snapshot. +#[derive(Debug)] +pub(crate) struct SnapshotCreator { + pub blob_store: Box, + pub master_pool: ConnectionPool, + pub replica_pool: ConnectionPool, + #[cfg(test)] + pub event_listener: Box, +} + +impl SnapshotCreator { + async fn process_storage_logs_single_chunk( + &self, + semaphore: &Semaphore, + miniblock_number: MiniblockNumber, + l1_batch_number: L1BatchNumber, + chunk_id: u64, + chunk_count: u64, + ) -> anyhow::Result<()> { + let _permit = semaphore.acquire().await?; + #[cfg(test)] + if self.event_listener.on_chunk_started().should_exit() { + return Ok(()); + } + + let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") + .await?; + + let latency = + METRICS.storage_logs_processing_duration[&StorageChunkStage::LoadFromPostgres].start(); + let logs = conn + .snapshots_creator_dal() + .get_storage_logs_chunk(miniblock_number, hashed_keys_range) + .await + .context("Error fetching storage logs count")?; + drop(conn); + let latency = latency.observe(); + tracing::info!( + "Loaded chunk {chunk_id} ({} logs) from Postgres in {latency:?}", + logs.len() + ); + + let latency = + METRICS.storage_logs_processing_duration[&StorageChunkStage::SaveToGcs].start(); + let storage_logs_chunk = SnapshotStorageLogsChunk { storage_logs: logs }; + let key = SnapshotStorageLogsStorageKey { + l1_batch_number, + chunk_id, + }; + let filename = self + .blob_store + .put(key, &storage_logs_chunk) + .await + .context("Error storing storage logs chunk in blob store")?; + let output_filepath_prefix = self + .blob_store + .get_storage_prefix::(); + let output_filepath = format!("{output_filepath_prefix}/{filename}"); + let latency = latency.observe(); + + let mut master_conn = self + .master_pool + .access_storage_tagged("snapshots_creator") + .await?; + master_conn + .snapshots_dal() + .add_storage_logs_filepath_for_snapshot(l1_batch_number, chunk_id, &output_filepath) + .await?; + #[cfg(test)] + self.event_listener.on_chunk_saved(); + + let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; + tracing::info!( + "Saved chunk {chunk_id} (overall progress {}/{chunk_count}) in {latency:?} to location: {output_filepath}", + chunk_count - tasks_left as u64 + ); + Ok(()) + } + + async fn process_factory_deps( + &self, + miniblock_number: MiniblockNumber, + l1_batch_number: L1BatchNumber, + ) -> anyhow::Result { + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") + .await?; + + tracing::info!("Loading factory deps from Postgres..."); + let latency = + METRICS.factory_deps_processing_duration[&FactoryDepsStage::LoadFromPostgres].start(); + let factory_deps = conn + .snapshots_creator_dal() + .get_all_factory_deps(miniblock_number) + .await?; + drop(conn); + let latency = latency.observe(); + tracing::info!("Loaded {} factory deps in {latency:?}", factory_deps.len()); + + tracing::info!("Saving factory deps to GCS..."); + let latency = + METRICS.factory_deps_processing_duration[&FactoryDepsStage::SaveToGcs].start(); + let factory_deps = SnapshotFactoryDependencies { factory_deps }; + let filename = self + .blob_store + .put(l1_batch_number, &factory_deps) + .await + .context("Error storing factory deps in blob store")?; + let output_filepath_prefix = self + .blob_store + .get_storage_prefix::(); + let output_filepath = format!("{output_filepath_prefix}/{filename}"); + let latency = latency.observe(); + tracing::info!( + "Saved {} factory deps in {latency:?} to location: {output_filepath}", + factory_deps.factory_deps.len() + ); + + Ok(output_filepath) + } + + /// Returns `None` if the created snapshot would coincide with `latest_snapshot`. + async fn initialize_snapshot_progress( + config: &SnapshotsCreatorConfig, + min_chunk_count: u64, + latest_snapshot: Option<&SnapshotMetadata>, + conn: &mut StorageProcessor<'_>, + ) -> anyhow::Result> { + // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch + let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; + assert_ne!( + sealed_l1_batch_number, + L1BatchNumber(0), + "Cannot create snapshot when only the genesis L1 batch is present in Postgres" + ); + let l1_batch_number = sealed_l1_batch_number - 1; + + let latest_snapshot_l1_batch_number = + latest_snapshot.map(|snapshot| snapshot.l1_batch_number); + if latest_snapshot_l1_batch_number == Some(l1_batch_number) { + tracing::info!( + "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" + ); + return Ok(None); + } + + let distinct_storage_logs_keys_count = conn + .snapshots_creator_dal() + .get_distinct_storage_logs_keys_count(l1_batch_number) + .await?; + let chunk_size = config.storage_logs_chunk_size; + // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. + let chunk_count = + ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); + + tracing::info!( + "Selected storage logs chunking for L1 batch {l1_batch_number}: \ + {chunk_count} chunks of expected size {chunk_size}" + ); + Ok(Some(SnapshotProgress::new(l1_batch_number, chunk_count))) + } + + pub async fn run( + self, + config: SnapshotsCreatorConfig, + min_chunk_count: u64, + ) -> anyhow::Result<()> { + let latency = METRICS.snapshot_generation_duration.start(); + + let mut conn = self + .replica_pool + .access_storage_tagged("snapshots_creator") + .await?; + let mut master_conn = self + .master_pool + .access_storage_tagged("snapshots_creator") + .await?; + let latest_snapshot = master_conn + .snapshots_dal() + .get_newest_snapshot_metadata() + .await?; + drop(master_conn); + + let pending_snapshot = latest_snapshot + .as_ref() + .filter(|snapshot| !snapshot.is_complete()); + let progress = if let Some(snapshot) = pending_snapshot { + SnapshotProgress::from_existing_snapshot(snapshot) + } else { + let progress = Self::initialize_snapshot_progress( + &config, + min_chunk_count, + latest_snapshot.as_ref(), + &mut conn, + ) + .await?; + + match progress { + Some(progress) => progress, + None => return Ok(()), + } + }; + + let (_, last_miniblock_number_in_batch) = conn + .blocks_dal() + .get_miniblock_range_of_l1_batch(progress.l1_batch_number) + .await? + .context("Error fetching last miniblock number")?; + drop(conn); + + METRICS.storage_logs_chunks_count.set(progress.chunk_count); + tracing::info!( + "Creating snapshot for storage logs up to miniblock {last_miniblock_number_in_batch}, \ + L1 batch {}", + progress.l1_batch_number + ); + + if progress.needs_persisting_factory_deps { + let factory_deps_output_file = self + .process_factory_deps(last_miniblock_number_in_batch, progress.l1_batch_number) + .await?; + + let mut master_conn = self + .master_pool + .access_storage_tagged("snapshots_creator") + .await?; + master_conn + .snapshots_dal() + .add_snapshot( + progress.l1_batch_number, + progress.chunk_count, + &factory_deps_output_file, + ) + .await?; + } + + METRICS + .storage_logs_chunks_left_to_process + .set(progress.remaining_chunk_ids.len()); + let semaphore = Semaphore::new(config.concurrent_queries_count as usize); + let tasks = progress.remaining_chunk_ids.into_iter().map(|chunk_id| { + self.process_storage_logs_single_chunk( + &semaphore, + last_miniblock_number_in_batch, + progress.l1_batch_number, + chunk_id, + progress.chunk_count, + ) + }); + futures::future::try_join_all(tasks).await?; + + METRICS + .snapshot_l1_batch + .set(progress.l1_batch_number.0.into()); + + let elapsed = latency.observe(); + tracing::info!("snapshot_generation_duration: {elapsed:?}"); + tracing::info!("snapshot_l1_batch: {}", METRICS.snapshot_l1_batch.get()); + tracing::info!( + "storage_logs_chunks_count: {}", + METRICS.storage_logs_chunks_count.get() + ); + Ok(()) + } +} diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index 4f61632b8a5a..f1a9e75c8611 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -2,28 +2,16 @@ use anyhow::Context as _; use prometheus_exporter::PrometheusExporterConfig; -use tokio::sync::{watch, Semaphore}; +use tokio::sync::watch; use zksync_config::{configs::PrometheusConfig, PostgresConfig, SnapshotsCreatorConfig}; -use zksync_dal::{ConnectionPool, StorageProcessor}; +use zksync_dal::ConnectionPool; use zksync_env_config::{object_store::SnapshotsObjectStoreConfig, FromEnv}; -use zksync_object_store::{ObjectStore, ObjectStoreFactory}; -use zksync_types::{ - snapshots::{ - SnapshotFactoryDependencies, SnapshotMetadata, SnapshotStorageLogsChunk, - SnapshotStorageLogsStorageKey, - }, - L1BatchNumber, MiniblockNumber, -}; -use zksync_utils::ceil_div; +use zksync_object_store::ObjectStoreFactory; -#[cfg(test)] -use crate::tests::HandleEvent; -use crate::{ - chunking::get_chunk_hashed_keys_range, - metrics::{FactoryDepsStage, StorageChunkStage, METRICS}, -}; +use crate::creator::SnapshotCreator; mod chunking; +mod creator; mod metrics; #[cfg(test)] mod tests; @@ -46,308 +34,6 @@ async fn maybe_enable_prometheus_metrics( Ok(()) } -/// Encapsulates progress of creating a particular storage snapshot. -#[derive(Debug)] -struct SnapshotProgress { - l1_batch_number: L1BatchNumber, - needs_persisting_factory_deps: bool, - chunk_count: u64, - remaining_chunk_ids: Vec, -} - -impl SnapshotProgress { - fn new(l1_batch_number: L1BatchNumber, chunk_count: u64) -> Self { - Self { - l1_batch_number, - needs_persisting_factory_deps: true, - chunk_count, - remaining_chunk_ids: (0..chunk_count).collect(), - } - } - - fn from_existing_snapshot(snapshot: &SnapshotMetadata) -> Self { - let remaining_chunk_ids = snapshot - .storage_logs_filepaths - .iter() - .enumerate() - .filter_map(|(chunk_id, path)| path.is_none().then_some(chunk_id as u64)) - .collect(); - - Self { - l1_batch_number: snapshot.l1_batch_number, - needs_persisting_factory_deps: false, - chunk_count: snapshot.storage_logs_filepaths.len() as u64, - remaining_chunk_ids, - } - } -} - -/// Creator of a single storage snapshot. -#[derive(Debug)] -struct SnapshotCreator { - blob_store: Box, - master_pool: ConnectionPool, - replica_pool: ConnectionPool, - #[cfg(test)] - event_listener: Box, -} - -impl SnapshotCreator { - async fn process_storage_logs_single_chunk( - &self, - semaphore: &Semaphore, - miniblock_number: MiniblockNumber, - l1_batch_number: L1BatchNumber, - chunk_id: u64, - chunk_count: u64, - ) -> anyhow::Result<()> { - let _permit = semaphore.acquire().await?; - #[cfg(test)] - if self.event_listener.on_chunk_started().should_exit() { - return Ok(()); - } - - let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; - - let latency = - METRICS.storage_logs_processing_duration[&StorageChunkStage::LoadFromPostgres].start(); - let logs = conn - .snapshots_creator_dal() - .get_storage_logs_chunk(miniblock_number, hashed_keys_range) - .await - .context("Error fetching storage logs count")?; - drop(conn); - let latency = latency.observe(); - tracing::info!( - "Loaded chunk {chunk_id} ({} logs) from Postgres in {latency:?}", - logs.len() - ); - - let latency = - METRICS.storage_logs_processing_duration[&StorageChunkStage::SaveToGcs].start(); - let storage_logs_chunk = SnapshotStorageLogsChunk { storage_logs: logs }; - let key = SnapshotStorageLogsStorageKey { - l1_batch_number, - chunk_id, - }; - let filename = self - .blob_store - .put(key, &storage_logs_chunk) - .await - .context("Error storing storage logs chunk in blob store")?; - let output_filepath_prefix = self - .blob_store - .get_storage_prefix::(); - let output_filepath = format!("{output_filepath_prefix}/{filename}"); - let latency = latency.observe(); - - let mut master_conn = self - .master_pool - .access_storage_tagged("snapshots_creator") - .await?; - master_conn - .snapshots_dal() - .add_storage_logs_filepath_for_snapshot(l1_batch_number, chunk_id, &output_filepath) - .await?; - #[cfg(test)] - self.event_listener.on_chunk_saved(); - - let tasks_left = METRICS.storage_logs_chunks_left_to_process.dec_by(1) - 1; - tracing::info!( - "Saved chunk {chunk_id} (overall progress {}/{chunk_count}) in {latency:?} to location: {output_filepath}", - chunk_count - tasks_left as u64 - ); - Ok(()) - } - - async fn process_factory_deps( - &self, - miniblock_number: MiniblockNumber, - l1_batch_number: L1BatchNumber, - ) -> anyhow::Result { - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; - - tracing::info!("Loading factory deps from Postgres..."); - let latency = - METRICS.factory_deps_processing_duration[&FactoryDepsStage::LoadFromPostgres].start(); - let factory_deps = conn - .snapshots_creator_dal() - .get_all_factory_deps(miniblock_number) - .await?; - drop(conn); - let latency = latency.observe(); - tracing::info!("Loaded {} factory deps in {latency:?}", factory_deps.len()); - - tracing::info!("Saving factory deps to GCS..."); - let latency = - METRICS.factory_deps_processing_duration[&FactoryDepsStage::SaveToGcs].start(); - let factory_deps = SnapshotFactoryDependencies { factory_deps }; - let filename = self - .blob_store - .put(l1_batch_number, &factory_deps) - .await - .context("Error storing factory deps in blob store")?; - let output_filepath_prefix = self - .blob_store - .get_storage_prefix::(); - let output_filepath = format!("{output_filepath_prefix}/{filename}"); - let latency = latency.observe(); - tracing::info!( - "Saved {} factory deps in {latency:?} to location: {output_filepath}", - factory_deps.factory_deps.len() - ); - - Ok(output_filepath) - } - - /// Returns `None` iff the created snapshot would coincide with `latest_snapshot`. - async fn initialize_snapshot_progress( - config: &SnapshotsCreatorConfig, - min_chunk_count: u64, - latest_snapshot: Option<&SnapshotMetadata>, - conn: &mut StorageProcessor<'_>, - ) -> anyhow::Result> { - // We subtract 1 so that after restore, EN node has at least one L1 batch to fetch - let sealed_l1_batch_number = conn.blocks_dal().get_sealed_l1_batch_number().await?; - assert_ne!( - sealed_l1_batch_number, - L1BatchNumber(0), - "Cannot create snapshot when only the genesis L1 batch is present in Postgres" - ); - let l1_batch_number = sealed_l1_batch_number - 1; - - let latest_snapshot_l1_batch_number = - latest_snapshot.map(|snapshot| snapshot.l1_batch_number); - if latest_snapshot_l1_batch_number == Some(l1_batch_number) { - tracing::info!( - "Snapshot at expected L1 batch #{l1_batch_number} is already created; exiting" - ); - return Ok(None); - } - - let distinct_storage_logs_keys_count = conn - .snapshots_creator_dal() - .get_distinct_storage_logs_keys_count(l1_batch_number) - .await?; - let chunk_size = config.storage_logs_chunk_size; - // We force the minimum number of chunks to avoid situations where only one chunk is created in tests. - let chunk_count = - ceil_div(distinct_storage_logs_keys_count, chunk_size).max(min_chunk_count); - - tracing::info!( - "Selected storage logs chunking for L1 batch {l1_batch_number}: \ - {chunk_count} chunks of expected size {chunk_size}" - ); - Ok(Some(SnapshotProgress::new(l1_batch_number, chunk_count))) - } - - async fn run(self, config: SnapshotsCreatorConfig, min_chunk_count: u64) -> anyhow::Result<()> { - let latency = METRICS.snapshot_generation_duration.start(); - - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; - let mut master_conn = self - .master_pool - .access_storage_tagged("snapshots_creator") - .await?; - let latest_snapshot = master_conn - .snapshots_dal() - .get_newest_snapshot_metadata() - .await?; - drop(master_conn); - - let pending_snapshot = latest_snapshot - .as_ref() - .filter(|snapshot| !snapshot.is_complete()); - let progress = if let Some(snapshot) = pending_snapshot { - SnapshotProgress::from_existing_snapshot(snapshot) - } else { - let progress = Self::initialize_snapshot_progress( - &config, - min_chunk_count, - latest_snapshot.as_ref(), - &mut conn, - ) - .await?; - - match progress { - Some(progress) => progress, - None => return Ok(()), - } - }; - - let (_, last_miniblock_number_in_batch) = conn - .blocks_dal() - .get_miniblock_range_of_l1_batch(progress.l1_batch_number) - .await? - .context("Error fetching last miniblock number")?; - drop(conn); - - METRICS.storage_logs_chunks_count.set(progress.chunk_count); - tracing::info!( - "Creating snapshot for storage logs up to miniblock {last_miniblock_number_in_batch}, \ - L1 batch {}", - progress.l1_batch_number - ); - - if progress.needs_persisting_factory_deps { - let factory_deps_output_file = self - .process_factory_deps(last_miniblock_number_in_batch, progress.l1_batch_number) - .await?; - - let mut master_conn = self - .master_pool - .access_storage_tagged("snapshots_creator") - .await?; - master_conn - .snapshots_dal() - .add_snapshot( - progress.l1_batch_number, - progress.chunk_count, - &factory_deps_output_file, - ) - .await?; - } - - METRICS - .storage_logs_chunks_left_to_process - .set(progress.remaining_chunk_ids.len()); - let semaphore = Semaphore::new(config.concurrent_queries_count as usize); - let tasks = progress.remaining_chunk_ids.into_iter().map(|chunk_id| { - self.process_storage_logs_single_chunk( - &semaphore, - last_miniblock_number_in_batch, - progress.l1_batch_number, - chunk_id, - progress.chunk_count, - ) - }); - futures::future::try_join_all(tasks).await?; - - METRICS - .snapshot_l1_batch - .set(progress.l1_batch_number.0.into()); - - let elapsed = latency.observe(); - tracing::info!("snapshot_generation_duration: {elapsed:?}"); - tracing::info!("snapshot_l1_batch: {}", METRICS.snapshot_l1_batch.get()); - tracing::info!( - "storage_logs_chunks_count: {}", - METRICS.storage_logs_chunks_count.get() - ); - Ok(()) - } -} - /// Minimum number of storage log chunks to produce. const MIN_CHUNK_COUNT: u64 = 10; diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index e69b01c445e8..3cca23b5dae8 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -8,10 +8,15 @@ use std::{ use rand::{thread_rng, Rng}; use zksync_dal::StorageProcessor; +use zksync_object_store::ObjectStore; use zksync_types::{ block::{BlockGasCount, L1BatchHeader, MiniblockHeader}, - snapshots::{SnapshotFactoryDependency, SnapshotStorageLog}, - AccountTreeId, Address, ProtocolVersion, StorageKey, StorageLog, H256, + snapshots::{ + SnapshotFactoryDependencies, SnapshotFactoryDependency, SnapshotStorageLog, + SnapshotStorageLogsChunk, SnapshotStorageLogsStorageKey, + }, + AccountTreeId, Address, L1BatchNumber, MiniblockNumber, ProtocolVersion, StorageKey, + StorageLog, H256, }; use super::*; From afa5754a4479d207abe9654bd407c802d4b30803 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 14:11:14 +0200 Subject: [PATCH 13/19] Revert extra traits for `SnapshotStorageLogsStorageKey` --- core/lib/object_store/src/objects.rs | 5 ++- core/lib/types/src/snapshots.rs | 60 ---------------------------- 2 files changed, 4 insertions(+), 61 deletions(-) diff --git a/core/lib/object_store/src/objects.rs b/core/lib/object_store/src/objects.rs index 3f6469b9827d..89241c7edf7c 100644 --- a/core/lib/object_store/src/objects.rs +++ b/core/lib/object_store/src/objects.rs @@ -101,7 +101,10 @@ impl StoredObject for SnapshotStorageLogsChunk { type Key<'a> = SnapshotStorageLogsStorageKey; fn encode_key(key: Self::Key<'_>) -> String { - key.to_string() + format!( + "snapshot_l1_batch_{}_storage_logs_part_{:0>4}.json.gzip", + key.l1_batch_number, key.chunk_id + ) } //TODO use better language agnostic serialization format like protobuf diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index a0874df4882e..d18de1fb3017 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -1,6 +1,3 @@ -use std::{fmt, str::FromStr}; - -use anyhow::Context as _; use serde::{Deserialize, Serialize}; use zksync_basic_types::{L1BatchNumber, MiniblockNumber}; @@ -58,42 +55,6 @@ pub struct SnapshotStorageLogsStorageKey { pub chunk_id: u64, } -impl fmt::Display for SnapshotStorageLogsStorageKey { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - formatter, - "snapshot_l1_batch_{}_storage_logs_part_{:0>4}.json.gzip", - self.l1_batch_number, self.chunk_id - ) - } -} - -impl FromStr for SnapshotStorageLogsStorageKey { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - let s = s - .strip_prefix("snapshot_l1_batch_") - .context("missing `snapshot_l1_batch_` prefix")?; - let underscore_pos = s.find('_').context("missing '_' char")?; - let (l1_batch_str, s) = s.split_at(underscore_pos); - let l1_batch_number = L1BatchNumber(l1_batch_str.parse().context("invalid L1 batch")?); - - let s = s - .strip_prefix("_storage_logs_part_") - .context("missing `_storage_logs_part_`")?; - let s = s - .strip_suffix(".json.gzip") - .context("missing `.json.gzip` suffix")?; - let chunk_id = s.parse().context("invalid chunk ID")?; - - Ok(Self { - l1_batch_number, - chunk_id, - }) - } -} - #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SnapshotStorageLogsChunk { @@ -120,24 +81,3 @@ pub struct SnapshotFactoryDependencies { pub struct SnapshotFactoryDependency { pub bytecode: Bytes, } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn displaying_and_parsing_storage_logs_key() { - let key = SnapshotStorageLogsStorageKey { - l1_batch_number: L1BatchNumber(12345), - chunk_id: 42, - }; - let key_str = key.to_string(); - assert_eq!( - key_str, - "snapshot_l1_batch_12345_storage_logs_part_0042.json.gzip" - ); - - let parsed_key: SnapshotStorageLogsStorageKey = key_str.parse().unwrap(); - assert_eq!(parsed_key, key); - } -} From 5dbfc5354f6cabc801a2a89030f881074ea9aaf2 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 15:43:16 +0200 Subject: [PATCH 14/19] Rename `SnapshotProgress.is_new_snapshot` --- core/bin/snapshots_creator/src/creator.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/bin/snapshots_creator/src/creator.rs b/core/bin/snapshots_creator/src/creator.rs index 71b3b7795e79..08b562c1a738 100644 --- a/core/bin/snapshots_creator/src/creator.rs +++ b/core/bin/snapshots_creator/src/creator.rs @@ -25,7 +25,8 @@ use crate::{ #[derive(Debug)] struct SnapshotProgress { l1_batch_number: L1BatchNumber, - needs_persisting_factory_deps: bool, + /// `true` if the snapshot is new (i.e., its progress is not recovered from Postgres). + is_new_snapshot: bool, chunk_count: u64, remaining_chunk_ids: Vec, } @@ -34,7 +35,7 @@ impl SnapshotProgress { fn new(l1_batch_number: L1BatchNumber, chunk_count: u64) -> Self { Self { l1_batch_number, - needs_persisting_factory_deps: true, + is_new_snapshot: true, chunk_count, remaining_chunk_ids: (0..chunk_count).collect(), } @@ -50,7 +51,7 @@ impl SnapshotProgress { Self { l1_batch_number: snapshot.l1_batch_number, - needs_persisting_factory_deps: false, + is_new_snapshot: false, chunk_count: snapshot.storage_logs_filepaths.len() as u64, remaining_chunk_ids, } @@ -278,7 +279,7 @@ impl SnapshotCreator { progress.l1_batch_number ); - if progress.needs_persisting_factory_deps { + if progress.is_new_snapshot { let factory_deps_output_file = self .process_factory_deps(last_miniblock_number_in_batch, progress.l1_batch_number) .await?; From 598de62b835976c0d0978fd86e3d06aa958f331b Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Fri, 15 Dec 2023 16:17:29 +0200 Subject: [PATCH 15/19] Split `AllSnapshots` into complete / incomplete --- core/bin/snapshots_creator/src/main.rs | 9 +++ core/lib/dal/sqlx-data.json | 42 ++++++++------ core/lib/dal/src/snapshots_dal.rs | 57 +++++++++++++++---- core/lib/types/src/snapshots.rs | 13 ++++- .../api_server/web3/namespaces/snapshots.rs | 2 + .../src/api_server/web3/tests/snapshots.rs | 17 +++++- 6 files changed, 107 insertions(+), 33 deletions(-) diff --git a/core/bin/snapshots_creator/src/main.rs b/core/bin/snapshots_creator/src/main.rs index f1a9e75c8611..f81a4f75d091 100644 --- a/core/bin/snapshots_creator/src/main.rs +++ b/core/bin/snapshots_creator/src/main.rs @@ -1,4 +1,13 @@ //! Snapshot creator utility. Intended to run on a schedule, with each run creating a new snapshot. +//! +//! # Assumptions +//! +//! The snapshot creator is fault-tolerant; if it stops in the middle of creating a snapshot, +//! this snapshot will be continued from roughly the same point after the restart. If this is +//! undesired, remove the `snapshots` table record corresponding to the pending snapshot. +//! +//! It is assumed that the snapshot creator is run as a singleton process (no more than 1 instance +//! at a time). use anyhow::Context as _; use prometheus_exporter::PrometheusExporterConfig; diff --git a/core/lib/dal/sqlx-data.json b/core/lib/dal/sqlx-data.json index 0de7ab05a7c1..b045400732d6 100644 --- a/core/lib/dal/sqlx-data.json +++ b/core/lib/dal/sqlx-data.json @@ -623,6 +623,30 @@ }, "query": "\n SELECT COUNT(*) as \"count!\", status as \"status!\"\n FROM prover_jobs\n GROUP BY status\n " }, + "117757bfbe10fd36a171bd4511210b0a5a4b150156c13b89e44484dbe2c8e4a0": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + }, + { + "name": "is_pending!", + "ordinal": 1, + "type_info": "Bool" + } + ], + "nullable": [ + false, + null + ], + "parameters": { + "Left": [] + } + }, + "query": "SELECT l1_batch_number, ''::text = ANY(storage_logs_filepaths) AS \"is_pending!\" FROM snapshots ORDER BY l1_batch_number DESC" + }, "13e5f6a2a73eaa979229611ffdbed86d6e5e1bad0c645d39b56fdc47f5c17971": { "describe": { "columns": [ @@ -10584,24 +10608,6 @@ }, "query": "DELETE FROM transactions WHERE in_mempool = TRUE AND initiator_address = ANY($1)" }, - "e9517bcf2f0b01ffd82dc8c1d7da84b3759ef4739e0e89e509b16a43c61147fb": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - } - ], - "nullable": [ - false - ], - "parameters": { - "Left": [] - } - }, - "query": "SELECT l1_batch_number FROM snapshots ORDER BY l1_batch_number" - }, "e9b03a0d79eb40a67eab9bdaac8447fc17922bea89bcc6a89eb8eadf147835fe": { "describe": { "columns": [], diff --git a/core/lib/dal/src/snapshots_dal.rs b/core/lib/dal/src/snapshots_dal.rs index 6beebb56034f..cd4ae5041e2a 100644 --- a/core/lib/dal/src/snapshots_dal.rs +++ b/core/lib/dal/src/snapshots_dal.rs @@ -77,18 +77,31 @@ impl SnapshotsDal<'_, '_> { } pub async fn get_all_snapshots(&mut self) -> sqlx::Result { - let records = - sqlx::query!("SELECT l1_batch_number FROM snapshots ORDER BY l1_batch_number") - .instrument("get_all_snapshots") - .report_latency() - .fetch_all(self.storage.conn()) - .await? - .into_iter() - .map(|row| L1BatchNumber(row.l1_batch_number as u32)) - .collect(); + let rows = sqlx::query!( + "SELECT l1_batch_number, ''::text = ANY(storage_logs_filepaths) AS \"is_pending!\" \ + FROM snapshots \ + ORDER BY l1_batch_number DESC" + ) + .instrument("get_all_snapshots") + .report_latency() + .fetch_all(self.storage.conn()) + .await?; + + // Normally, there should be no more than 1 pending snapshot, so we preallocate `Vec`s accordingly. + let mut snapshots_l1_batch_numbers = Vec::with_capacity(rows.len()); + let mut pending_snapshots_l1_batch_numbers = vec![]; + for row in rows { + let l1_batch_number = L1BatchNumber(row.l1_batch_number as u32); + if row.is_pending { + pending_snapshots_l1_batch_numbers.push(l1_batch_number); + } else { + snapshots_l1_batch_numbers.push(l1_batch_number); + } + } Ok(AllSnapshots { - snapshots_l1_batch_numbers: records, + snapshots_l1_batch_numbers, + incomplete_snapshots_l1_batch_numbers: pending_snapshots_l1_batch_numbers, }) } @@ -147,8 +160,28 @@ mod tests { .get_all_snapshots() .await .expect("Failed to retrieve snapshots"); - assert_eq!(1, snapshots.snapshots_l1_batch_numbers.len()); - assert_eq!(snapshots.snapshots_l1_batch_numbers[0], l1_batch_number); + assert_eq!(snapshots.snapshots_l1_batch_numbers, []); + assert_eq!( + snapshots.incomplete_snapshots_l1_batch_numbers, + [l1_batch_number] + ); + + for i in 0..2 { + dal.add_storage_logs_filepath_for_snapshot( + l1_batch_number, + i, + "gs:///bucket/chunk.bin", + ) + .await + .unwrap(); + } + + let snapshots = dal + .get_all_snapshots() + .await + .expect("Failed to retrieve snapshots"); + assert_eq!(snapshots.snapshots_l1_batch_numbers, [l1_batch_number]); + assert_eq!(snapshots.incomplete_snapshots_l1_batch_numbers, []); let snapshot_metadata = dal .get_snapshot_metadata(l1_batch_number) diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index d18de1fb3017..ec192a02b179 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -3,10 +3,17 @@ use zksync_basic_types::{L1BatchNumber, MiniblockNumber}; use crate::{commitment::L1BatchWithMetadata, Bytes, StorageKey, StorageValue}; +/// Information about all snapshots persisted by the node. #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct AllSnapshots { + /// L1 batch numbers for complete snapshots. Ordered by descending number (i.e., 0th element + /// corresponds to the newest snapshot). pub snapshots_l1_batch_numbers: Vec, + /// Same as `snapshots_l1_batch_numbers`, but for incomplete snapshots (ones in which some + /// storage log chunks may be missing). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub incomplete_snapshots_l1_batch_numbers: Vec, } /// Storage snapshot metadata. Used in DAL to fetch certain snapshot data. @@ -28,13 +35,15 @@ impl SnapshotMetadata { } } -//contains all data not contained in factory_deps/storage_logs files to perform restore process +/// Snapshot data returned by using JSON-RPC API. +/// Contains all data not contained in factory deps / storage logs files to perform restore process. #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SnapshotHeader { pub l1_batch_number: L1BatchNumber, pub miniblock_number: MiniblockNumber, - //ordered by chunk ids + pub is_complete: bool, + /// Ordered by chunk IDs. pub storage_logs_chunks: Vec, pub factory_deps_filepath: String, pub last_l1_batch_with_metadata: L1BatchWithMetadata, diff --git a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs index 170c368ec13f..91605c6ed544 100644 --- a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs @@ -70,6 +70,7 @@ impl SnapshotsNamespace { }; let snapshot_files = snapshot_metadata.storage_logs_filepaths; + let is_complete = snapshot_files.iter().all(Option::is_some); let chunks = snapshot_files .into_iter() .enumerate() @@ -103,6 +104,7 @@ impl SnapshotsNamespace { Ok(Some(SnapshotHeader { l1_batch_number: snapshot_metadata.l1_batch_number, miniblock_number, + is_complete, last_l1_batch_with_metadata: l1_batch_with_metadata, storage_logs_chunks: chunks, factory_deps_filepath: snapshot_metadata.factory_deps_filepath, diff --git a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs index 474358e81193..2e42e351677a 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs @@ -48,6 +48,10 @@ impl SnapshotBasics { assert!(chunk_ids.iter().all(|&id| id < Self::CHUNK_COUNT)); Self { chunk_ids } } + + fn is_complete_snapshot(&self) -> bool { + self.chunk_ids == HashSet::from_iter(0..Self::CHUNK_COUNT) + } } #[async_trait] @@ -70,13 +74,24 @@ impl HttpTest for SnapshotBasics { } let all_snapshots = client.get_all_snapshots().await?; - assert_eq!(all_snapshots.snapshots_l1_batch_numbers, [L1BatchNumber(1)]); + if self.is_complete_snapshot() { + assert_eq!(all_snapshots.snapshots_l1_batch_numbers, [L1BatchNumber(1)]); + assert_eq!(all_snapshots.incomplete_snapshots_l1_batch_numbers, []); + } else { + assert_eq!(all_snapshots.snapshots_l1_batch_numbers, []); + assert_eq!( + all_snapshots.incomplete_snapshots_l1_batch_numbers, + [L1BatchNumber(1)] + ); + } + let snapshot_header = client .get_snapshot_by_l1_batch_number(L1BatchNumber(1)) .await? .context("no snapshot for L1 batch #1")?; assert_eq!(snapshot_header.l1_batch_number, L1BatchNumber(1)); assert_eq!(snapshot_header.miniblock_number, MiniblockNumber(1)); + assert_eq!(snapshot_header.is_complete, self.is_complete_snapshot()); assert_eq!( snapshot_header.factory_deps_filepath, "file:///factory_deps" From d3127380fd0b4b6a1d6738ef5a6b180fbbf3b761 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Mon, 18 Dec 2023 10:16:21 +0200 Subject: [PATCH 16/19] Do not expose incomplete snapshots in API --- core/bin/snapshots_creator/src/tests.rs | 6 ++- core/lib/dal/sqlx-data.json | 42 ++++++++----------- core/lib/dal/src/snapshots_dal.rs | 32 +++++--------- core/lib/types/src/snapshots.rs | 5 --- .../api_server/web3/namespaces/snapshots.rs | 9 +++- .../src/api_server/web3/tests/snapshots.rs | 16 +++---- 6 files changed, 48 insertions(+), 62 deletions(-) diff --git a/core/bin/snapshots_creator/src/tests.rs b/core/bin/snapshots_creator/src/tests.rs index 3cca23b5dae8..0e74e4773bcc 100644 --- a/core/bin/snapshots_creator/src/tests.rs +++ b/core/bin/snapshots_creator/src/tests.rs @@ -255,7 +255,11 @@ async fn persisting_snapshot_metadata() { .unwrap(); // Check snapshot metadata in Postgres. - let snapshots = conn.snapshots_dal().get_all_snapshots().await.unwrap(); + let snapshots = conn + .snapshots_dal() + .get_all_complete_snapshots() + .await + .unwrap(); assert_eq!(snapshots.snapshots_l1_batch_numbers.len(), 1); let snapshot_l1_batch_number = snapshots.snapshots_l1_batch_numbers[0]; assert_eq!(snapshot_l1_batch_number, L1BatchNumber(8)); diff --git a/core/lib/dal/sqlx-data.json b/core/lib/dal/sqlx-data.json index b045400732d6..d8acc701852f 100644 --- a/core/lib/dal/sqlx-data.json +++ b/core/lib/dal/sqlx-data.json @@ -623,30 +623,6 @@ }, "query": "\n SELECT COUNT(*) as \"count!\", status as \"status!\"\n FROM prover_jobs\n GROUP BY status\n " }, - "117757bfbe10fd36a171bd4511210b0a5a4b150156c13b89e44484dbe2c8e4a0": { - "describe": { - "columns": [ - { - "name": "l1_batch_number", - "ordinal": 0, - "type_info": "Int8" - }, - { - "name": "is_pending!", - "ordinal": 1, - "type_info": "Bool" - } - ], - "nullable": [ - false, - null - ], - "parameters": { - "Left": [] - } - }, - "query": "SELECT l1_batch_number, ''::text = ANY(storage_logs_filepaths) AS \"is_pending!\" FROM snapshots ORDER BY l1_batch_number DESC" - }, "13e5f6a2a73eaa979229611ffdbed86d6e5e1bad0c645d39b56fdc47f5c17971": { "describe": { "columns": [ @@ -5680,6 +5656,24 @@ }, "query": "INSERT INTO factory_deps (bytecode_hash, bytecode, miniblock_number, created_at, updated_at) SELECT u.bytecode_hash, u.bytecode, $3, now(), now() FROM UNNEST($1::bytea[], $2::bytea[]) AS u(bytecode_hash, bytecode) ON CONFLICT (bytecode_hash) DO NOTHING" }, + "71e3b3d51b286ed3712848e7f03428461ab546fe7a005a5327f6a2fcacd7ff16": { + "describe": { + "columns": [ + { + "name": "l1_batch_number", + "ordinal": 0, + "type_info": "Int8" + } + ], + "nullable": [ + false + ], + "parameters": { + "Left": [] + } + }, + "query": "SELECT l1_batch_number FROM snapshots WHERE NOT (''::text = ANY(storage_logs_filepaths)) ORDER BY l1_batch_number DESC" + }, "741b13b0a4769a30186c650a4a1b24855806a27ccd8d5a50594741842dde44ec": { "describe": { "columns": [ diff --git a/core/lib/dal/src/snapshots_dal.rs b/core/lib/dal/src/snapshots_dal.rs index cd4ae5041e2a..8b4c13cca566 100644 --- a/core/lib/dal/src/snapshots_dal.rs +++ b/core/lib/dal/src/snapshots_dal.rs @@ -76,32 +76,25 @@ impl SnapshotsDal<'_, '_> { Ok(()) } - pub async fn get_all_snapshots(&mut self) -> sqlx::Result { + pub async fn get_all_complete_snapshots(&mut self) -> sqlx::Result { let rows = sqlx::query!( - "SELECT l1_batch_number, ''::text = ANY(storage_logs_filepaths) AS \"is_pending!\" \ + "SELECT l1_batch_number \ FROM snapshots \ + WHERE NOT (''::text = ANY(storage_logs_filepaths)) \ ORDER BY l1_batch_number DESC" ) - .instrument("get_all_snapshots") + .instrument("get_all_complete_snapshots") .report_latency() .fetch_all(self.storage.conn()) .await?; - // Normally, there should be no more than 1 pending snapshot, so we preallocate `Vec`s accordingly. - let mut snapshots_l1_batch_numbers = Vec::with_capacity(rows.len()); - let mut pending_snapshots_l1_batch_numbers = vec![]; - for row in rows { - let l1_batch_number = L1BatchNumber(row.l1_batch_number as u32); - if row.is_pending { - pending_snapshots_l1_batch_numbers.push(l1_batch_number); - } else { - snapshots_l1_batch_numbers.push(l1_batch_number); - } - } + let snapshots_l1_batch_numbers = rows + .into_iter() + .map(|row| L1BatchNumber(row.l1_batch_number as u32)) + .collect(); Ok(AllSnapshots { snapshots_l1_batch_numbers, - incomplete_snapshots_l1_batch_numbers: pending_snapshots_l1_batch_numbers, }) } @@ -157,14 +150,10 @@ mod tests { .expect("Failed to add snapshot"); let snapshots = dal - .get_all_snapshots() + .get_all_complete_snapshots() .await .expect("Failed to retrieve snapshots"); assert_eq!(snapshots.snapshots_l1_batch_numbers, []); - assert_eq!( - snapshots.incomplete_snapshots_l1_batch_numbers, - [l1_batch_number] - ); for i in 0..2 { dal.add_storage_logs_filepath_for_snapshot( @@ -177,11 +166,10 @@ mod tests { } let snapshots = dal - .get_all_snapshots() + .get_all_complete_snapshots() .await .expect("Failed to retrieve snapshots"); assert_eq!(snapshots.snapshots_l1_batch_numbers, [l1_batch_number]); - assert_eq!(snapshots.incomplete_snapshots_l1_batch_numbers, []); let snapshot_metadata = dal .get_snapshot_metadata(l1_batch_number) diff --git a/core/lib/types/src/snapshots.rs b/core/lib/types/src/snapshots.rs index ec192a02b179..079fa23e2843 100644 --- a/core/lib/types/src/snapshots.rs +++ b/core/lib/types/src/snapshots.rs @@ -10,10 +10,6 @@ pub struct AllSnapshots { /// L1 batch numbers for complete snapshots. Ordered by descending number (i.e., 0th element /// corresponds to the newest snapshot). pub snapshots_l1_batch_numbers: Vec, - /// Same as `snapshots_l1_batch_numbers`, but for incomplete snapshots (ones in which some - /// storage log chunks may be missing). - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub incomplete_snapshots_l1_batch_numbers: Vec, } /// Storage snapshot metadata. Used in DAL to fetch certain snapshot data. @@ -42,7 +38,6 @@ impl SnapshotMetadata { pub struct SnapshotHeader { pub l1_batch_number: L1BatchNumber, pub miniblock_number: MiniblockNumber, - pub is_complete: bool, /// Ordered by chunk IDs. pub storage_logs_chunks: Vec, pub factory_deps_filepath: String, diff --git a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs index 91605c6ed544..7ebc0a9fff13 100644 --- a/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/namespaces/snapshots.rs @@ -39,7 +39,7 @@ impl SnapshotsNamespace { .map_err(|err| internal_error(method_name, err))?; let mut snapshots_dal = storage_processor.snapshots_dal(); let response = snapshots_dal - .get_all_snapshots() + .get_all_complete_snapshots() .await .map_err(|err| internal_error(method_name, err)); method_latency.observe(); @@ -71,6 +71,12 @@ impl SnapshotsNamespace { let snapshot_files = snapshot_metadata.storage_logs_filepaths; let is_complete = snapshot_files.iter().all(Option::is_some); + if !is_complete { + // We don't return incomplete snapshots via API. + method_latency.observe(); + return Ok(None); + } + let chunks = snapshot_files .into_iter() .enumerate() @@ -104,7 +110,6 @@ impl SnapshotsNamespace { Ok(Some(SnapshotHeader { l1_batch_number: snapshot_metadata.l1_batch_number, miniblock_number, - is_complete, last_l1_batch_with_metadata: l1_batch_with_metadata, storage_logs_chunks: chunks, factory_deps_filepath: snapshot_metadata.factory_deps_filepath, diff --git a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs index 2e42e351677a..79fb0eb36651 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs @@ -76,22 +76,22 @@ impl HttpTest for SnapshotBasics { let all_snapshots = client.get_all_snapshots().await?; if self.is_complete_snapshot() { assert_eq!(all_snapshots.snapshots_l1_batch_numbers, [L1BatchNumber(1)]); - assert_eq!(all_snapshots.incomplete_snapshots_l1_batch_numbers, []); } else { assert_eq!(all_snapshots.snapshots_l1_batch_numbers, []); - assert_eq!( - all_snapshots.incomplete_snapshots_l1_batch_numbers, - [L1BatchNumber(1)] - ); } let snapshot_header = client .get_snapshot_by_l1_batch_number(L1BatchNumber(1)) - .await? - .context("no snapshot for L1 batch #1")?; + .await?; + let snapshot_header = if self.is_complete_snapshot() { + snapshot_header.context("no snapshot for L1 batch #1")? + } else { + assert!(snapshot_header.is_none()); + return Ok(()); + }; + assert_eq!(snapshot_header.l1_batch_number, L1BatchNumber(1)); assert_eq!(snapshot_header.miniblock_number, MiniblockNumber(1)); - assert_eq!(snapshot_header.is_complete, self.is_complete_snapshot()); assert_eq!( snapshot_header.factory_deps_filepath, "file:///factory_deps" From 5db83c08e7222cde414169ffe4e558bcbdacfbf7 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Wed, 20 Dec 2023 13:05:52 +0200 Subject: [PATCH 17/19] Extract `load_or_initialize_snapshot_progress()` --- core/bin/snapshots_creator/src/creator.rs | 64 +++++++++++++---------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/core/bin/snapshots_creator/src/creator.rs b/core/bin/snapshots_creator/src/creator.rs index 08b562c1a738..aad281821309 100644 --- a/core/bin/snapshots_creator/src/creator.rs +++ b/core/bin/snapshots_creator/src/creator.rs @@ -69,6 +69,12 @@ pub(crate) struct SnapshotCreator { } impl SnapshotCreator { + async fn connect_to_replica(&self) -> anyhow::Result> { + self.replica_pool + .access_storage_tagged("snapshots_creator") + .await + } + async fn process_storage_logs_single_chunk( &self, semaphore: &Semaphore, @@ -84,10 +90,7 @@ impl SnapshotCreator { } let hashed_keys_range = get_chunk_hashed_keys_range(chunk_id, chunk_count); - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; + let mut conn = self.connect_to_replica().await?; let latency = METRICS.storage_logs_processing_duration[&StorageChunkStage::LoadFromPostgres].start(); @@ -145,10 +148,7 @@ impl SnapshotCreator { miniblock_number: MiniblockNumber, l1_batch_number: L1BatchNumber, ) -> anyhow::Result { - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; + let mut conn = self.connect_to_replica().await?; tracing::info!("Loading factory deps from Postgres..."); let latency = @@ -183,7 +183,7 @@ impl SnapshotCreator { Ok(output_filepath) } - /// Returns `None` if the created snapshot would coincide with `latest_snapshot`. + /// Returns `Ok(None)` if the created snapshot would coincide with `latest_snapshot`. async fn initialize_snapshot_progress( config: &SnapshotsCreatorConfig, min_chunk_count: u64, @@ -224,17 +224,12 @@ impl SnapshotCreator { Ok(Some(SnapshotProgress::new(l1_batch_number, chunk_count))) } - pub async fn run( - self, - config: SnapshotsCreatorConfig, + /// Returns `Ok(None)` if a snapshot should not be created / resumed. + async fn load_or_initialize_snapshot_progress( + &self, + config: &SnapshotsCreatorConfig, min_chunk_count: u64, - ) -> anyhow::Result<()> { - let latency = METRICS.snapshot_generation_duration.start(); - - let mut conn = self - .replica_pool - .access_storage_tagged("snapshots_creator") - .await?; + ) -> anyhow::Result> { let mut master_conn = self .master_pool .access_storage_tagged("snapshots_creator") @@ -248,23 +243,34 @@ impl SnapshotCreator { let pending_snapshot = latest_snapshot .as_ref() .filter(|snapshot| !snapshot.is_complete()); - let progress = if let Some(snapshot) = pending_snapshot { - SnapshotProgress::from_existing_snapshot(snapshot) + if let Some(snapshot) = pending_snapshot { + Ok(Some(SnapshotProgress::from_existing_snapshot(snapshot))) } else { - let progress = Self::initialize_snapshot_progress( - &config, + Self::initialize_snapshot_progress( + config, min_chunk_count, latest_snapshot.as_ref(), - &mut conn, + &mut self.connect_to_replica().await?, ) - .await?; + .await + } + } - match progress { - Some(progress) => progress, - None => return Ok(()), - } + pub async fn run( + self, + config: SnapshotsCreatorConfig, + min_chunk_count: u64, + ) -> anyhow::Result<()> { + let latency = METRICS.snapshot_generation_duration.start(); + + let Some(progress) = self + .load_or_initialize_snapshot_progress(&config, min_chunk_count) + .await? + else { + return Ok(()); // No snapshot creation is necessary }; + let mut conn = self.connect_to_replica().await?; let (_, last_miniblock_number_in_batch) = conn .blocks_dal() .get_miniblock_range_of_l1_batch(progress.l1_batch_number) From 34808294a7f17e526b28d1d3e086ace0159c5463 Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Wed, 3 Jan 2024 13:39:39 +0200 Subject: [PATCH 18/19] Rename API test structs --- .../src/api_server/web3/tests/mod.rs | 24 +++++----- .../src/api_server/web3/tests/snapshots.rs | 12 ++--- .../src/api_server/web3/tests/ws.rs | 48 +++++++++---------- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/core/lib/zksync_core/src/api_server/web3/tests/mod.rs b/core/lib/zksync_core/src/api_server/web3/tests/mod.rs index 178392d324f2..3333e72faf54 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/mod.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/mod.rs @@ -300,10 +300,10 @@ async fn store_events( } #[derive(Debug)] -struct HttpServerBasics; +struct HttpServerBasicsTest; #[async_trait] -impl HttpTest for HttpServerBasics { +impl HttpTest for HttpServerBasicsTest { async fn test(&self, client: &HttpClient, _pool: &ConnectionPool) -> anyhow::Result<()> { let block_number = client.get_block_number().await?; assert_eq!(block_number, U64::from(0)); @@ -322,14 +322,14 @@ impl HttpTest for HttpServerBasics { #[tokio::test] async fn http_server_basics() { - test_http_server(HttpServerBasics).await; + test_http_server(HttpServerBasicsTest).await; } #[derive(Debug)] -struct BasicFilterChanges; +struct BasicFilterChangesTest; #[async_trait] -impl HttpTest for BasicFilterChanges { +impl HttpTest for BasicFilterChangesTest { async fn test(&self, client: &HttpClient, pool: &ConnectionPool) -> anyhow::Result<()> { let block_filter_id = client.new_block_filter().await?; let tx_filter_id = client.new_pending_transaction_filter().await?; @@ -370,14 +370,14 @@ impl HttpTest for BasicFilterChanges { #[tokio::test] async fn basic_filter_changes() { - test_http_server(BasicFilterChanges).await; + test_http_server(BasicFilterChangesTest).await; } #[derive(Debug)] -struct LogFilterChanges; +struct LogFilterChangesTest; #[async_trait] -impl HttpTest for LogFilterChanges { +impl HttpTest for LogFilterChangesTest { async fn test(&self, client: &HttpClient, pool: &ConnectionPool) -> anyhow::Result<()> { let all_logs_filter_id = client.new_filter(Filter::default()).await?; let address_filter = Filter { @@ -425,14 +425,14 @@ impl HttpTest for LogFilterChanges { #[tokio::test] async fn log_filter_changes() { - test_http_server(LogFilterChanges).await; + test_http_server(LogFilterChangesTest).await; } #[derive(Debug)] -struct LogFilterChangesWithBlockBoundaries; +struct LogFilterChangesWithBlockBoundariesTest; #[async_trait] -impl HttpTest for LogFilterChangesWithBlockBoundaries { +impl HttpTest for LogFilterChangesWithBlockBoundariesTest { async fn test(&self, client: &HttpClient, pool: &ConnectionPool) -> anyhow::Result<()> { let lower_bound_filter = Filter { from_block: Some(api::BlockNumber::Number(2.into())), @@ -522,5 +522,5 @@ impl HttpTest for LogFilterChangesWithBlockBoundaries { #[tokio::test] async fn log_filter_changes_with_block_boundaries() { - test_http_server(LogFilterChangesWithBlockBoundaries).await; + test_http_server(LogFilterChangesWithBlockBoundariesTest).await; } diff --git a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs index 79fb0eb36651..608f2845065f 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/snapshots.rs @@ -36,11 +36,11 @@ async fn seal_l1_batch( } #[derive(Debug)] -struct SnapshotBasics { +struct SnapshotBasicsTest { chunk_ids: HashSet, } -impl SnapshotBasics { +impl SnapshotBasicsTest { const CHUNK_COUNT: u64 = 5; fn new(chunk_ids: impl IntoIterator) -> Self { @@ -55,7 +55,7 @@ impl SnapshotBasics { } #[async_trait] -impl HttpTest for SnapshotBasics { +impl HttpTest for SnapshotBasicsTest { async fn test(&self, client: &HttpClient, pool: &ConnectionPool) -> anyhow::Result<()> { let mut storage = pool.access_storage().await.unwrap(); store_miniblock(&mut storage).await?; @@ -111,15 +111,15 @@ impl HttpTest for SnapshotBasics { #[tokio::test] async fn snapshot_without_chunks() { - test_http_server(SnapshotBasics::new([])).await; + test_http_server(SnapshotBasicsTest::new([])).await; } #[tokio::test] async fn snapshot_with_some_chunks() { - test_http_server(SnapshotBasics::new([0, 2, 4])).await; + test_http_server(SnapshotBasicsTest::new([0, 2, 4])).await; } #[tokio::test] async fn snapshot_with_all_chunks() { - test_http_server(SnapshotBasics::new(0..SnapshotBasics::CHUNK_COUNT)).await; + test_http_server(SnapshotBasicsTest::new(0..SnapshotBasicsTest::CHUNK_COUNT)).await; } diff --git a/core/lib/zksync_core/src/api_server/web3/tests/ws.rs b/core/lib/zksync_core/src/api_server/web3/tests/ws.rs index 5a58e72f8af4..509b1e194e7f 100644 --- a/core/lib/zksync_core/src/api_server/web3/tests/ws.rs +++ b/core/lib/zksync_core/src/api_server/web3/tests/ws.rs @@ -114,10 +114,10 @@ async fn test_ws_server(test: impl WsTest) { } #[derive(Debug)] -struct WsServerCanStart; +struct WsServerCanStartTest; #[async_trait] -impl WsTest for WsServerCanStart { +impl WsTest for WsServerCanStartTest { async fn test( &self, client: &WsClient, @@ -141,14 +141,14 @@ impl WsTest for WsServerCanStart { #[tokio::test] async fn ws_server_can_start() { - test_ws_server(WsServerCanStart).await; + test_ws_server(WsServerCanStartTest).await; } #[derive(Debug)] -struct BasicSubscriptions; +struct BasicSubscriptionsTest; #[async_trait] -impl WsTest for BasicSubscriptions { +impl WsTest for BasicSubscriptionsTest { async fn test( &self, client: &WsClient, @@ -194,11 +194,11 @@ impl WsTest for BasicSubscriptions { #[tokio::test] async fn basic_subscriptions() { - test_ws_server(BasicSubscriptions).await; + test_ws_server(BasicSubscriptionsTest).await; } #[derive(Debug)] -struct LogSubscriptions; +struct LogSubscriptionsTest; #[derive(Debug)] struct Subscriptions { @@ -249,7 +249,7 @@ impl Subscriptions { } #[async_trait] -impl WsTest for LogSubscriptions { +impl WsTest for LogSubscriptionsTest { async fn test( &self, client: &WsClient, @@ -315,14 +315,14 @@ async fn collect_logs( #[tokio::test] async fn log_subscriptions() { - test_ws_server(LogSubscriptions).await; + test_ws_server(LogSubscriptionsTest).await; } #[derive(Debug)] -struct LogSubscriptionsWithNewBlock; +struct LogSubscriptionsWithNewBlockTest; #[async_trait] -impl WsTest for LogSubscriptionsWithNewBlock { +impl WsTest for LogSubscriptionsWithNewBlockTest { async fn test( &self, client: &WsClient, @@ -363,14 +363,14 @@ impl WsTest for LogSubscriptionsWithNewBlock { #[tokio::test] async fn log_subscriptions_with_new_block() { - test_ws_server(LogSubscriptionsWithNewBlock).await; + test_ws_server(LogSubscriptionsWithNewBlockTest).await; } #[derive(Debug)] -struct LogSubscriptionsWithManyBlocks; +struct LogSubscriptionsWithManyBlocksTest; #[async_trait] -impl WsTest for LogSubscriptionsWithManyBlocks { +impl WsTest for LogSubscriptionsWithManyBlocksTest { async fn test( &self, client: &WsClient, @@ -409,14 +409,14 @@ impl WsTest for LogSubscriptionsWithManyBlocks { #[tokio::test] async fn log_subscriptions_with_many_new_blocks_at_once() { - test_ws_server(LogSubscriptionsWithManyBlocks).await; + test_ws_server(LogSubscriptionsWithManyBlocksTest).await; } #[derive(Debug)] -struct LogSubscriptionsWithDelay; +struct LogSubscriptionsWithDelayTest; #[async_trait] -impl WsTest for LogSubscriptionsWithDelay { +impl WsTest for LogSubscriptionsWithDelayTest { async fn test( &self, client: &WsClient, @@ -473,14 +473,14 @@ impl WsTest for LogSubscriptionsWithDelay { #[tokio::test] async fn log_subscriptions_with_delay() { - test_ws_server(LogSubscriptionsWithDelay).await; + test_ws_server(LogSubscriptionsWithDelayTest).await; } #[derive(Debug)] -struct RateLimiting; +struct RateLimitingTest; #[async_trait] -impl WsTest for RateLimiting { +impl WsTest for RateLimitingTest { async fn test( &self, client: &WsClient, @@ -510,14 +510,14 @@ impl WsTest for RateLimiting { #[tokio::test] async fn rate_limiting() { - test_ws_server(RateLimiting).await; + test_ws_server(RateLimitingTest).await; } #[derive(Debug)] -struct BatchGetsRateLimited; +struct BatchGetsRateLimitedTest; #[async_trait] -impl WsTest for BatchGetsRateLimited { +impl WsTest for BatchGetsRateLimitedTest { async fn test( &self, client: &WsClient, @@ -554,5 +554,5 @@ impl WsTest for BatchGetsRateLimited { #[tokio::test] async fn batch_rate_limiting() { - test_ws_server(BatchGetsRateLimited).await; + test_ws_server(BatchGetsRateLimitedTest).await; } From e160667c471eda51c9cb81607464814cadacd0eb Mon Sep 17 00:00:00 2001 From: Alex Ostrovski Date: Wed, 3 Jan 2024 17:59:02 +0200 Subject: [PATCH 19/19] Explain `load_or_initialize_snapshot_progress` shortcut --- core/bin/snapshots_creator/src/creator.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/bin/snapshots_creator/src/creator.rs b/core/bin/snapshots_creator/src/creator.rs index 161ab7fab761..da2ac930dbd4 100644 --- a/core/bin/snapshots_creator/src/creator.rs +++ b/core/bin/snapshots_creator/src/creator.rs @@ -267,7 +267,8 @@ impl SnapshotCreator { .load_or_initialize_snapshot_progress(&config, min_chunk_count) .await? else { - return Ok(()); // No snapshot creation is necessary + // No snapshot creation is necessary; a snapshot for the current L1 batch is already created + return Ok(()); }; let mut conn = self.connect_to_replica().await?;