From 7a83d63fe07af65dd340e4dfd11f53c5344e7cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Szil=C3=A1gyi?= Date: Fri, 15 Sep 2023 15:06:25 +0300 Subject: [PATCH 1/2] eth/downloader: prevent pivot moves after state commit (#28126) --- eth/downloader/downloader.go | 37 +++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/eth/downloader/downloader.go b/eth/downloader/downloader.go index a0cc2fd34..fc588dd4a 100644 --- a/eth/downloader/downloader.go +++ b/eth/downloader/downloader.go @@ -1777,17 +1777,30 @@ func (d *Downloader) processFastSyncContent() error { // To cater for moving pivot points, track the pivot block and subsequently // accumulated download results separately. + // + // These will be nil up to the point where we reach the pivot, and will only + // be set temporarily if the synced blocks are piling up, but the pivot is + // still busy downloading. In that case, we need to occasionally check for + // pivot moves, so need to unblock the loop. These fields will accumulate + // the results in the meantime. + // + // Note, there's no issue with memory piling up since after 64 blocks the + // pivot will forcefully move so these accumulators will be dropped. var ( oldPivot *fetchResult // Locked in pivot block, might change eventually oldTail []*fetchResult // Downloaded content after the pivot ) for { - // Wait for the next batch of downloaded data to be available, and if the pivot - // block became stale, move the goalpost - results := d.queue.Results(oldPivot == nil) // Block if we're not monitoring pivot staleness + // Wait for the next batch of downloaded data to be available. If we have + // not yet reached the pivot point, wait blockingly as there's no need to + // spin-loop check for pivot moves. If we reached the pivot but have not + // yet processed it, check for results async, so we might notice pivot + // moves while state syncing. If the pivot was passed fully, block again + // as there's no more reason to check for pivot moves at all. + results := d.queue.Results(oldPivot == nil) if len(results) == 0 { // If pivot sync is done, stop - if oldPivot == nil { + if atomic.LoadInt32(&d.committed) == 1 { return sync.Cancel() } // If sync failed, stop @@ -1807,21 +1820,23 @@ func (d *Downloader) processFastSyncContent() error { pivot := d.pivotHeader d.pivotLock.RUnlock() - if oldPivot == nil { - if pivot.Root != sync.root { - sync.Cancel() - sync = d.syncState(pivot.Root) + if oldPivot == nil { // no results piling up, we can move the pivot + if atomic.LoadInt32(&d.committed) == 0 { // not yet passed the pivot, we can move the pivot + if pivot.Root != sync.root { // pivot position changed, we can move the pivot + sync.Cancel() + sync = d.syncState(pivot.Root) - go closeOnErr(sync) + go closeOnErr(sync) + } } - } else { + } else { // results already piled up, consume before handling pivot move results = append(append([]*fetchResult{oldPivot}, oldTail...), results...) } // Split around the pivot block and process the two sides via fast/full sync if atomic.LoadInt32(&d.committed) == 0 { latest := results[len(results)-1].Header // If the height is above the pivot block by 2 sets, it means the pivot - // become stale in the network and it was garbage collected, move to a + // become stale in the network, and it was garbage collected, move to a // new pivot. // // Note, we have `reorgProtHeaderDelay` number of blocks withheld, Those From 095cef6f9295db2f7588b4f84c888535d7a6178e Mon Sep 17 00:00:00 2001 From: rjl493456442 Date: Sun, 17 Sep 2023 22:35:09 +0800 Subject: [PATCH 2/2] core, eth/downloader: fix genesis state missing due to state sync (#28124) * core: fix chain repair corner case in path-based scheme * eth/downloader: disable trie database whenever state sync is launched --- core/blockchain.go | 98 ++++++++++++++++++++---------------- eth/downloader/downloader.go | 7 +++ 2 files changed, 62 insertions(+), 43 deletions(-) diff --git a/core/blockchain.go b/core/blockchain.go index 632a679a0..d1ec0cedc 100644 --- a/core/blockchain.go +++ b/core/blockchain.go @@ -372,28 +372,38 @@ func NewBlockChain(db ethdb.Database, cacheConfig *CacheConfig, genesis *Genesis // Make sure the state associated with the block is available head := bc.CurrentBlock() if !bc.HasState(head.Root()) { - // Head state is missing, before the state recovery, find out the - // disk layer point of snapshot(if it's enabled). Make sure the - // rewound point is lower than disk layer. - var diskRoot common.Hash - if bc.cacheConfig.SnapshotLimit > 0 { - diskRoot = rawdb.ReadSnapshotRoot(bc.db) - } - if diskRoot != (common.Hash{}) { - log.Warn("Head state missing, repairing", "number", head.Number(), "hash", head.Hash(), "snaproot", diskRoot) - - snapDisk, err := bc.setHeadBeyondRoot(head.NumberU64(), diskRoot, true) - if err != nil { - return nil, err - } - // Chain rewound, persist old snapshot number to indicate recovery procedure - if snapDisk != 0 { - rawdb.WriteSnapshotRecoveryNumber(bc.db, snapDisk) - } + if head.NumberU64() == 0 { + // The genesis state is missing, which is only possible in the path-based + // scheme. This situation occurs when the state syncer overwrites it. + // + // The solution is to reset the state to the genesis state. Although it may not + // match the sync target, the state healer will later address and correct any + // inconsistencies. + bc.resetState() } else { - log.Warn("Head state missing, repairing", "number", head.Number(), "hash", head.Hash()) - if _, err := bc.setHeadBeyondRoot(head.NumberU64(), common.Hash{}, true); err != nil { - return nil, err + // Head state is missing, before the state recovery, find out the + // disk layer point of snapshot(if it's enabled). Make sure the + // rewound point is lower than disk layer. + var diskRoot common.Hash + if bc.cacheConfig.SnapshotLimit > 0 { + diskRoot = rawdb.ReadSnapshotRoot(bc.db) + } + if diskRoot != (common.Hash{}) { + log.Warn("Head state missing, repairing", "number", head.Number, "hash", head.Hash(), "snaproot", diskRoot) + + snapDisk, err := bc.setHeadBeyondRoot(head.NumberU64(), diskRoot, true) + if err != nil { + return nil, err + } + // Chain rewound, persist old snapshot number to indicate recovery procedure + if snapDisk != 0 { + rawdb.WriteSnapshotRecoveryNumber(bc.db, snapDisk) + } + } else { + log.Warn("Head state missing, repairing", "number", head.Number, "hash", head.Hash()) + if _, err := bc.setHeadBeyondRoot(head.NumberU64(), common.Hash{}, true); err != nil { + return nil, err + } } } } @@ -665,6 +675,28 @@ func (bc *BlockChain) SetHead(head uint64) error { return err } +// resetState resets the persistent state to genesis state if it's not present. +func (bc *BlockChain) resetState() { + // Short circuit if the genesis state is already present. + root := bc.genesisBlock.Root() + if bc.HasState(root) { + return + } + // Reset the state database to empty for committing genesis state. + // Note, it should only happen in path-based scheme and Reset function + // is also only call-able in this mode. + if bc.triedb.Scheme() == rawdb.PathScheme { + if err := bc.triedb.Reset(types.EmptyRootHash); err != nil { + log.Crit("Failed to clean state", "err", err) // Shouldn't happen + } + } + // Write genesis state into database. + if err := CommitGenesisState(bc.db, bc.triedb, bc.genesisBlock.Hash()); err != nil { + log.Crit("Failed to commit genesis state", "err", err) + } + log.Info("Reset state to genesis", "root", root) +} + // setHeadBeyondRoot rewinds the local chain to a new head with the extra condition // that the rewind must pass the specified state root. This method is meant to be // used when rewinding with snapshots enabled to ensure that we go back further than @@ -687,26 +719,6 @@ func (bc *BlockChain) setHeadBeyondRoot(head uint64, root common.Hash, repair bo pivot := rawdb.ReadLastPivotNumber(bc.db) frozen, _ := bc.db.Ancients() - // resetState resets the persistent state to genesis if it's not available. - resetState := func() { - // Short circuit if the genesis state is already present. - if bc.HasState(bc.genesisBlock.Root()) { - return - } - // Reset the state database to empty for committing genesis state. - // Note, it should only happen in path-based scheme and Reset function - // is also only call-able in this mode. - if bc.triedb.Scheme() == rawdb.PathScheme { - if err := bc.triedb.Reset(types.EmptyRootHash); err != nil { - log.Crit("Failed to clean state", "err", err) // Shouldn't happen - } - } - // Write genesis state into database. - if err := CommitGenesisState(bc.db, bc.triedb, bc.genesisBlock.Hash()); err != nil { - log.Crit("Failed to commit genesis state", "err", err) - } - } - updateFn := func(db ethdb.KeyValueWriter, header *types.Header) (uint64, bool) { // Rewind the blockchain, ensuring we don't end up with a stateless head // block. Note, depth equality is permitted to allow using SetHead as a @@ -716,7 +728,7 @@ func (bc *BlockChain) setHeadBeyondRoot(head uint64, root common.Hash, repair bo if newHeadBlock == nil { log.Error("Gap in the chain, rewinding to genesis", "number", header.Number, "hash", header.Hash()) newHeadBlock = bc.genesisBlock - resetState() + bc.resetState() } else { // Block exists, keep rewinding until we find one with state, // keeping rewinding until we exceed the optional threshold @@ -747,7 +759,7 @@ func (bc *BlockChain) setHeadBeyondRoot(head uint64, root common.Hash, repair bo } if beyondRoot || newHeadBlock.NumberU64() == 0 { if newHeadBlock.NumberU64() == 0 { - resetState() + bc.resetState() } else if !bc.HasState(newHeadBlock.Root()) { // Rewind to a block with recoverable state. If the state is // missing, run the state recovery here. diff --git a/eth/downloader/downloader.go b/eth/downloader/downloader.go index fc588dd4a..345c6d98f 100644 --- a/eth/downloader/downloader.go +++ b/eth/downloader/downloader.go @@ -383,6 +383,13 @@ func (d *Downloader) synchronise(id string, hash common.Hash, td *big.Int, mode // but until snap becomes prevalent, we should support both. TODO(karalabe). if mode == SnapSync { if !d.snapSync { + // Snap sync will directly modify the persistent state, making the entire + // trie database unusable until the state is fully synced. To prevent any + // subsequent state reads, explicitly disable the trie database and state + // syncer is responsible to address and correct any state missing. + if d.blockchain.TrieDB().Scheme() == rawdb.PathScheme { + d.blockchain.TrieDB().Reset(types.EmptyRootHash) + } // Snap sync uses the snapshot namespace to store potentially flakey data until // sync completely heals and finishes. Pause snapshot maintenance in the mean // time to prevent access.