Spike: Pause compaction when out of sync #10392
Status: Closed

Commits (5):
6426c27  ZenGround0  signal chain in and out of sync to compaction workers
ac9db5c  ZenGround0  check yield before GC
36d274a  ZenGround0  Implement yield friendly online GC
7a4082c  ZenGround0  Stop swallowing errors
68f402d  Stebalien   fix: revert chain sync mutex to a regular lock
@@ -91,7 +91,35 @@ func (s *SplitStore) HeadChange(_, apply []*types.TipSet) error {
 	// Regardless, we put a mutex in HeadChange just to be safe

 	if !atomic.CompareAndSwapInt32(&s.compacting, 0, 1) {
-		// we are currently compacting -- protect the new tipset(s)
+		// we are currently compacting
+		// 1. Signal sync condition to yield compaction when out of sync and resume when in sync
+		timestamp := time.Unix(int64(curTs.MinTimestamp()), 0)
+		if CheckSyncGap && time.Since(timestamp) > SyncGapTime {
+			/* Chain out of sync */
+			if atomic.CompareAndSwapInt32(&s.outOfSync, 0, 1) {
+				// transition from in sync to out of sync
+				s.chainSyncMx.Lock()
+				s.chainSyncFinished = false
+				s.chainSyncMx.Unlock()
+			}
+			// already out of sync, no signaling necessary
+		}
+
+		// TODO: ok to use hysteresis with no transitions between 30s and 1m?
+		if time.Since(timestamp) < SyncWaitTime {
+			/* Chain in sync */
+			if atomic.CompareAndSwapInt32(&s.outOfSync, 0, 0) {
+				// already in sync, no signaling necessary
+			} else {
+				// transition from out of sync to in sync
+				s.chainSyncMx.Lock()
+				s.chainSyncFinished = true
+				s.chainSyncCond.Broadcast()
+				s.chainSyncMx.Unlock()
+			}
+		}
+
+		// 2. protect the new tipset(s)
 		s.protectTipSets(apply)
 		return nil
 	}

Review comment on the TODO above: When we're syncing, the head will only be taken when we get fully in sync, so this is probably fine (well, really the sync new head is only selected when we complete the previous sync, but the effect is similar).
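The hunk above implements a two-threshold hysteresis: the store is marked out of sync once the head falls more than SyncGapTime behind, and back in sync only once it is within SyncWaitTime, with no state change in the band between. Below is a minimal, self-contained sketch of that pattern; the syncGate type, threshold values, and the main driver are illustrative stand-ins, not lotus code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

const (
	syncGapTime  = time.Minute      // assumed stand-in for SyncGapTime
	syncWaitTime = 30 * time.Second // assumed stand-in for SyncWaitTime
)

// syncGate is an illustrative stand-in for the SplitStore fields in the hunk:
// an atomic out-of-sync flag plus a mutex/cond pair that parks workers.
type syncGate struct {
	outOfSync int32      // 1 while the chain lags; checked without the lock
	mx        sync.Mutex // guards finished
	cond      *sync.Cond
	finished  bool // true when parked workers may proceed
}

func newSyncGate() *syncGate {
	g := &syncGate{finished: true}
	g.cond = sync.NewCond(&g.mx)
	return g
}

// headChange mirrors the hysteresis above: out of sync past syncGapTime,
// back in sync under syncWaitTime, no transition in the band between.
func (g *syncGate) headChange(headAge time.Duration) {
	if headAge > syncGapTime && atomic.CompareAndSwapInt32(&g.outOfSync, 0, 1) {
		g.mx.Lock()
		g.finished = false
		g.mx.Unlock()
	}
	if headAge < syncWaitTime && atomic.CompareAndSwapInt32(&g.outOfSync, 1, 0) {
		g.mx.Lock()
		g.finished = true
		g.cond.Broadcast() // wake every worker parked in waitForSync
		g.mx.Unlock()
	}
}

// waitForSync parks the caller until the chain is back in sync.
func (g *syncGate) waitForSync() {
	if atomic.LoadInt32(&g.outOfSync) == 0 {
		return // fast path: no lock taken while in sync
	}
	g.mx.Lock()
	defer g.mx.Unlock()
	for !g.finished {
		g.cond.Wait()
	}
}

func main() {
	g := newSyncGate()
	g.headChange(2 * time.Minute) // head is 2m old: fall out of sync
	go func() {
		time.Sleep(50 * time.Millisecond)
		g.headChange(10 * time.Second) // head caught up: broadcast resume
	}()
	g.waitForSync()
	fmt.Println("compaction worker resumed")
}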
@@ -427,7 +455,7 @@ func (s *SplitStore) protectTxnRefs(markSet MarkSet) error {
 // transactionally protect a reference by walking the object and marking.
 // concurrent markings are short circuited by checking the markset.
 func (s *SplitStore) doTxnProtect(root cid.Cid, markSet MarkSet) (int64, error) {
-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return 0, err
 	}

@@ -545,7 +573,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 	}
 	defer coldSet.Close() //nolint:errcheck

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -617,7 +645,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {

 	log.Infow("marking done", "took", time.Since(startMark), "marked", *count)

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -627,7 +655,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 		return xerrors.Errorf("error protecting transactional refs: %w", err)
 	}

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -704,7 +732,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 	stats.Record(s.ctx, metrics.SplitstoreCompactionHot.M(hotCnt))
 	stats.Record(s.ctx, metrics.SplitstoreCompactionCold.M(coldCnt))

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -713,7 +741,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 	// possibly delete objects we didn't have when we were collecting cold objects)
 	s.waitForMissingRefs(markSet)

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -733,7 +761,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 	}
 	log.Infow("moving done", "took", time.Since(startMove))

-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return err
 	}

@@ -764,7 +792,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
 	}

 	// wait for the head to catch up so that the current tipset is marked
-	s.waitForSync()
+	s.waitForTxnSync()

 	if err := s.checkClosing(); err != nil {
 		return err
@@ -865,7 +893,7 @@ func (s *SplitStore) beginCriticalSection(markSet MarkSet) error {
 	return nil
 }

-func (s *SplitStore) waitForSync() {
+func (s *SplitStore) waitForTxnSync() {
 	log.Info("waiting for sync")
 	if !CheckSyncGap {
 		log.Warnf("If you see this outside of test it is a serious splitstore issue")
@@ -884,6 +912,25 @@ func (s *SplitStore) waitForSync() {
 	}
 }

+// Block compaction operations if chain sync has fallen behind
+func (s *SplitStore) waitForSync() {
+	if atomic.LoadInt32(&s.outOfSync) == 0 {
+		return
+	}
+	s.chainSyncMx.Lock()
+	defer s.chainSyncMx.Unlock()
+
+	for !s.chainSyncFinished {
+		s.chainSyncCond.Wait()
+	}
+}
+
+// Combined sync and closing check
+func (s *SplitStore) checkYield() error {
+	s.waitForSync()
+	return s.checkClosing()
+}
+
 func (s *SplitStore) endTxnProtect() {
 	s.txnLk.Lock()
 	defer s.txnLk.Unlock()
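Taken together, waitForTxnSync (the head catching up to the compaction transaction) and the new waitForSync/checkYield pair (parking while the chain is out of sync) give the long-running loops in this diff a single yield point. A hedged sketch of the composition follows, reusing the syncGate type from the earlier sketch; the closing channel and the work loop are invented stand-ins, since lotus tracks shutdown differently.

package main

import (
	"errors"
	"fmt"
)

var errClosing = errors.New("splitstore is closing")

// store is an illustrative stand-in for SplitStore; the channel just makes
// the sketch self-contained.
type store struct {
	gate    *syncGate // syncGate is defined in the previous sketch
	closing chan struct{}
}

func (s *store) checkClosing() error {
	select {
	case <-s.closing:
		return errClosing
	default:
		return nil
	}
}

// checkYield mirrors the hunk: first park while out of sync, then report
// shutdown, so callers keep a single error check per iteration.
func (s *store) checkYield() error {
	s.gate.waitForSync()
	return s.checkClosing()
}

func main() {
	s := &store{gate: newSyncGate(), closing: make(chan struct{})}
	for i := 0; i < 3; i++ {
		// the walk/move loops in this diff swap checkClosing for this call
		if err := s.checkYield(); err != nil {
			fmt.Println("aborting compaction:", err)
			return
		}
		fmt.Println("unit of work", i)
	}
}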
@@ -1037,7 +1084,7 @@ func (s *SplitStore) walkChain(ts *types.TipSet, inclState, inclMsgs abi.ChainEp

 	for len(toWalk) > 0 {
 		// walking can take a while, so check this with every opportunity
-		if err := s.checkClosing(); err != nil {
+		if err := s.checkYield(); err != nil {
 			return err
 		}

@@ -1106,7 +1153,7 @@ func (s *SplitStore) walkObject(c cid.Cid, visitor ObjectVisitor, f func(cid.Cid
 	}

 	// check this before recursing
-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return 0, err
 	}

@@ -1175,7 +1222,7 @@ func (s *SplitStore) walkObjectIncomplete(c cid.Cid, visitor ObjectVisitor, f, m
 	}

 	// check this before recursing
-	if err := s.checkClosing(); err != nil {
+	if err := s.checkYield(); err != nil {
 		return sz, err
 	}

@@ -1262,7 +1309,7 @@ func (s *SplitStore) moveColdBlocks(coldr *ColdSetReader) error {
 	batch := make([]blocks.Block, 0, batchSize)

 	err := coldr.ForEach(func(c cid.Cid) error {
-		if err := s.checkClosing(); err != nil {
+		if err := s.checkYield(); err != nil {
 			return err
 		}
 		blk, err := s.hot.Get(s.ctx, c)
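moveColdBlocks shows the same yield point inside an iteration callback: every cold object is yield-checked before being read and appended to a fixed-size batch. A toy sketch of that shape; the string CIDs, batch size, and flush printouts are stand-ins for the real ColdSetReader and blockstore APIs.

package main

import "fmt"

const batchSize = 2 // illustrative; the real batch is much larger

// moveCold mimics the coldr.ForEach body: yield-check per object, then
// accumulate into a batch and flush when full.
func moveCold(cold []string, yield func() error) error {
	batch := make([]string, 0, batchSize)
	for _, c := range cold {
		if err := yield(); err != nil { // checkYield equivalent
			return err
		}
		batch = append(batch, c) // stand-in for s.hot.Get + append
		if len(batch) == batchSize {
			fmt.Println("flush batch:", batch) // stand-in for cold-store Put
			batch = batch[:0]
		}
	}
	if len(batch) > 0 {
		fmt.Println("flush final batch:", batch)
	}
	return nil
}

func main() {
	_ = moveCold([]string{"b1", "b2", "b3"}, func() error { return nil })
}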
Review discussion on the timing constants:

Comment: Do we have confidence in this number? Did we check that it's "good enough" on mainnet?

Reply: Need to check on this. There's some evidence that it might be too high right now, resulting in the chain getting out of sync. An alternate strategy is to leave it as is and see whether chain sync issues persist in correlation with moving GC.
@vyzo, for reference, do you have an estimate of a reasonable time for moving GC to complete today?
cc @TippyFlitsUK, you might have an even better idea.

Reply: Maybe 5-10 min.

Reply: Let's take a fresh measurement?

Reply: For reference, I got 50m on my fairly well-resourced mainnet node.

Reply: ^ with the current default of half the CPUs. I'll measure with 8 and compare when I get to similar garbage levels.

Reply: Working without major issues on mainnet (cc @TippyFlitsUK), and this seems to be preventing out-of-sync during moving GC.

Reply: One last thing to note: I'm suspicious that reducing goroutines here is the main thing causing the improvement, since moving GC is anecdotally what causes chain sync to get left behind, and we are not doing any yielding during the move (yet) in this change. If we are concerned about the risk of compaction deadlock, we could just include the simple goroutine limiting of moving GC.
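For concreteness, a hedged sketch of the "simple goroutine limiting" idea floated in that last comment: bound how many copy workers moving GC runs at once with a buffered-channel semaphore. maxWorkers and the copy body are invented stand-ins; only the bounding technique is the point.

package main

import (
	"fmt"
	"sync"
)

const maxWorkers = 8 // the thread above discusses measuring with 8 workers

// copyAll bounds moving-GC copy concurrency with a counting semaphore so
// that chain sync keeps enough CPU to stay in sync.
func copyAll(objs []string) {
	sem := make(chan struct{}, maxWorkers)
	var wg sync.WaitGroup
	for _, o := range objs {
		wg.Add(1)
		sem <- struct{}{} // blocks while maxWorkers copies are in flight
		go func(o string) {
			defer wg.Done()
			defer func() { <-sem }() // free the slot for the next object
			fmt.Println("copying", o) // stand-in for the real block copy
		}(o)
	}
	wg.Wait()
}

func main() {
	copyAll([]string{"obj-a", "obj-b", "obj-c", "obj-d"})
}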