-
Notifications
You must be signed in to change notification settings - Fork 997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle rapid deletion and recreation of subgraphs more gracefully #4044
Changes from all commits
1c34167
5ff71a3
82045fd
4c25b79
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -373,6 +373,20 @@ impl SubgraphStoreInner { | |
Ok(site) | ||
} | ||
|
||
fn evict(&self, id: &DeploymentHash) -> Result<(), StoreError> { | ||
if let Some((site, _)) = self.sites.remove(id) { | ||
let store = self.stores.get(&site.shard).ok_or_else(|| { | ||
constraint_violation!( | ||
"shard {} for deployment sgd{} not found when evicting", | ||
site.shard, | ||
site.id | ||
) | ||
})?; | ||
store.layout_cache.remove(&site); | ||
} | ||
Ok(()) | ||
} | ||
|
||
fn find_site(&self, id: DeploymentId) -> Result<Arc<Site>, StoreError> { | ||
if let Some(site) = self.sites.find(|site| site.id == id) { | ||
return Ok(site); | ||
|
@@ -499,6 +513,8 @@ impl SubgraphStoreInner { | |
#[cfg(not(debug_assertions))] | ||
assert!(!replace); | ||
|
||
self.evict(&schema.id)?; | ||
|
||
let graft_base = deployment | ||
.graft_base | ||
.as_ref() | ||
|
@@ -1264,6 +1280,18 @@ impl SubgraphStoreTrait for SubgraphStore { | |
Ok(writable) | ||
} | ||
|
||
async fn stop_subgraph(&self, loc: &DeploymentLocator) -> Result<(), StoreError> { | ||
self.evict(&loc.hash)?; | ||
|
||
// Remove the writable from the cache and stop it | ||
let deployment = loc.id.into(); | ||
let writable = self.writables.lock().unwrap().remove(&deployment); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I got it right docs right, this would only panic due to a programming error, so it is relatively safe to unwrap here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would panic if the lock is poisoned, i.e., if some other thread paniced while holding the lock. In that case, it's anybody's guess what's happening and panicing here is really the only thing we can do. |
||
match writable { | ||
Some(writable) => writable.stop().await, | ||
None => Ok(()), | ||
} | ||
} | ||
|
||
fn is_deployed(&self, id: &DeploymentHash) -> Result<bool, StoreError> { | ||
match self.site(id) { | ||
Ok(_) => Ok(true), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -400,6 +400,7 @@ impl BlockTracker { | |
self.revert = self.revert.min(block_ptr.number); | ||
self.block = self.block.min(block_ptr.number); | ||
} | ||
Request::Stop => { /* do nothing */ } | ||
} | ||
} | ||
|
||
|
@@ -438,10 +439,16 @@ enum Request { | |
block_ptr: BlockPtr, | ||
firehose_cursor: FirehoseCursor, | ||
}, | ||
Stop, | ||
} | ||
|
||
enum ExecResult { | ||
Continue, | ||
Stop, | ||
Comment on lines
+445
to
+447
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a suggestion, so feel free to ignore this comment. This enum somewhat similar to the ControlFlow from While we don't need to return any data out of this, maybe we could redefine it as type ExecResult = ControlFlow<()> I know this won't change anything, but it would make room if we want to return data from those contexts in the future. |
||
} | ||
|
||
impl Request { | ||
fn execute(&self) -> Result<(), StoreError> { | ||
fn execute(&self) -> Result<ExecResult, StoreError> { | ||
match self { | ||
Request::Write { | ||
store, | ||
|
@@ -453,21 +460,26 @@ impl Request { | |
deterministic_errors, | ||
manifest_idx_and_name, | ||
offchain_to_remove, | ||
} => store.transact_block_operations( | ||
block_ptr_to, | ||
firehose_cursor, | ||
mods, | ||
stopwatch, | ||
data_sources, | ||
deterministic_errors, | ||
manifest_idx_and_name, | ||
offchain_to_remove, | ||
), | ||
} => store | ||
.transact_block_operations( | ||
block_ptr_to, | ||
firehose_cursor, | ||
mods, | ||
stopwatch, | ||
data_sources, | ||
deterministic_errors, | ||
manifest_idx_and_name, | ||
offchain_to_remove, | ||
) | ||
.map(|()| ExecResult::Continue), | ||
Request::RevertTo { | ||
store, | ||
block_ptr, | ||
firehose_cursor, | ||
} => store.revert_block_operations(block_ptr.clone(), firehose_cursor), | ||
} => store | ||
.revert_block_operations(block_ptr.clone(), firehose_cursor) | ||
.map(|()| ExecResult::Continue), | ||
Request::Stop => return Ok(ExecResult::Stop), | ||
} | ||
} | ||
} | ||
|
@@ -556,12 +568,19 @@ impl Queue { | |
}; | ||
|
||
let _section = queue.stopwatch.start_section("queue_pop"); | ||
use ExecResult::*; | ||
match res { | ||
Ok(Ok(())) => { | ||
Ok(Ok(Continue)) => { | ||
// The request has been handled. It's now safe to remove it | ||
// from the queue | ||
queue.queue.pop().await; | ||
} | ||
Ok(Ok(Stop)) => { | ||
// Graceful shutdown. We also handled the request | ||
// successfully | ||
queue.queue.pop().await; | ||
return; | ||
} | ||
Ok(Err(e)) => { | ||
error!(logger, "Subgraph writer failed"; "error" => e.to_string()); | ||
queue.record_err(e); | ||
|
@@ -616,6 +635,10 @@ impl Queue { | |
self.check_err() | ||
} | ||
|
||
async fn stop(&self) -> Result<(), StoreError> { | ||
self.push(Request::Stop).await | ||
} | ||
|
||
fn check_err(&self) -> Result<(), StoreError> { | ||
if let Some(err) = self.write_err.lock().unwrap().take() { | ||
return Err(err); | ||
|
@@ -670,7 +693,7 @@ impl Queue { | |
None | ||
} | ||
} | ||
Request::RevertTo { .. } => None, | ||
Request::RevertTo { .. } | Request::Stop => None, | ||
} | ||
}); | ||
|
||
|
@@ -725,7 +748,7 @@ impl Queue { | |
} | ||
} | ||
} | ||
Request::RevertTo { .. } => { /* nothing to do */ } | ||
Request::RevertTo { .. } | Request::Stop => { /* nothing to do */ } | ||
} | ||
map | ||
}, | ||
|
@@ -779,7 +802,7 @@ impl Queue { | |
.collect(); | ||
} | ||
} | ||
Request::RevertTo { .. } => { /* nothing to do */ } | ||
Request::RevertTo { .. } | Request::Stop => { /* nothing to do */ } | ||
} | ||
dds | ||
}); | ||
|
@@ -925,6 +948,13 @@ impl Writer { | |
Writer::Async(queue) => queue.poisoned(), | ||
} | ||
} | ||
|
||
async fn stop(&self) -> Result<(), StoreError> { | ||
match self { | ||
Writer::Sync(_) => Ok(()), | ||
Writer::Async(queue) => queue.stop().await, | ||
} | ||
} | ||
} | ||
|
||
pub struct WritableStore { | ||
|
@@ -962,6 +992,10 @@ impl WritableStore { | |
pub(crate) fn poisoned(&self) -> bool { | ||
self.writer.poisoned() | ||
} | ||
|
||
pub(crate) async fn stop(&self) -> Result<(), StoreError> { | ||
self.writer.stop().await | ||
} | ||
} | ||
|
||
impl ReadStore for WritableStore { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the reason we call
evict
on deployment creation so we have more confidence that the cache will be cleared?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, to make sure that we don't have a stale entry for the same hash in the cache (which could happen if a previous deployment of the same hash was stopped and then redeployed) This is really redundant with the eviction in
stop_subgraph
but since it's a cheap operation seemed safer to do it in both places.