RUST-1443 Ensure monitors close after server is removed from topology #744
@@ -101,25 +101,11 @@ impl Monitor {
         // We only go to sleep when using the polling protocol (i.e. server never returned a
         // topologyVersion) or when the most recent check failed.
         if self.topology_version.is_none() || !check_succeeded {
-            #[cfg(test)]
-            let min_frequency = self
-                .client_options
-                .test_options
-                .as_ref()
-                .and_then(|to| to.min_heartbeat_freq)
-                .unwrap_or(MIN_HEARTBEAT_FREQUENCY);
-
-            #[cfg(not(test))]
-            let min_frequency = MIN_HEARTBEAT_FREQUENCY;
-
-            tokio::select! {
-                _ = runtime::delay_for(min_frequency) => {},
-                _ = self.request_receiver.wait_for_server_close() => {
-                    break;
-                }
-            }
             self.request_receiver
-                .wait_for_check_request(heartbeat_frequency - min_frequency)
+                .wait_for_check_request(
+                    self.client_options.min_heartbeat_frequency(),
+                    heartbeat_frequency,
+                )
                 .await;
         }
     }

Review comment: "I moved the minimum delay into …"
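The replacement call above relies on a `ClientOptions::min_heartbeat_frequency()` accessor whose body is not part of this hunk. As a rough sketch only, assuming it simply centralizes the cfg(test)/cfg(not(test)) branching deleted from the monitor loop: the stand-in types below and the 500ms constant are illustrative (the real accessor presumably uses `#[cfg(test)]` attributes as the removed code did, rather than `cfg!`).

use std::time::Duration;

// Illustrative value; the driver's actual constant may differ.
const MIN_HEARTBEAT_FREQUENCY: Duration = Duration::from_millis(500);

#[derive(Default)]
struct TestOptions {
    min_heartbeat_freq: Option<Duration>,
}

#[derive(Default)]
struct ClientOptions {
    test_options: Option<TestOptions>,
}

impl ClientOptions {
    // One accessor replaces the per-call-site cfg branching: in test builds a per-test
    // override can shrink the minimum delay, otherwise the constant is used.
    fn min_heartbeat_frequency(&self) -> Duration {
        if cfg!(test) {
            self.test_options
                .as_ref()
                .and_then(|to| to.min_heartbeat_freq)
                .unwrap_or(MIN_HEARTBEAT_FREQUENCY)
        } else {
            MIN_HEARTBEAT_FREQUENCY
        }
    }
}

fn main() {
    let options = ClientOptions::default();
    assert_eq!(options.min_heartbeat_frequency(), MIN_HEARTBEAT_FREQUENCY);
}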
@@ -607,17 +593,28 @@ impl MonitorRequestReceiver {
         err
     }

-    /// Wait for a request to immediately check the server to come in, guarded by the provided
-    /// timeout. If a cancellation request is received indicating the topology has closed, this
-    /// method will return. All other cancellation requests will be ignored.
-    async fn wait_for_check_request(&mut self, timeout: Duration) {
+    /// Wait for a request to immediately check the server to be received, guarded by the provided
+    /// timeout. If the server associated with this monitor is removed from the topology, this
+    /// method will return.
+    ///
+    /// The `delay` parameter indicates how long this method should wait before listening to
+    /// requests. The time spent in the delay counts toward the provided timeout.
+    async fn wait_for_check_request(&mut self, delay: Duration, timeout: Duration) {
         let _ = runtime::timeout(timeout, async {
+            let wait_for_check_request = async {
+                runtime::delay_for(delay).await;
+                self.topology_check_request_receiver
+                    .wait_for_check_request()
+                    .await;
+            };
+            tokio::pin!(wait_for_check_request);
+
             loop {
                 tokio::select! {
                     _ = self.individual_check_request_receiver.changed() => {
                         break;
                     }
-                    _ = self.topology_check_request_receiver.wait_for_check_request() => {
+                    _ = &mut wait_for_check_request => {
                         break;
                     }
                     _ = self.handle_listener.wait_for_all_handle_drops() => {

Review comment: "we need the …"
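A standalone sketch of the pattern used above, built from plain tokio primitives (the channel names and durations are stand-ins, not the driver's types): the minimum delay and the check-request listener form one future that is pinned once and then polled by mutable reference inside select!, so the delay is not restarted on each loop iteration and still counts toward the single outer timeout.

use std::time::Duration;

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // Stand-ins for the monitor's channels: a counter of topology-wide check requests and a
    // signal that the server was removed from the topology.
    let (check_tx, mut check_rx) = watch::channel(0u32);
    let (_removed_tx, mut removed_rx) = watch::channel(());

    // Simulate an operation requesting an immediate check a little later.
    let requester = tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(200)).await;
        let _ = check_tx.send(1);
    });

    // The delay is part of the same future as the listener, so it is created exactly once and
    // the time it takes counts toward the outer timeout below.
    let wait_for_check_request = async {
        tokio::time::sleep(Duration::from_millis(50)).await;
        while *check_rx.borrow() == 0 {
            if check_rx.changed().await.is_err() {
                return;
            }
        }
    };
    // Pinning lets `select!` poll the same future by `&mut` reference on every loop iteration
    // instead of rebuilding (and thereby restarting) the delay each time around.
    tokio::pin!(wait_for_check_request);

    let result = tokio::time::timeout(Duration::from_millis(500), async {
        loop {
            tokio::select! {
                _ = &mut wait_for_check_request => break, // a check was requested
                _ = removed_rx.changed() => break,        // the server was removed
            }
        }
    })
    .await;

    requester.await.unwrap();
    assert!(result.is_ok(), "expected the check request to arrive before the timeout");
}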
@@ -633,11 +630,6 @@ impl MonitorRequestReceiver {
         self.cancellation_receiver.borrow_and_update();
     }

-    /// Wait until the server associated with this monitor has been closed.
-    async fn wait_for_server_close(&mut self) {
-        self.handle_listener.wait_for_all_handle_drops().await;
-    }
-
     fn is_alive(&self) -> bool {
         self.handle_listener.is_alive()
     }
@@ -1,4 +1,5 @@
 use std::{
+    collections::HashSet,
     sync::Arc,
     time::{Duration, Instant},
 };
@@ -8,8 +9,12 @@ use semver::VersionReq;
 use tokio::sync::{RwLockReadGuard, RwLockWriteGuard};

 use crate::{
+    client::options::{ClientOptions, ServerAddress},
+    cmap::RawCommandResponse,
     error::{Error, ErrorKind},
+    event::sdam::SdamEventHandler,
     hello::{LEGACY_HELLO_COMMAND_NAME, LEGACY_HELLO_COMMAND_NAME_LOWERCASE},
+    sdam::{ServerDescription, Topology},
     test::{
         log_uncaptured,
         CmapEvent,
@@ -269,3 +274,91 @@ async fn repl_set_name_mismatch() -> crate::error::Result<()> {

     Ok(())
 }
+
+/// Test verifying that a server's monitor stops after the server has been removed from the
+/// topology.
+#[cfg_attr(feature = "tokio-runtime", tokio::test(flavor = "multi_thread"))]
+#[cfg_attr(feature = "async-std-runtime", async_std::test)]
+async fn removed_server_monitor_stops() -> crate::error::Result<()> {
+    let _guard = LOCK.run_concurrently().await;
+
+    let handler = Arc::new(EventHandler::new());
+    let options = ClientOptions::builder()
+        .hosts(vec![
+            ServerAddress::parse("localhost:49152")?,
+            ServerAddress::parse("localhost:49153")?,
+            ServerAddress::parse("localhost:49154")?,
+        ])
+        .heartbeat_freq(Duration::from_millis(50))
+        .sdam_event_handler(handler.clone() as Arc<dyn SdamEventHandler>)
+        .repl_set_name("foo".to_string())
+        .build();
+
+    let hosts = options.hosts.clone();
+    let set_name = options.repl_set_name.clone().unwrap();
+
+    let mut subscriber = handler.subscribe();
+    let topology = Topology::new(options)?;
+
+    // Wait until all three monitors have started.
+    let mut seen_monitors = HashSet::new();
+    subscriber
+        .wait_for_event(Duration::from_millis(500), |event| {
+            if let Event::Sdam(SdamEvent::ServerHeartbeatStarted(e)) = event {
+                seen_monitors.insert(e.server_address.clone());
+            }
+            seen_monitors.len() == hosts.len()
+        })
+        .await
+        .expect("should see all three monitors start");
+
+    // Remove the third host from the topology.
+    let hello = doc! {
+        "ok": 1,
+        "isWritablePrimary": true,
+        "hosts": [
+            hosts[0].clone().to_string(),
+            hosts[1].clone().to_string(),
+        ],
+        "me": hosts[0].clone().to_string(),
+        "setName": set_name,
+        "maxBsonObjectSize": 1234,
+        "maxWriteBatchSize": 1234,
+        "maxMessageSizeBytes": 1234,
+        "minWireVersion": 0,
+        "maxWireVersion": 13,
+    };
+    let hello_reply = RawCommandResponse::with_document_and_address(hosts[0].clone(), hello)
+        .unwrap()
+        .into_hello_reply()
+        .unwrap();
+
+    topology
+        .clone_updater()
+        .update(ServerDescription::new_from_hello_reply(
+            hosts[0].clone(),
+            hello_reply,
+            Duration::from_millis(10),
+        ))
+        .await;
+
+    subscriber.wait_for_event(Duration::from_secs(1), |event| {
+        matches!(event, Event::Sdam(SdamEvent::ServerClosed(e)) if e.address == hosts[2])
+    }).await.expect("should see server closed event");
+
+    // Capture heartbeat events for 1 second. The monitor for the removed server should stop
+    // publishing them.
+    let events = subscriber.collect_events(Duration::from_secs(1), |event| {
+        matches!(event, Event::Sdam(SdamEvent::ServerHeartbeatStarted(e)) if e.server_address == hosts[2])
+    }).await;
+
+    // Use 3 to account for any heartbeats that happen to start between emitting the ServerClosed
+    // event and actually publishing the state with the closed server.
+    assert!(
+        events.len() < 3,
+        "expected monitor for removed server to stop performing checks, but saw {} heartbeats",
+        events.len()
+    );
+
+    Ok(())
+}

Review comment: "This test was copied pretty much as-is from #733."
@@ -1081,10 +1081,11 @@ pub(crate) struct TopologyCheckRequestReceiver {

 impl TopologyCheckRequestReceiver {
     pub(crate) async fn wait_for_check_request(&mut self) {
-        while self.receiver.changed().await.is_ok() {
-            if *self.receiver.borrow() > 0 {
-                break;
-            }
+        while *self.receiver.borrow() == 0 {
+            // If all the requesters hung up, then just return early.
+            if self.receiver.changed().await.is_err() {
+                return;
+            };
         }
     }
 }

Review comment: "This was a minor change I made to mitigate the case where the value hasn't changed but there are still operations requesting an update. I don't believe this case could actually happen with the existing implementation, but I think it's best to make that clear here."
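A self-contained illustration of the case the comment describes, using a bare tokio watch channel rather than the driver's receiver type (the setup is an assumption: a request was recorded and already marked as seen before the waiter arrived). With the old shape, `changed()` only fires on a fresh notification, so the outstanding request could be missed; checking `borrow()` first returns immediately.

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // A counter of outstanding check requests, as in the receiver above.
    let (tx, mut rx) = watch::channel(0u32);

    // A request is recorded, and an earlier wait has already marked the value as seen.
    tx.send(1).unwrap();
    let _ = rx.borrow_and_update();

    // Old shape -- `while rx.changed().await.is_ok() { if *rx.borrow() > 0 { break; } }` --
    // would now sit waiting for a *new* notification even though a request is outstanding.
    // New shape: inspect the current value first, and only then wait for changes.
    while *rx.borrow() == 0 {
        if rx.changed().await.is_err() {
            // All requesters hung up; stop waiting.
            return;
        }
    }
    println!("outstanding request observed without a fresh notification");
}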
Review comment: "This validation existed, but it was in the URI parsing stage. I've moved it to ClientOptions::validate to ensure it gets hit even when the options are constructed directly."
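A rough sketch of the arrangement the comment describes, under stated assumptions: the concrete rule being moved is not shown in this excerpt, so the heartbeat-frequency check below is only a hypothetical placeholder, and the parse/validate names are illustrative rather than the driver's real signatures. The point is simply that the check lives in a validate step that runs for directly constructed options as well as for parsed URIs.

use std::time::Duration;

const MIN_HEARTBEAT_FREQUENCY: Duration = Duration::from_millis(500);

#[derive(Default)]
struct ClientOptions {
    heartbeat_freq: Option<Duration>,
}

impl ClientOptions {
    // URI parsing funnels through the same validation as direct construction.
    fn parse(_uri: &str) -> Result<Self, String> {
        let options = ClientOptions::default(); // ...fields would be populated from the URI...
        options.validate()?;
        Ok(options)
    }

    fn validate(&self) -> Result<(), String> {
        // Hypothetical rule, for illustration only.
        if let Some(freq) = self.heartbeat_freq {
            if freq < MIN_HEARTBEAT_FREQUENCY {
                return Err(format!(
                    "heartbeatFrequencyMS must be at least {} ms",
                    MIN_HEARTBEAT_FREQUENCY.as_millis()
                ));
            }
        }
        Ok(())
    }
}

fn main() {
    // Directly constructed options hit the same check as parsed ones.
    let direct = ClientOptions {
        heartbeat_freq: Some(Duration::from_millis(10)),
    };
    assert!(direct.validate().is_err());
    assert!(ClientOptions::parse("mongodb://localhost:27017").is_ok());
}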