Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Fix rare file close deadlock. #1009

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Check [Keep a Changelog](http://keepachangelog.com/) for recommendations on how
since `/etc/resolv.conf` and `/etc/hosts` were in the local read override,
leading to use the local nameserver for resolving. Fixes [#989](https://github.com/metalbear-co/mirrord/issues/989)
- mirrord-agent: Infinite reading a file when using `fgets`/`read_line` due to bug seeking to start of file.
- Rare deadlock on file close that caused the e2e file-ops test to sometimes fail
([#994](https://github.com/metalbear-co/mirrord/issues/994)).

## 3.21.0

Expand Down
22 changes: 15 additions & 7 deletions mirrord/layer/src/file/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,17 @@ impl RemoteFile {

impl Drop for RemoteFile {
fn drop(&mut self) {
trace!("dropping RemoteFile {self:?}");
t4lz marked this conversation as resolved.
Show resolved Hide resolved
// Warning: Don't log from here. This is called when self is removed from OPEN_FILES, so
// during the whole execution of this function, OPEN_FILES is locked.
// When emitting logs, sometimes a file `write` operation is required, in order for the
// operation to complete. The write operation is hooked and at some point tries to lock
// `OPEN_FILES`, which means the thread deadlocks with itself (we call
// `OPEN_FILES.lock()?.remove()` and then while still locked, `OPEN_FILES.lock()` again)
let closing_file = Close { fd: self.fd };

if let Err(err) = blocking_send_file_message(FileOperation::Close(closing_file)) {
warn!("Failed to send close file {self:?} message: {err:?}");
};
blocking_send_file_message(FileOperation::Close(closing_file)).expect(
"mirrord failed to send close file message to main layer thread. Error: {err:?}",
);
}
}

Expand Down Expand Up @@ -242,8 +247,11 @@ pub(crate) fn fdopendir(fd: RawFd) -> Detour<usize> {
// usize == ptr size
// we don't return a pointer to an address that contains DIR

let mut open_files = OPEN_FILES.lock()?;
let remote_file_fd = open_files.get(&fd).ok_or(Bypass::LocalFdNotFound(fd))?.fd;
let remote_file_fd = OPEN_FILES
.lock()?
.get(&fd)
.ok_or(Bypass::LocalFdNotFound(fd))?
.fd;
t4lz marked this conversation as resolved.
Show resolved Hide resolved

let (dir_channel_tx, dir_channel_rx) = oneshot::channel();

Expand All @@ -263,7 +271,7 @@ pub(crate) fn fdopendir(fd: RawFd) -> Detour<usize> {
.insert(local_dir_fd as usize, remote_dir_fd);

// According to docs, when using fdopendir, the fd is now managed by OS - i.e closed
open_files.remove(&fd);
OPEN_FILES.lock()?.remove(&fd);
t4lz marked this conversation as resolved.
Show resolved Hide resolved

Detour::Success(local_dir_fd as usize)
}
Expand Down
11 changes: 6 additions & 5 deletions mirrord/layer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -787,11 +787,10 @@ pub(crate) fn close_layer_fd(fd: c_int) {
.is_active();

// Remove from sockets, or if not a socket, remove from files if file mode active
SOCKETS.lock().unwrap().remove(&fd).is_none().then(|| {
if file_mode_active {
OPEN_FILES.lock().unwrap().remove(&fd);
}
});
if SOCKETS.lock().unwrap().remove(&fd).is_none() && file_mode_active {
// SOCKETS lock dropped, not holding it while locking OPEN_FILES.
OPEN_FILES.lock().unwrap().remove(&fd);
}
t4lz marked this conversation as resolved.
Show resolved Hide resolved
}

// TODO: When this is annotated with `hook_guard_fn`, then the outgoing sockets never call it (we
Expand Down Expand Up @@ -831,6 +830,8 @@ pub(crate) unsafe extern "C" fn close_nocancel_detour(fd: c_int) -> c_int {
#[cfg(target_os = "linux")]
#[hook_guard_fn]
pub(crate) unsafe extern "C" fn uv_fs_close(a: usize, b: usize, fd: c_int, c: usize) -> c_int {
// In this case we call `close_layer_fd` before the original close function, because execution
// does not return to here after calling `FN_UV_FS_CLOSE`.
close_layer_fd(fd);
FN_UV_FS_CLOSE(a, b, fd, c)
}
Expand Down
15 changes: 8 additions & 7 deletions mirrord/layer/src/socket/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,7 @@ pub(super) fn socket(domain: c_int, type_: c_int, protocol: c_int) -> Detour<Raw
kind: socket_kind,
};

let mut sockets = SOCKETS.lock()?;
sockets.insert(socket_fd, Arc::new(new_socket));
SOCKETS.lock()?.insert(socket_fd, Arc::new(new_socket));

Detour::Success(socket_fd)
}
Expand Down Expand Up @@ -504,11 +503,13 @@ pub(super) fn fcntl(orig_fd: c_int, cmd: c_int, fcntl_fd: i32) -> Result<(), Hoo

#[tracing::instrument(level = "trace")]
pub(super) fn dup(fd: c_int, dup_fd: i32) -> Result<(), HookError> {
let mut sockets = SOCKETS.lock()?;
if let Some(socket) = sockets.get(&fd).cloned() {
sockets.insert(dup_fd as RawFd, socket);
return Ok(());
}
{
let mut sockets = SOCKETS.lock()?;
if let Some(socket) = sockets.get(&fd).cloned() {
sockets.insert(dup_fd as RawFd, socket);
return Ok(());
}
} // Drop sockets, free Mutex.

let mut files = OPEN_FILES.lock()?;
if let Some(file) = files.get(&fd).cloned() {
Expand Down
2 changes: 1 addition & 1 deletion tests/src/file_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ mod file_ops {
FileOps::Python,
FileOps::Go18,
FileOps::Go19,
// FileOps::Go20,
FileOps::Go20,
FileOps::Rust
)]
ops: FileOps,
Expand Down