From bd54128413339c9e60a2e16dcd48545a50951a6f Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 21:43:02 +0200 Subject: [PATCH] Refactor how the closure used for clone to create the new process. --- src/container/builder_impl.rs | 24 ++++++--- src/process/fork.rs | 99 +++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 47 deletions(-) diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index d705a8da3..adf1c48c8 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -68,15 +68,23 @@ impl ContainerBuilderImpl { let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?; let mut child = child::ChildProcess::new(parent_channel)?; - let cb = Box::new(|| { + let init = self.init; + let rootless = self.rootless.clone(); + let spec = self.spec.clone(); + let syscall = self.syscall.clone(); + let rootfs = self.rootfs.clone(); + let console_socket = self.console_socket.clone(); + let notify_path = self.notify_path.clone(); + + let cb = Box::new(move || { if let Err(error) = container_init( - self.init, - self.rootless.clone(), - self.spec.clone(), - self.syscall.clone(), - self.rootfs.clone(), - self.console_socket.clone(), - self.notify_path.clone(), + init, + rootless, + spec, + syscall, + rootfs, + console_socket, + notify_path, &mut child, ) { log::debug!("failed to run container_init: {:?}", error); diff --git a/src/process/fork.rs b/src/process/fork.rs index c1c4ebbbd..b7a4a760e 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,26 +1,23 @@ +use anyhow::bail; use anyhow::Context; use anyhow::Result; -use libc::c_int; -use libc::c_void; use nix::errno::Errno; use nix::sched; use nix::sys; use nix::sys::mman; use nix::unistd::Pid; -use std::mem; use std::ptr; +// The clone callback is used in clone call. It is a boxed closure and it needs +// to trasfer the ownership of related memory to the new process. +type CloneCb = Box isize + Send>; + /// clone uses syscall clone(2) to create a new process for the container init /// process. Using clone syscall gives us better control over how to can create /// the new container process, where we can enter into namespaces directly instead /// of using unshare and fork. This call will only create one new process, instead /// of two using fork. -pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { - extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { - let cb: &mut sched::CloneCb = unsafe { &mut *data }; - (*cb)() as c_int - } - +pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result { // Use sysconf to find the page size. If there is an error, we assume // the default 4K page size. let page_size: usize = unsafe { @@ -60,39 +57,63 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

isize>) -> i32), - child_stack_top, - combined, - &mut cb as *mut _ as *mut c_void, - ) - }; - let pid = Errno::result(res).map(Pid::from_raw)?; + let combined = clone_flags.bits() | signal as libc::c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistant with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(cb)); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to a Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure passed in. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() as i32 } + } - Ok(pid) + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) }; + match res { + -1 => { + // Since the clone call failed, the closure passed in didn't get + // consumed. To complete the circle, we can safely box up the + // closure again and let rust manage this memory for us. + unsafe { drop(Box::from_raw(data)) }; + bail!( + "Failed clone to create new process: {:?}", + Errno::result(res) + ) + } + pid => Ok(Pid::from_raw(pid)), + } } #[cfg(test)] @@ -115,7 +136,7 @@ mod tests { // namespace is needed for the test to run without root let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER; let pid = super::clone( - Box::new(|| { + Box::new(move || { if cb().is_err() { return -1; }