Refactor how the closure used for clone to create the new process.

yihuaf · Jul 30, 2021 · bd54128 · bd54128
1 parent 218c7a8
commit bd54128
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 47 deletions.
diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs
@@ -68,15 +68,23 @@ impl ContainerBuilderImpl {
         let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?;
         let mut child = child::ChildProcess::new(parent_channel)?;
 
-        let cb = Box::new(|| {
+        let init = self.init;
+        let rootless = self.rootless.clone();
+        let spec = self.spec.clone();
+        let syscall = self.syscall.clone();
+        let rootfs = self.rootfs.clone();
+        let console_socket = self.console_socket.clone();
+        let notify_path = self.notify_path.clone();
+
+        let cb = Box::new(move || {
             if let Err(error) = container_init(
-                self.init,
-                self.rootless.clone(),
-                self.spec.clone(),
-                self.syscall.clone(),
-                self.rootfs.clone(),
-                self.console_socket.clone(),
-                self.notify_path.clone(),
+                init,
+                rootless,
+                spec,
+                syscall,
+                rootfs,
+                console_socket,
+                notify_path,
                 &mut child,
             ) {
                 log::debug!("failed to run container_init: {:?}", error);

diff --git a/src/process/fork.rs b/src/process/fork.rs
@@ -1,26 +1,23 @@
+use anyhow::bail;
 use anyhow::Context;
 use anyhow::Result;
-use libc::c_int;
-use libc::c_void;
 use nix::errno::Errno;
 use nix::sched;
 use nix::sys;
 use nix::sys::mman;
 use nix::unistd::Pid;
-use std::mem;
 use std::ptr;
 
+// The clone callback is used in clone call. It is a boxed closure and it needs
+// to trasfer the ownership of related memory to the new process.
+type CloneCb = Box<dyn FnOnce() -> isize + Send>;
+
 /// clone uses syscall clone(2) to create a new process for the container init
 /// process. Using clone syscall gives us better control over how to can create
 /// the new container process, where we can enter into namespaces directly instead
 /// of using unshare and fork. This call will only create one new process, instead
 /// of two using fork.
-pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
-    extern "C" fn callback(data: *mut sched::CloneCb) -> c_int {
-        let cb: &mut sched::CloneCb = unsafe { &mut *data };
-        (*cb)() as c_int
-    }
-
+pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
     // Use sysconf to find the page size. If there is an error, we assume
     // the default 4K page size.
     let page_size: usize = unsafe {
@@ -60,39 +57,63 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<P
             0,
         )?
     };
+    // Consistant with how pthread_create sets up the stack, we create a
+    // guard page of 1 page, to protect the child stack collision. Note, for
+    // clone call, the child stack will grow downward, so the bottom of the
+    // child stack is in the beginning.
+    unsafe {
+        mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
+            .with_context(|| "Failed to create guard page")?
+    };
+
+    // Since the child stack for clone grows downward, we need to pass in
+    // the top of the stack address.
+    let child_stack_top = unsafe { child_stack.add(default_stack_size) };
 
     // Adds SIGCHLD flag to mimic the same behavior as fork.
     let signal = sys::signal::Signal::SIGCHLD;
-    let combined = clone_flags.bits() | signal as c_int;
-    let res = unsafe {
-        // Consistant with how pthread_create sets up the stack, we create a
-        // guard page of 1 page, to protect the child stack collision. Note, for
-        // clone call, the child stack will grow downward, so the bottom of the
-        // child stack is in the beginning.
-        mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
-            .with_context(|| "Failed to create guard page")?;
-
-        // Since the child stack for clone grows downward, we need to pass in
-        // the top of the stack address.
-        let child_stack_top = child_stack.add(default_stack_size);
-
-        // Using the clone syscall is one of the rare cases where we don't want
-        // rust to manage the child stack memory.  Instead, we want to use
-        // c_void directly here. The nix::sched::clone wrapper doesn't provide
-        // the right interface and its interface can't be changed. Therefore,
-        // here we are using libc::clone syscall directly for better control.
-        // The child stack will be cleaned when exec is called or the child
-        // process terminates.
-        libc::clone(
-            mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
-            child_stack_top,
-            combined,
-            &mut cb as *mut _ as *mut c_void,
-        )
-    };
-    let pid = Errno::result(res).map(Pid::from_raw)?;
+    let combined = clone_flags.bits() | signal as libc::c_int;
+
+    // We are passing the boxed closure "cb" into the clone function as the a
+    // function pointer in C. The box closure in Rust is both a function pointer
+    // and a struct. However, when casting the box closure into libc::c_void,
+    // the function pointer will be lost. Therefore, to work around the issue,
+    // we double box the closure. This is consistant with how std::unix::thread
+    // handles the closure.
+    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
+    let data = Box::into_raw(Box::new(cb));
+    // The main is a wrapper function passed into clone call below. The "data"
+    // arg is actually a raw pointer to a Box closure. so here, we re-box the
+    // pointer back into a box closure so the main takes ownership of the
+    // memory. Then we can call the closure passed in.
+    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
+        unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
+    }
 
-    Ok(pid)
+    // The nix::sched::clone wrapper doesn't provide the right interface.  Using
+    // the clone syscall is one of the rare cases where we don't want rust to
+    // manage the child stack memory. Instead, we want to use c_void directly
+    // here.  Therefore, here we are using libc::clone syscall directly for
+    // better control.  The child stack will be cleaned when exec is called or
+    // the child process terminates. The nix wrapper also does not treat the
+    // closure memory correctly. The wrapper implementation fails to pass the
+    // right ownership to the new child process.
+    // Ref: https://github.com/nix-rust/nix/issues/919
+    // Ref: https://github.com/nix-rust/nix/pull/920
+    let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
+    match res {
+        -1 => {
+            // Since the clone call failed, the closure passed in didn't get
+            // consumed. To complete the circle, we can safely box up the
+            // closure again and let rust manage this memory for us.
+            unsafe { drop(Box::from_raw(data)) };
+            bail!(
+                "Failed clone to create new process: {:?}",
+                Errno::result(res)
+            )
+        }
+        pid => Ok(Pid::from_raw(pid)),
+    }
 }
 
 #[cfg(test)]
@@ -115,7 +136,7 @@ mod tests {
         // namespace is needed for the test to run without root
         let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
         let pid = super::clone(
-            Box::new(|| {
+            Box::new(move || {
                 if cb().is_err() {
                     return -1;
                 }