Implement the container_clone using CLONE_PARENT

Signed-off-by: Eric Fang <[email protected]>
youki-dev · Mar 1, 2023 · eb2ffbf · eb2ffbf
1 parent ecbeea7
commit eb2ffbf
Showing 1 changed file with 176 additions and 2 deletions.
diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs
@@ -1,6 +1,14 @@
-use anyhow::Result;
-use nix::unistd;
+use std::num;
+
+use anyhow::{bail, Context, Result};
+use nix::errno::Errno;
+use nix::sys::mman;
 use nix::unistd::Pid;
+use nix::{sched, unistd};
+
+const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
+const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
+type CloneCb = Box<dyn FnOnce() -> isize + Send>;
 
 // Execute the cb in another process. Make the fork works more like thread_spawn
 // or clone, so it is easier to reason. Compared to clone call, fork is easier
@@ -27,8 +35,123 @@ pub fn container_fork<F: FnOnce() -> Result<i32>>(cb: F) -> Result<Pid> {
     }
 }
 
+pub fn container_clone(cb: CloneCb) -> Result<Pid> {
+    // Use sysconf to find the page size. If there is an error, we assume
+    // the default 4K page size.
+    let page_size: usize = unsafe {
+        match libc::sysconf(libc::_SC_PAGE_SIZE) {
+            -1 => DEFAULT_PAGE_SIZE,
+            x => x as usize,
+        }
+    };
+
+    // Find out the default stack max size through getrlimit.
+    let mut rlimit = libc::rlimit {
+        rlim_cur: 0,
+        rlim_max: 0,
+    };
+    unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? };
+
+    // mmap will return ENOMEM if stack size is unlimited
+    let default_stack_size = if rlimit.rlim_cur != u64::MAX {
+        rlimit.rlim_cur as usize // NOTE(review): a 0 limit makes new_unchecked below UB
+    } else {
+        log::debug!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE({DEFAULT_STACK_SIZE})");
+        DEFAULT_STACK_SIZE
+    };
+
+    // Using the clone syscall requires us to create the stack space for the
+    // child process ourselves, instead of having it taken care of for us as
+    // with the fork call. We use mmap here to create the stack. Instead of
+    // guessing how much space the child process needs, we mmap up to the
+    // system default limit, which is 8MB on most Linux systems today. This is
+    // OK since mmap will only reserve the address space upfront, instead of
+    // allocating physical memory upfront. The stack will grow as needed, up
+    // to the size reserved, so no memory is wasted here. Lastly, the child
+    // stack only needs to support the container init process set up code in
+    // Youki. When Youki calls exec into the container payload, exec will reset
+    // the stack. Note, do not use MAP_GROWSDOWN since it is not well supported.
+    // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
+    let child_stack = unsafe {
+        mman::mmap(
+            None,
+            num::NonZeroUsize::new_unchecked(default_stack_size),
+            mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
+            mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
+            -1,
+            0,
+        )?
+    };
+    // Consistent with how pthread_create sets up the stack, we create a
+    // guard page of 1 page, to protect against child stack collision. Note,
+    // for the clone call, the child stack grows downward, so the bottom of
+    // the child stack is at the beginning of the mapping.
+    unsafe {
+        mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
+            .with_context(|| "failed to create guard page")?
+    };
+
+    // Since the child stack for clone grows downward, we need to pass in
+    // the top of the stack address.
+    let child_stack_top = unsafe { child_stack.add(default_stack_size) };
+
+    // Adds SIGCHLD and CLONE_PARENT to the clone flags
+    let clone_flags: libc::c_int =
+        nix::sys::signal::Signal::SIGCHLD as libc::c_int | sched::CloneFlags::CLONE_PARENT.bits();
+
+    // We are passing the boxed closure "cb" into the clone function as a
+    // function pointer in C. The boxed closure in Rust is both a function
+    // pointer and a struct. However, when casting the boxed closure into
+    // libc::c_void, the function pointer will be lost. Therefore, to work
+    // around the issue, we double box the closure. This is consistent with
+    // how std::unix::thread handles the closure.
+    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
+    let data = Box::into_raw(Box::new(cb));
+    // The main is a wrapper function passed into the clone call below. The
+    // "data" arg is actually a raw pointer to a boxed closure, so here we
+    // re-box the pointer back into a boxed closure so that main takes
+    // ownership of the memory. Then we can call the closure passed in.
+    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
+        unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
+    }
+
+    // The nix::sched::clone wrapper doesn't provide the right interface. Using
+    // the clone syscall is one of the rare cases where we don't want rust to
+    // manage the child stack memory. Instead, we want to use c_void directly
+    // here. Therefore, we call libc::clone directly for better control. The
+    // child stack will be cleaned when exec is called or the child process
+    // terminates. NOTE(review): nothing here unmaps the stack in the calling
+    // process — confirm whether that is intended. The nix wrapper also does
+    // not pass ownership of the closure memory to the new child correctly.
+    // Ref: https://github.com/nix-rust/nix/issues/919
+    // Ref: https://github.com/nix-rust/nix/pull/920
+    let res = unsafe {
+        libc::clone(
+            main,
+            child_stack_top,
+            clone_flags,
+            data as *mut libc::c_void,
+        )
+    };
+    match res {
+        -1 => {
+            // Since the clone call failed, the closure passed in didn't get
+            // consumed. To complete the circle, we can safely box up the
+            // closure again and let rust manage this memory for us.
+            unsafe { drop(Box::from_raw(data)) };
+            bail!(
+                "failed clone to create new process: {:?}",
+                Errno::result(res)
+            )
+        }
+        pid => Ok(Pid::from_raw(pid)),
+    }
+}
+
 #[cfg(test)]
 mod test {
+    use crate::process::channel::channel;
+
     use super::*;
     use anyhow::{bail, Result};
     use nix::sys::wait::{waitpid, WaitStatus};
@@ -58,4 +181,55 @@ mod test {
             _ => bail!("test failed"),
         }
     }
+
+    #[test]
+    fn test_container_clone() -> Result<()> {
+        // The `container_clone` will create the process as a sibling process
+        // (sharing the same parent) of the calling process. In Unix, a process
+        // can only wait on its immediate child processes and can't wait on a
+        // sibling process. Therefore, to test the logic, we have to fork a
+        // process first and then let the forked process call `container_clone`.
+        // Then the testing process (the process where the test is called), which
+        // is the parent of both the forked process and the sibling process from
+        // `container_clone`, can wait on both processes.
+
+        // We need to use a channel so that the forked process can pass the pid
+        // of the sibling process to the testing process.
+        let (sender, receiver) = &mut channel::<i32>()?;
+
+        match unsafe { unistd::fork()? } {
+            unistd::ForkResult::Parent { child } => {
+                let sibling_process_pid =
+                    Pid::from_raw(receiver.recv().with_context(|| {
+                        "failed to receive the sibling pid from forked process"
+                    })?);
+                receiver.close()?;
+                match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
+                    WaitStatus::Exited(p, status) => {
+                        assert_eq!(sibling_process_pid, p);
+                        assert_eq!(status, 0);
+                    }
+                    _ => bail!("failed to wait on the sibling process"),
+                }
+                // After the sibling process exits, we can wait on the forked process.
+                match waitpid(child, None).expect("wait pid failed.") {
+                    WaitStatus::Exited(p, status) => {
+                        assert_eq!(child, p);
+                        assert_eq!(status, 0);
+                    }
+                    _ => bail!("failed to wait on the forked process"),
+                }
+            }
+            unistd::ForkResult::Child => {
+                // Inside the forked process. We call `container_clone` and pass
+                // the resulting pid to the parent process.
+                let pid = container_clone(Box::new(move || 0))?;
+                sender.send(pid.as_raw())?;
+                sender.close()?;
+                std::process::exit(0);
+            }
+        };
+
+        Ok(())
+    }
 }