diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs
index 12ba2dfcf1..d8be41ffb3 100644
--- a/crates/libcontainer/src/process/fork.rs
+++ b/crates/libcontainer/src/process/fork.rs
@@ -1,6 +1,14 @@
-use anyhow::Result;
-use nix::unistd;
+use std::num;
+
+use anyhow::{bail, Context, Result};
+use nix::errno::Errno;
+use nix::sys::mman;
 use nix::unistd::Pid;
+use nix::{sched, unistd};
+
+const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
+const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
+type CloneCb = Box<dyn FnOnce() -> isize + Send>;
 
 // Execute the cb in another process. Make the fork works more like thread_spawn
 // or clone, so it is easier to reason. Compared to clone call, fork is easier
@@ -27,8 +35,178 @@ pub fn container_fork<F: FnOnce() -> Result<i32>>(cb: F) -> Result<Pid> {
     }
 }
 
+/// Owns the parent's mapping of a child stack and unmaps it when dropped.
+struct ChildStack {
+    bottom: *mut libc::c_void,
+    len: usize,
+}
+
+impl Drop for ChildStack {
+    fn drop(&mut self) {
+        // SAFETY: `bottom` and `len` describe a mapping returned by mmap that
+        // this value exclusively owns, so nothing uses it after this point.
+        if let Err(err) = unsafe { mman::munmap(self.bottom, self.len) } {
+            log::warn!("failed to unmap the child stack: {err}");
+        }
+    }
+}
+
+/// Runs `cb` in a new process created with the `clone` syscall and returns
+/// the pid of that process.
+///
+/// The process is created with `CLONE_PARENT`, so it is a sibling of the
+/// caller rather than its child, and the value returned by `cb` becomes its
+/// exit status. `CLONE_VM` is not set, so the new process works on its own
+/// copy of the closure and of the stack mapped here; the caller's copies are
+/// released before this function returns.
+///
+/// # Errors
+///
+/// Fails if the stack size limit cannot be queried or is too small, if the
+/// child stack cannot be mapped or protected, or if `clone` itself fails.
+pub fn container_clone(cb: CloneCb) -> Result<Pid> {
+    // Use sysconf to find the page size. If there is an error, we assume
+    // the default 4K page size.
+    // SAFETY: sysconf only inspects its integer argument.
+    let page_size = usize::try_from(unsafe { libc::sysconf(libc::_SC_PAGE_SIZE) })
+        .ok()
+        .filter(|size| *size > 0)
+        .unwrap_or(DEFAULT_PAGE_SIZE);
+
+    // Find out the default stack max size through getrlimit.
+    let mut rlimit = libc::rlimit {
+        rlim_cur: 0,
+        rlim_max: 0,
+    };
+    // SAFETY: `rlimit` is a valid `libc::rlimit` that we borrow exclusively.
+    Errno::result(unsafe { libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit) })
+        .context("failed to query the stack size limit")?;
+
+    // mmap will return ENOMEM if stack size is unlimited. Comparing against
+    // RLIM_INFINITY instead of u64::MAX keeps this correct on targets where
+    // rlim_t is not 64 bits wide.
+    let stack_limit = if rlimit.rlim_cur == libc::RLIM_INFINITY {
+        log::debug!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE({DEFAULT_STACK_SIZE})");
+        DEFAULT_STACK_SIZE
+    } else {
+        usize::try_from(rlimit.rlim_cur).unwrap_or(DEFAULT_STACK_SIZE)
+    };
+    // Round down to whole pages so that the top of the stack stays aligned,
+    // and insist on room beyond the guard page. This also rules out the
+    // zero-length mapping that mmap rejects.
+    let stack_size = match num::NonZeroUsize::new(stack_limit - stack_limit % page_size) {
+        Some(size) if size.get() > page_size => size,
+        _ => bail!("stack size limit of {stack_limit} bytes is too small for the child stack"),
+    };
+
+    // Using the clone syscall requires us to create the stack space for the
+    // child process instead of having it taken care of for us like the fork
+    // call does. We use mmap here to create the stack. Instead of guessing
+    // how much space the child process needs, we allocate through mmap to the
+    // system default limit, which is 8MB on most Linux systems today. This is
+    // OK since mmap will only reserve the address space upfront, instead of
+    // allocating physical memory upfront. The stack will grow as needed, up
+    // to the size reserved, so no wasted memory here. Lastly, the child stack
+    // only needs to support the container init process set up code in Youki.
+    // When Youki calls exec into the container payload, exec will reset the
+    // stack. Note, do not use MAP_GROWSDOWN since it is not well supported.
+    // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
+    // SAFETY: a fresh anonymous private mapping at a kernel-chosen address
+    // cannot alias any memory Rust already knows about.
+    let child_stack = unsafe {
+        mman::mmap(
+            None,
+            stack_size,
+            mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
+            mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
+            -1,
+            0,
+        )
+    }
+    .context("failed to map the child stack")?;
+    // From here on every exit path in the calling process unmaps its copy of
+    // the stack. Only the caller ever leaves this function: the new process
+    // terminates as soon as `main` below returns.
+    let child_stack = ChildStack {
+        bottom: child_stack,
+        len: stack_size.get(),
+    };
+
+    // Consistent with how pthread_create sets up the stack, we create a
+    // guard page of 1 page, to protect against child stack overflow. Note,
+    // for the clone call, the child stack grows downward, so the bottom of
+    // the child stack is at the beginning of the mapping.
+    // SAFETY: the first page lies inside the mapping, which spans more than
+    // one page.
+    unsafe { mman::mprotect(child_stack.bottom, page_size, mman::ProtFlags::PROT_NONE) }
+        .context("failed to create guard page")?;
+
+    // Since the child stack for clone grows downward, we need to pass in
+    // the top of the stack address.
+    // SAFETY: the offset equals the length of the mapping, so the result is
+    // the one-past-the-end address of that same allocation.
+    let child_stack_top = unsafe { child_stack.bottom.add(child_stack.len) };
+
+    // Adds SIGCHLD and CLONE_PARENT to the clone flags
+    let clone_flags: libc::c_int =
+        nix::sys::signal::Signal::SIGCHLD as libc::c_int | sched::CloneFlags::CLONE_PARENT.bits();
+
+    // We are passing the boxed closure "cb" into the clone function as a
+    // function pointer in C. The box closure in Rust is both a function pointer
+    // and a struct. However, when casting the box closure into libc::c_void,
+    // the function pointer will be lost. Therefore, to work around the issue,
+    // we double box the closure. This is consistent with how std::unix::thread
+    // handles the closure.
+    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
+    let data = Box::into_raw(Box::new(cb));
+    // The main is a wrapper function passed into the clone call below. The
+    // "data" arg is actually a raw pointer to a Box closure, so here we re-box
+    // the pointer back into a box closure so the main takes ownership of the
+    // memory. Then we can call the closure passed in.
+    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
+        // SAFETY: `data` is the pointer produced by `Box::into_raw` above and
+        // is turned back into a box exactly once in the new process.
+        let cb = unsafe { Box::from_raw(data.cast::<CloneCb>()) };
+        cb() as libc::c_int
+    }
+
+    // The nix::sched::clone wrapper doesn't provide the right interface. Using
+    // the clone syscall is one of the rare cases where we don't want rust to
+    // manage the child stack memory. Instead, we want to use c_void directly
+    // here. Therefore, here we are using libc::clone syscall directly for
+    // better control. The child stack will be cleaned when exec is called or
+    // the child process terminates. The nix wrapper also does not treat the
+    // closure memory correctly. The wrapper implementation fails to pass the
+    // right ownership to the new child process.
+    // Ref: https://github.com/nix-rust/nix/issues/919
+    // Ref: https://github.com/nix-rust/nix/pull/920
+    // SAFETY: `child_stack_top` is the top of a writable mapping that stays
+    // mapped across the call, and `data` points to a live `Box<CloneCb>`.
+    let res = unsafe {
+        libc::clone(
+            main,
+            child_stack_top,
+            clone_flags,
+            data.cast::<libc::c_void>(),
+        )
+    };
+    // Capture errno right away: releasing the closure below may overwrite it.
+    let cloned = Errno::result(res);
+    // The new process owns a copy of the closure in its own address space, so
+    // whether or not the clone succeeded, the copy in this process was not
+    // consumed. Box it up again and let Rust release it instead of leaking it.
+    // SAFETY: `data` came from `Box::into_raw` above and nothing else in this
+    // process has reclaimed it.
+    unsafe { drop(Box::from_raw(data)) };
+
+    let pid = cloned.context("failed clone to create new process")?;
+    Ok(Pid::from_raw(pid))
+}
+
 #[cfg(test)]
 mod test {
+    use crate::process::channel::channel;
+
     use super::*;
     use anyhow::{bail, Result};
     use nix::sys::wait::{waitpid, WaitStatus};
@@ -58,4 +236,59 @@ mod test {
             _ => bail!("test failed"),
         }
     }
+
+    #[test]
+    fn test_container_clone() -> Result<()> {
+        // The `container_clone` will create the process as a sibling process
+        // (share the same parent) as the calling process. In Unix, a process
+        // can only wait on its immediate child processes and can't wait on a
+        // sibling process. Therefore, to test the logic, we will have to fork a
+        // process first and then let the forked process call `container_clone`.
+        // Then the testing process (the process where test is called), which is
+        // the parent of both the forked process and the sibling process from
+        // `container_clone`, can wait on both processes.
+
+        // We need to use a channel so that the forked process can pass the pid
+        // of the sibling process to the testing process.
+        let (sender, receiver) = &mut channel::<i32>()?;
+
+        match unsafe { unistd::fork()? } {
+            unistd::ForkResult::Parent { child } => {
+                let sibling_process_pid =
+                    Pid::from_raw(receiver.recv().with_context(|| {
+                        "failed to receive the sibling pid from forked process"
+                    })?);
+                receiver.close()?;
+                match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
+                    WaitStatus::Exited(p, status) => {
+                        assert_eq!(sibling_process_pid, p);
+                        assert_eq!(status, 0);
+                    }
+                    _ => bail!("failed to wait on the sibling process"),
+                }
+                // After sibling process exits, we can wait on the forked process.
+                match waitpid(child, None).expect("wait pid failed.") {
+                    WaitStatus::Exited(p, status) => {
+                        assert_eq!(child, p);
+                        assert_eq!(status, 0);
+                    }
+                    _ => bail!("failed to wait on the forked process"),
+                }
+            }
+            unistd::ForkResult::Child => {
+                // Inside the forked process. We call `container_clone` and pass
+                // the pid to the parent process. This process must always exit
+                // here: returning an error with `?` would let a second copy of
+                // the test harness keep running.
+                let handed_over = container_clone(Box::new(move || 0)).and_then(|pid| {
+                    sender.send(pid.as_raw())?;
+                    sender.close()?;
+                    Ok(())
+                });
+                std::process::exit(if handed_over.is_ok() { 0 } else { 1 });
+            }
+        };
+
+        Ok(())
+    }
 }