diff --git a/oci_spec/Cargo.lock b/oci_spec/Cargo.lock index 93303448b..75bf4aaf3 100644 --- a/oci_spec/Cargo.lock +++ b/oci_spec/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + [[package]] name = "bitflags" version = "1.2.1" @@ -102,9 +108,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "libc" -version = "0.2.95" +version = "0.2.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" [[package]] name = "log" @@ -121,16 +127,26 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + [[package]] name = "nix" -version = "0.19.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +checksum = "cf1e25ee6b412c2a1e3fcb6a4499a5c1bfe7f43e014bdce9a6b6666e5aa2d187" dependencies = [ "bitflags", "cc", "cfg-if", "libc", + "memoffset", ] [[package]] diff --git a/src/process/fork.rs b/src/process/fork.rs index 3d11f756e..c1c4ebbbd 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,15 +1,96 @@ +use anyhow::Context; use anyhow::Result; +use libc::c_int; +use libc::c_void; +use nix::errno::Errno; use nix::sched; +use nix::sys; +use nix::sys::mman; use nix::unistd::Pid; +use std::mem; +use std::ptr; -pub fn clone(cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { - // unlike fork, clone requires the caller to allocate the stack. here, we use the default - // 4KB for stack size, consistant with the runc implementation. - const STACK_SIZE: usize = 4096; - let stack: &mut [u8; STACK_SIZE] = &mut [0; STACK_SIZE]; - // pass in the SIGCHID flag to mimic the effect of forking a process - let signal = nix::sys::signal::Signal::SIGCHLD; - let pid = sched::clone(cb, stack, clone_flags, Some(signal as i32))?; +/// clone uses syscall clone(2) to create a new process for the container init +/// process. Using clone syscall gives us better control over how to can create +/// the new container process, where we can enter into namespaces directly instead +/// of using unshare and fork. This call will only create one new process, instead +/// of two using fork. +pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { + extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { + let cb: &mut sched::CloneCb = unsafe { &mut *data }; + (*cb)() as c_int + } + + // Use sysconf to find the page size. If there is an error, we assume + // the default 4K page size. + let page_size: usize = unsafe { + match libc::sysconf(libc::_SC_PAGE_SIZE) { + -1 => 4 * 1024, // default to 4K page size + x => x as usize, + } + }; + + // Find out the default stack max size through getrlimit. + let mut rlimit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? }; + let default_stack_size = rlimit.rlim_cur as usize; + + // Using the clone syscall requires us to create the stack space for the + // child process instead of taken cared for us like fork call. We use mmap + // here to create the stack. Instead of guessing how much space the child + // process needs, we allocate through mmap to the system default limit, + // which is 8MB on most of the linux system today. This is OK since mmap + // will only researve the address space upfront, instead of allocating + // physical memory upfront. The stack will grow as needed, up to the size + // researved, so no wasted memory here. Lastly, the child stack only needs + // to support the container init process set up code in Youki. When Youki + // calls exec into the container payload, exec will reset the stack. Note, + // do not use MAP_GROWSDOWN since it is not well supported. + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html + let child_stack = unsafe { + mman::mmap( + ptr::null_mut(), + default_stack_size, + mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE, + mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK, + -1, + 0, + )? + }; + + // Adds SIGCHLD flag to mimic the same behavior as fork. + let signal = sys::signal::Signal::SIGCHLD; + let combined = clone_flags.bits() | signal as c_int; + let res = unsafe { + // Consistant with how pthread_create sets up the stack, we create a + // guard page of 1 page, to protect the child stack collision. Note, for + // clone call, the child stack will grow downward, so the bottom of the + // child stack is in the beginning. + mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE) + .with_context(|| "Failed to create guard page")?; + + // Since the child stack for clone grows downward, we need to pass in + // the top of the stack address. + let child_stack_top = child_stack.add(default_stack_size); + + // Using the clone syscall is one of the rare cases where we don't want + // rust to manage the child stack memory. Instead, we want to use + // c_void directly here. The nix::sched::clone wrapper doesn't provide + // the right interface and its interface can't be changed. Therefore, + // here we are using libc::clone syscall directly for better control. + // The child stack will be cleaned when exec is called or the child + // process terminates. + libc::clone( + mem::transmute(callback as extern "C" fn(*mut Box isize>) -> i32), + child_stack_top, + combined, + &mut cb as *mut _ as *mut c_void, + ) + }; + let pid = Errno::result(res).map(Pid::from_raw)?; Ok(pid) } @@ -23,7 +104,7 @@ mod tests { #[test] fn test_fork_clone() -> Result<()> { let cb = || -> Result<()> { - // in a new pid namespace, pid of this process should be 1 + // In a new pid namespace, pid of this process should be 1 let pid = unistd::getpid(); assert_eq!(unistd::Pid::from_raw(1), pid, "PID should set to 1"); @@ -57,4 +138,31 @@ mod tests { bail!("Process didn't exit correctly") } + + #[test] + fn test_clone_stack_allocation() -> Result<()> { + let flags = sched::CloneFlags::empty(); + let pid = super::clone( + Box::new(|| { + let mut array_on_stack = [0u8; 4096]; + array_on_stack.iter_mut().for_each(|x| *x = 0); + + 0 + }), + flags, + )?; + + let status = nix::sys::wait::waitpid(pid, None)?; + if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status { + assert_eq!( + 0, exit_code, + "Process didn't exit correctly {:?}", + exit_code + ); + + return Ok(()); + } + + bail!("Process didn't exit correctly") + } }