From f29c39ed49373e44e7ce97a849f40f0c80495dd4 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 02:15:37 +0200 Subject: [PATCH 1/7] Refactor fork/clone child stack management. 1. Allocate the child stack using mmap, instead of on the stack. The LLVM may optimize the memory in the wrong way to create a race condition where to memory is still in use but free'ed. 2. Instead of having fix size 4K, we use the ulimit -s set on the system. This is consistant with what fork and pthread_create uses. --- src/process/fork.rs | 86 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/src/process/fork.rs b/src/process/fork.rs index 3d11f756e..a4ecdf516 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,15 +1,56 @@ use anyhow::Result; +use libc::c_int; +use libc::c_void; +use nix::errno::Errno; use nix::sched; use nix::unistd::Pid; +use std::mem; -pub fn clone(cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { - // unlike fork, clone requires the caller to allocate the stack. here, we use the default - // 4KB for stack size, consistant with the runc implementation. - const STACK_SIZE: usize = 4096; - let stack: &mut [u8; STACK_SIZE] = &mut [0; STACK_SIZE]; - // pass in the SIGCHID flag to mimic the effect of forking a process - let signal = nix::sys::signal::Signal::SIGCHLD; - let pid = sched::clone(cb, stack, clone_flags, Some(signal as i32))?; +pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { + extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { + let cb: &mut sched::CloneCb = unsafe { &mut *data }; + (*cb)() as c_int + } + + let child_stack_top = unsafe { + let page_size: usize = match libc::sysconf(libc::_SC_PAGE_SIZE) { + -1 => 4 * 1024, // default to 4K page size + x => x as usize, + }; + + let mut rlimit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + + Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))?; + let default_stack_size = rlimit.rlim_cur as usize; + + let child_stack = libc::mmap( + libc::PT_NULL as *mut c_void, + default_stack_size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_STACK, + -1, + 0, + ); + Errno::result(libc::mprotect(child_stack, page_size, libc::PROT_NONE))?; + let child_stack_top = child_stack.add(default_stack_size); + + child_stack_top + }; + + let res = unsafe { + let signal = nix::sys::signal::Signal::SIGCHLD; + let combined = clone_flags.bits() | signal as c_int; + libc::clone( + mem::transmute(callback as extern "C" fn(*mut Box isize>) -> i32), + child_stack_top, + combined, + &mut cb as *mut _ as *mut c_void, + ) + }; + let pid = Errno::result(res).map(Pid::from_raw)?; Ok(pid) } @@ -23,7 +64,7 @@ mod tests { #[test] fn test_fork_clone() -> Result<()> { let cb = || -> Result<()> { - // in a new pid namespace, pid of this process should be 1 + // In a new pid namespace, pid of this process should be 1 let pid = unistd::getpid(); assert_eq!(unistd::Pid::from_raw(1), pid, "PID should set to 1"); @@ -57,4 +98,31 @@ mod tests { bail!("Process didn't exit correctly") } + + #[test] + fn test_clone_stack_allocation() -> Result<()> { + let flags = sched::CloneFlags::empty(); + let pid = super::clone( + Box::new(|| { + let mut array_on_stack = [0u8; 4096]; + array_on_stack.iter_mut().for_each(|x| *x = 0); + + 0 + }), + flags, + )?; + + let status = nix::sys::wait::waitpid(pid, None)?; + if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status { + assert_eq!( + 0, exit_code, + "Process didn't exit correctly {:?}", + exit_code + ); + + return Ok(()); + } + + bail!("Process didn't exit correctly") + } } From c2ad0a62008e86b5fa6e51e66c97844412312d56 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 02:49:39 +0200 Subject: [PATCH 2/7] Adds documentation --- src/process/fork.rs | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/process/fork.rs b/src/process/fork.rs index a4ecdf516..3e4642b0c 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -3,29 +3,51 @@ use libc::c_int; use libc::c_void; use nix::errno::Errno; use nix::sched; +use nix::sys; use nix::unistd::Pid; use std::mem; +/// clone uses syscall clone(2) to create a new process for the container init +/// process. Using clone syscall gives us better control over how to can create +/// the new container process, where we can enter into namespaces directly instead +/// of using unshare and fork. This call will only create one new process, instead +/// of two using fork. pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { let cb: &mut sched::CloneCb = unsafe { &mut *data }; (*cb)() as c_int } + // Using the clone syscall requires us to create the stack space for the + // child process instead of taken cared for us like fork call. We use mmap + // here to create the stack. Instead of guessing how much space the child + // process needs, we allocate through mmap to the system default limit, + // which is 8MB on most of the linux system today. This is OK since mmap + // will only researve the address space upfront, instead of allocating + // physical memory upfront. The stack will grow as needed, up to the size + // researved, so no wasted memory here. Lastly, the child stack only needs + // to support the container init process set up code in Youki. When Youki + // calls exec into the container payload, exec will reset the stack. let child_stack_top = unsafe { + // Use sysconf to find the page size. If there is an error, we assume + // the default 4K page size. let page_size: usize = match libc::sysconf(libc::_SC_PAGE_SIZE) { -1 => 4 * 1024, // default to 4K page size x => x as usize, }; + // Find out the default stack max size through getrlimit. let mut rlimit = libc::rlimit { rlim_cur: 0, rlim_max: 0, }; - Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))?; let default_stack_size = rlimit.rlim_cur as usize; + // mmap to reserve the child stack. Note, even though we researved + // `default_stack_size`, the memory is not allocated until it is used. + // Also, do not use MAP_GROWSDOWN since it is not well supported. + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html let child_stack = libc::mmap( libc::PT_NULL as *mut c_void, default_stack_size, @@ -34,14 +56,21 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

isize>) -> i32), From a5ade9087ecd501be269af875bc52bf893ad0e3d Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 02:53:51 +0200 Subject: [PATCH 3/7] refactor to use multiple unsafe block --- src/process/fork.rs | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/process/fork.rs b/src/process/fork.rs index 3e4642b0c..9a4a41a0f 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -18,6 +18,23 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

4 * 1024, // default to 4K page size + x => x as usize, + } + }; + + // Find out the default stack max size through getrlimit. + let mut rlimit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? }; + let default_stack_size = rlimit.rlim_cur as usize; + // Using the clone syscall requires us to create the stack space for the // child process instead of taken cared for us like fork call. We use mmap // here to create the stack. Instead of guessing how much space the child @@ -28,34 +45,20 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

4 * 1024, // default to 4K page size - x => x as usize, - }; - - // Find out the default stack max size through getrlimit. - let mut rlimit = libc::rlimit { - rlim_cur: 0, - rlim_max: 0, - }; - Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))?; - let default_stack_size = rlimit.rlim_cur as usize; - - // mmap to reserve the child stack. Note, even though we researved - // `default_stack_size`, the memory is not allocated until it is used. - // Also, do not use MAP_GROWSDOWN since it is not well supported. - // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html - let child_stack = libc::mmap( + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html + let child_stack = unsafe { + // Note, do not use MAP_GROWSDOWN since it is not well supported. + libc::mmap( libc::PT_NULL as *mut c_void, default_stack_size, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_STACK, -1, 0, - ); + ) + }; + + let res = unsafe { // Consistant with how pthread_create sets up the stack, we create a // guard page of 1 page, to protect the child stack collision. Note, for // clone call, the child stack will grow downward, so the bottom of the @@ -66,10 +69,6 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

Date: Thu, 29 Jul 2021 05:55:55 +0200 Subject: [PATCH 4/7] Use nix::sys::mmap instead of libc --- src/process/fork.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/process/fork.rs b/src/process/fork.rs index 9a4a41a0f..29b4b78bc 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,11 +1,14 @@ +use anyhow::Context; use anyhow::Result; use libc::c_int; use libc::c_void; use nix::errno::Errno; use nix::sched; use nix::sys; +use nix::sys::mman; use nix::unistd::Pid; use std::mem; +use std::ptr; /// clone uses syscall clone(2) to create a new process for the container init /// process. Using clone syscall gives us better control over how to can create @@ -48,14 +51,14 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

Result

Date: Thu, 29 Jul 2021 05:58:23 +0200 Subject: [PATCH 5/7] Update cargo.lock --- oci_spec/Cargo.lock | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/oci_spec/Cargo.lock b/oci_spec/Cargo.lock index 93303448b..75bf4aaf3 100644 --- a/oci_spec/Cargo.lock +++ b/oci_spec/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + [[package]] name = "bitflags" version = "1.2.1" @@ -102,9 +108,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "libc" -version = "0.2.95" +version = "0.2.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" +checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" [[package]] name = "log" @@ -121,16 +127,26 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + [[package]] name = "nix" -version = "0.19.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +checksum = "cf1e25ee6b412c2a1e3fcb6a4499a5c1bfe7f43e014bdce9a6b6666e5aa2d187" dependencies = [ "bitflags", "cc", "cfg-if", "libc", + "memoffset", ] [[package]] From ed5294a7871711297ce2c78023a5e1529a80f4df Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 08:05:40 +0200 Subject: [PATCH 6/7] minor fix --- src/process/fork.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/process/fork.rs b/src/process/fork.rs index 29b4b78bc..5ab4959b6 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -47,10 +47,10 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

Result

Result

isize>) -> i32), child_stack_top, From 3d8094a11f22c76c53757e620e5c7adb6a1ec49b Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 19:53:16 +0200 Subject: [PATCH 7/7] Add comments explaining libc::clone --- src/process/fork.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/process/fork.rs b/src/process/fork.rs index 5ab4959b6..c1c4ebbbd 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -76,6 +76,13 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

isize>) -> i32), child_stack_top,