Skip to content

Commit

Permalink
Implement the container_clone using CLONE_PARENT
Browse files Browse the repository at this point in the history
Signed-off-by: Eric Fang <[email protected]>
  • Loading branch information
yihuaf committed Mar 1, 2023
1 parent ecbeea7 commit eb2ffbf
Showing 1 changed file with 176 additions and 2 deletions.
178 changes: 176 additions & 2 deletions crates/libcontainer/src/process/fork.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
use anyhow::Result;
use nix::unistd;
use std::num;

use anyhow::{bail, Context, Result};
use nix::errno::Errno;
use nix::sys::mman;
use nix::unistd::Pid;
use nix::{sched, unistd};

const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
type CloneCb = Box<dyn FnOnce() -> isize + Send>;

// Execute the cb in another process. Make fork work more like thread_spawn
// or clone, so it is easier to reason about. Compared to the clone call, fork is easier
Expand All @@ -27,8 +35,123 @@ pub fn container_fork<F: FnOnce() -> Result<i32>>(cb: F) -> Result<Pid> {
}
}

/// Runs `cb` in a new process created with the `clone` call.
///
/// The process is created with `CLONE_PARENT` and `SIGCHLD` as the exit
/// signal. A dedicated stack with a one-page guard region is mapped for the
/// new process, and `cb` is invoked on it; the value returned by `cb` is
/// returned from the entry function handed to `clone`.
///
/// # Errors
///
/// Returns an error if the stack size limit cannot be queried or is zero, if
/// the stack cannot be mapped or its guard page cannot be protected, or if
/// the `clone` call itself fails. On every one of these paths the stack
/// mapping is released and, when `clone` failed, `cb` is dropped.
pub fn container_clone(cb: CloneCb) -> Result<Pid> {
    // Releases the calling process's mapping of the child stack when dropped.
    // CLONE_VM is not among the clone flags used below, so the new process
    // works on its own copy of the address space and does not depend on this
    // mapping staying alive in the caller.
    struct StackMapping {
        addr: *mut libc::c_void,
        len: usize,
    }

    impl Drop for StackMapping {
        fn drop(&mut self) {
            // SAFETY: `addr`/`len` describe exactly the region returned by
            // the mmap call below, and nothing in this process uses that
            // region once the guard is dropped.
            if let Err(err) = unsafe { mman::munmap(self.addr, self.len) } {
                log::warn!("failed to unmap the clone child stack: {err}");
            }
        }
    }

    // Use sysconf to find the page size. If there is an error (sysconf
    // returns -1, which does not convert to usize), we assume the default
    // 4K page size.
    // SAFETY: sysconf only reads a system configuration value.
    let page_size: usize = usize::try_from(unsafe { libc::sysconf(libc::_SC_PAGE_SIZE) })
        .unwrap_or(DEFAULT_PAGE_SIZE);

    // Find out the default stack max size through getrlimit.
    let mut rlimit = libc::rlimit {
        rlim_cur: 0,
        rlim_max: 0,
    };
    // SAFETY: `rlimit` is a valid, exclusively borrowed struct for getrlimit
    // to fill in.
    unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? };

    // mmap will return ENOMEM if stack size is unlimited, so fall back to
    // DEFAULT_STACK_SIZE in that case. The same fallback is used if the
    // limit does not fit into usize.
    let default_stack_size = if rlimit.rlim_cur == libc::RLIM_INFINITY {
        log::debug!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE({DEFAULT_STACK_SIZE})");
        DEFAULT_STACK_SIZE
    } else {
        usize::try_from(rlimit.rlim_cur).unwrap_or(DEFAULT_STACK_SIZE)
    };

    // A zero stack limit cannot be turned into a mapping. Reject it here
    // instead of constructing an invalid NonZeroUsize.
    let stack_len = num::NonZeroUsize::new(default_stack_size)
        .context("stack size limit is zero, unable to allocate the child stack")?;

    // Using the clone syscall requires us to create the stack space for the
    // child process ourselves, instead of having it taken care of for us as
    // with the fork call. We use mmap here to create the stack. Instead of
    // guessing how much space the child process needs, we allocate through
    // mmap up to the system default limit, which is 8MB on most Linux systems
    // today. This is OK since mmap will only reserve the address space
    // upfront, instead of allocating physical memory upfront. The stack will
    // grow as needed, up to the size reserved, so no memory is wasted here.
    // Lastly, the child stack only needs to support the container init
    // process set up code in Youki. When Youki calls exec into the container
    // payload, exec will reset the stack. Note, do not use MAP_GROWSDOWN
    // since it is not well supported.
    // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
    // SAFETY: this is an anonymous private mapping at a kernel-chosen
    // address, so no existing memory is affected.
    let child_stack = unsafe {
        mman::mmap(
            None,
            stack_len,
            mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
            mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
            -1,
            0,
        )?
    };
    // From here on, every return path (error or success) unmaps the stack
    // from the calling process.
    let _stack_mapping = StackMapping {
        addr: child_stack,
        len: default_stack_size,
    };

    // Consistent with how pthread_create sets up the stack, we create a
    // guard page of 1 page, to protect against child stack collision. Note,
    // for the clone call, the child stack will grow downward, so the bottom
    // of the child stack is at the beginning of the mapping.
    // SAFETY: `child_stack` is the start of the mapping created above.
    unsafe {
        mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
            .with_context(|| "failed to create guard page")?
    };

    // Since the child stack for clone grows downward, we need to pass in
    // the top of the stack address.
    // SAFETY: the offset is exactly the length of the mapping, so the result
    // is the one-past-the-end address of the same allocation.
    let child_stack_top = unsafe { child_stack.add(default_stack_size) };

    // Adds SIGCHLD and CLONE_PARENT to the clone flags
    let clone_flags: libc::c_int =
        nix::sys::signal::Signal::SIGCHLD as libc::c_int | sched::CloneFlags::CLONE_PARENT.bits();

    // We are passing the boxed closure "cb" into the clone function as a
    // function pointer in C. The boxed closure in Rust is both a function
    // pointer and a struct. However, when casting the boxed closure into
    // libc::c_void, the function pointer will be lost. Therefore, to work
    // around the issue, we double box the closure. This is consistent with
    // how std::unix::thread handles the closure.
    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
    let data = Box::into_raw(Box::new(cb));
    // The main is a wrapper function passed into the clone call below. The
    // "data" arg is actually a raw pointer to a boxed closure, so here we
    // re-box the pointer back into a boxed closure so that main takes
    // ownership of the memory. Then we can call the closure passed in.
    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
        // SAFETY: `data` is the pointer produced by Box::into_raw above and
        // is re-boxed exactly once, in the new process.
        unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
    }

    // The nix::sched::clone wrapper doesn't provide the right interface. Using
    // the clone syscall is one of the rare cases where we don't want rust to
    // manage the child stack memory. Instead, we want to use c_void directly
    // here. Therefore, here we are using the libc::clone syscall directly for
    // better control. The child's copy of the stack will be cleaned up when
    // exec is called or the child process terminates. The nix wrapper also
    // does not treat the closure memory correctly. The wrapper implementation
    // fails to pass the right ownership to the new child process.
    // Ref: https://github.com/nix-rust/nix/issues/919
    // Ref: https://github.com/nix-rust/nix/pull/920
    // SAFETY: `child_stack_top` points to the top of a live, writable
    // mapping and `data` is a valid pointer for `main` to take ownership of.
    let res = unsafe {
        libc::clone(
            main,
            child_stack_top,
            clone_flags,
            data as *mut libc::c_void,
        )
    };
    match res {
        -1 => {
            // Capture errno right away: dropping the closure below may run
            // arbitrary code that overwrites it.
            let clone_result = Errno::result(res);
            // Since the clone call failed, the closure passed in didn't get
            // consumed. To complete the circle, we can safely box up the
            // closure again and let rust manage this memory for us.
            // SAFETY: no new process was created, so this is the only owner
            // of `data`.
            unsafe { drop(Box::from_raw(data)) };
            bail!(
                "failed clone to create new process: {:?}",
                clone_result
            )
        }
        pid => Ok(Pid::from_raw(pid)),
    }
}

#[cfg(test)]
mod test {
use crate::process::channel::channel;

use super::*;
use anyhow::{bail, Result};
use nix::sys::wait::{waitpid, WaitStatus};
Expand Down Expand Up @@ -58,4 +181,55 @@ mod test {
_ => bail!("test failed"),
}
}

#[test]
fn test_container_clone() -> Result<()> {
    // `container_clone` creates the new process as a sibling of the calling
    // process (they share the same parent). In Unix, a process can only wait
    // on its immediate children and can't wait on a sibling process.
    // Therefore, to test the logic, we have to fork a process first and then
    // let the forked process call `container_clone`. The testing process
    // (the process where the test is called), which is the parent of both
    // the forked process and the sibling process from `container_clone`,
    // can then wait on both processes.

    // We need to use a channel so that the forked process can pass the pid
    // of the sibling process to the testing process.
    let (sender, receiver) = &mut channel::<i32>()?;

    match unsafe { unistd::fork()? } {
        unistd::ForkResult::Parent { child } => {
            let sibling_process_pid =
                Pid::from_raw(receiver.recv().with_context(|| {
                    "failed to receive the sibling pid from forked process"
                })?);
            receiver.close()?;
            match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
                WaitStatus::Exited(p, status) => {
                    assert_eq!(sibling_process_pid, p);
                    assert_eq!(status, 0);
                }
                _ => bail!("failed to wait on the sibling process"),
            }
            // After sibling process exits, we can wait on the forked process.
            match waitpid(child, None).expect("wait pid failed.") {
                WaitStatus::Exited(p, status) => {
                    assert_eq!(child, p);
                    assert_eq!(status, 0);
                }
                _ => bail!("failed to wait on the forked process"),
            }
        }
        unistd::ForkResult::Child => {
            // Inside the forked process. We call `container_clone` and pass
            // the pid to the parent process. Any failure is collected here
            // rather than propagated with `?`, so that the forked process
            // always terminates below and never returns into the test
            // harness it inherited from the testing process.
            let outcome = (|| -> Result<()> {
                let pid = container_clone(Box::new(move || 0))?;
                sender.send(pid.as_raw())?;
                sender.close()?;
                Ok(())
            })();
            let exit_code = match outcome {
                Ok(()) => 0,
                Err(err) => {
                    eprintln!("forked process failed: {err:?}");
                    1
                }
            };
            std::process::exit(exit_code);
        }
    };

    Ok(())
}
}

0 comments on commit eb2ffbf

Please sign in to comment.