Skip to content

Commit

Permalink
Refactor how the closure used for clone to create the new process.
Browse files Browse the repository at this point in the history
  • Loading branch information
yihuaf committed Jul 30, 2021
1 parent 218c7a8 commit bd54128
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 47 deletions.
24 changes: 16 additions & 8 deletions src/container/builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,23 @@ impl ContainerBuilderImpl {
let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?;
let mut child = child::ChildProcess::new(parent_channel)?;

let cb = Box::new(|| {
let init = self.init;
let rootless = self.rootless.clone();
let spec = self.spec.clone();
let syscall = self.syscall.clone();
let rootfs = self.rootfs.clone();
let console_socket = self.console_socket.clone();
let notify_path = self.notify_path.clone();

let cb = Box::new(move || {
if let Err(error) = container_init(
self.init,
self.rootless.clone(),
self.spec.clone(),
self.syscall.clone(),
self.rootfs.clone(),
self.console_socket.clone(),
self.notify_path.clone(),
init,
rootless,
spec,
syscall,
rootfs,
console_socket,
notify_path,
&mut child,
) {
log::debug!("failed to run container_init: {:?}", error);
Expand Down
99 changes: 60 additions & 39 deletions src/process/fork.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use libc::c_int;
use libc::c_void;
use nix::errno::Errno;
use nix::sched;
use nix::sys;
use nix::sys::mman;
use nix::unistd::Pid;
use std::mem;
use std::ptr;

// The clone callback is used in clone call. It is a boxed closure and it needs
// to trasfer the ownership of related memory to the new process.
type CloneCb = Box<dyn FnOnce() -> isize + Send>;

/// clone uses syscall clone(2) to create a new process for the container init
/// process. Using clone syscall gives us better control over how to can create
/// the new container process, where we can enter into namespaces directly instead
/// of using unshare and fork. This call will only create one new process, instead
/// of two using fork.
pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
extern "C" fn callback(data: *mut sched::CloneCb) -> c_int {
let cb: &mut sched::CloneCb = unsafe { &mut *data };
(*cb)() as c_int
}

pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size: usize = unsafe {
Expand Down Expand Up @@ -60,39 +57,63 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<P
0,
)?
};
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
unsafe {
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?
};

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };

// Adds SIGCHLD flag to mimic the same behavior as fork.
let signal = sys::signal::Signal::SIGCHLD;
let combined = clone_flags.bits() | signal as c_int;
let res = unsafe {
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?;

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = child_stack.add(default_stack_size);

// Using the clone syscall is one of the rare cases where we don't want
// rust to manage the child stack memory. Instead, we want to use
// c_void directly here. The nix::sched::clone wrapper doesn't provide
// the right interface and its interface can't be changed. Therefore,
// here we are using libc::clone syscall directly for better control.
// The child stack will be cleaned when exec is called or the child
// process terminates.
libc::clone(
mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
child_stack_top,
combined,
&mut cb as *mut _ as *mut c_void,
)
};
let pid = Errno::result(res).map(Pid::from_raw)?;
let combined = clone_flags.bits() | signal as libc::c_int;

// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistant with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to a Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure passed in.
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
}

Ok(pid)
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
match res {
-1 => {
// Since the clone call failed, the closure passed in didn't get
// consumed. To complete the circle, we can safely box up the
// closure again and let rust manage this memory for us.
unsafe { drop(Box::from_raw(data)) };
bail!(
"Failed clone to create new process: {:?}",
Errno::result(res)
)
}
pid => Ok(Pid::from_raw(pid)),
}
}

#[cfg(test)]
Expand All @@ -115,7 +136,7 @@ mod tests {
// namespace is needed for the test to run without root
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
let pid = super::clone(
Box::new(|| {
Box::new(move || {
if cb().is_err() {
return -1;
}
Expand Down

0 comments on commit bd54128

Please sign in to comment.