Skip to content

Commit

Permalink
Merge pull request #173 from yihuaf/yihuaf/163
Browse files Browse the repository at this point in the history
Fix how closure is transferred to the clone call.
  • Loading branch information
utam0k authored Jul 31, 2021
2 parents 218c7a8 + e0e9815 commit 307e44a
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 78 deletions.
85 changes: 52 additions & 33 deletions src/container/builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ pub(super) struct ContainerBuilderImpl {
pub use_systemd: bool,
/// Id of the container
pub container_id: String,
/// Directory where the state of the container will be stored
pub container_dir: PathBuf,
/// OCI complient runtime spec
pub spec: Spec,
/// Root filesystem of the container
Expand Down Expand Up @@ -66,19 +64,26 @@ impl ContainerBuilderImpl {

// create the parent and child process structure so the parent and child process can sync with each other
let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?;
let mut child = child::ChildProcess::new(parent_channel)?;

let cb = Box::new(|| {
if let Err(error) = container_init(
self.init,
self.rootless.clone(),
self.spec.clone(),
self.syscall.clone(),
self.rootfs.clone(),
self.console_socket.clone(),
self.notify_path.clone(),
&mut child,
) {
let child = child::ChildProcess::new(parent_channel)?;

// This init_args will be passed to the container init process,
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
let init_args = ContainerInitArgs {
init: self.init,
syscall: self.syscall.clone(),
spec: self.spec.clone(),
rootfs: self.rootfs.clone(),
console_socket: self.console_socket.clone(),
rootless: self.rootless.clone(),
notify_path: self.notify_path.clone(),
child: child,
};

// We have to box up this closure to correctly pass to the init function
// of the new process.
let cb = Box::new(move || {
if let Err(error) = container_init(init_args) {
log::debug!("failed to run container_init: {:?}", error);
return -1;
}
Expand Down Expand Up @@ -114,22 +119,36 @@ impl ContainerBuilderImpl {
}
}

fn container_init(
init: bool,
rootless: Option<Rootless>,
spec: Spec,
command: LinuxSyscall,
rootfs: PathBuf,
console_socket: Option<FileDescriptor>,
notify_name: PathBuf,
child: &mut child::ChildProcess,
) -> Result<()> {
struct ContainerInitArgs {
/// Flag indicating if an init or a tenant container should be created
pub init: bool,
/// Interface to operating system primitives
pub syscall: LinuxSyscall,
/// OCI complient runtime spec
pub spec: Spec,
/// Root filesystem of the container
pub rootfs: PathBuf,
/// Socket to communicate the file descriptor of the ptty
pub console_socket: Option<FileDescriptor>,
/// Options for rootless containers
pub rootless: Option<Rootless>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Pipe used to communicate with the child process
pub child: child::ChildProcess,
}

fn container_init(args: ContainerInitArgs) -> Result<()> {
let command = &args.syscall;
let spec = &args.spec;
let linux = &spec.linux.as_ref().context("no linux in spec")?;
let namespaces: Namespaces = linux.namespaces.clone().into();
// need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container
let mut notify_socket: NotifyListener = NotifyListener::new(&notify_name)?;
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
let proc = &spec.process.as_ref().context("no process in spec")?;
let rootfs = &args.rootfs;
let mut child = args.child;

// if Out-of-memory score adjustment is set in specification. set the score
// value for the current process check
Expand All @@ -146,7 +165,7 @@ fn container_init(
// namespace will be created, check
// https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
// information
if rootless.is_some() {
if args.rootless.is_some() {
// child needs to be dumpable, otherwise the non root parent is not
// allowed to write the uid/gid maps
prctl::set_dumpable(true).unwrap();
Expand All @@ -165,7 +184,7 @@ fn container_init(
.context("failed to become root")?;

// set up tty if specified
if let Some(csocketfd) = console_socket {
if let Some(csocketfd) = args.console_socket {
tty::setup_console(&csocketfd)?;
}

Expand All @@ -178,7 +197,7 @@ fn container_init(
let _ = prctl::set_no_new_privileges(true);
}

if init {
if args.init {
rootfs::prepare_rootfs(
&spec,
&rootfs,
Expand All @@ -190,14 +209,14 @@ fn container_init(

// change the root of filesystem of the process to the rootfs
command
.pivot_rootfs(&rootfs)
.with_context(|| format!("Failed to pivot root to {:?}", &rootfs))?;
.pivot_rootfs(rootfs)
.with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
}

command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?;
capabilities::reset_effective(&command)?;
capabilities::reset_effective(command)?;
if let Some(caps) = &proc.capabilities {
capabilities::drop_privileges(&caps, &command)?;
capabilities::drop_privileges(&caps, command)?;
}

// notify parents that the init process is ready to execute the payload.
Expand Down
1 change: 0 additions & 1 deletion src/container/init_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ impl InitContainerBuilder {
pid_file: self.base.pid_file,
console_socket: csocketfd,
use_systemd: self.use_systemd,
container_dir,
spec,
rootfs,
rootless,
Expand Down
1 change: 0 additions & 1 deletion src/container/tenant_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ impl TenantContainerBuilder {
pid_file: self.base.pid_file,
console_socket: csocketfd,
use_systemd,
container_dir,
spec,
rootfs,
rootless,
Expand Down
10 changes: 6 additions & 4 deletions src/process/child.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ pub struct ChildProcess {
poll: Option<Poll>,
}

// Note : The original youki process first forks into 'parent' (P) and 'child' (C1) process
// of which this represents the child (C1) process. The C1 then again forks into parent process which is C1,
// and Child (C2) process. C2 is called as init process as it will run the command of the container. But form
// a process point of view, init process is child of child process, which is child of original youki process.
// Note: The original Youki process "forks" a child process using clone(2). The
// child process will become the container init process, where it will set up
// namespaces, device mounts, and etc. for the container process. Finally, the
// container init process will run the actual container payload through exec
// call. The ChildProcess will be used to synchronize between the Youki main
// process and the child process (container init process).
impl ChildProcess {
/// create a new Child process structure
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
Expand Down
123 changes: 84 additions & 39 deletions src/process/fork.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use libc::c_int;
use libc::c_void;
use nix::errno::Errno;
use nix::sched;
use nix::sys;
use nix::sys::mman;
use nix::unistd::Pid;
use std::mem;
use std::ptr;

// The clone callback is used in clone call. It is a boxed closure and it needs
// to trasfer the ownership of related memory to the new process.
type CloneCb = Box<dyn FnOnce() -> isize + Send>;

/// clone uses syscall clone(2) to create a new process for the container init
/// process. Using clone syscall gives us better control over how to can create
/// the new container process, where we can enter into namespaces directly instead
/// of using unshare and fork. This call will only create one new process, instead
/// of two using fork.
pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
extern "C" fn callback(data: *mut sched::CloneCb) -> c_int {
let cb: &mut sched::CloneCb = unsafe { &mut *data };
(*cb)() as c_int
}

pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size: usize = unsafe {
Expand Down Expand Up @@ -60,45 +57,70 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<P
0,
)?
};
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
unsafe {
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?
};

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };

// Adds SIGCHLD flag to mimic the same behavior as fork.
let signal = sys::signal::Signal::SIGCHLD;
let combined = clone_flags.bits() | signal as c_int;
let res = unsafe {
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?;

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = child_stack.add(default_stack_size);

// Using the clone syscall is one of the rare cases where we don't want
// rust to manage the child stack memory. Instead, we want to use
// c_void directly here. The nix::sched::clone wrapper doesn't provide
// the right interface and its interface can't be changed. Therefore,
// here we are using libc::clone syscall directly for better control.
// The child stack will be cleaned when exec is called or the child
// process terminates.
libc::clone(
mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
child_stack_top,
combined,
&mut cb as *mut _ as *mut c_void,
)
};
let pid = Errno::result(res).map(Pid::from_raw)?;
let combined = clone_flags.bits() | signal as libc::c_int;

// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistant with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to a Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure passed in.
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
}

Ok(pid)
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
match res {
-1 => {
// Since the clone call failed, the closure passed in didn't get
// consumed. To complete the circle, we can safely box up the
// closure again and let rust manage this memory for us.
unsafe { drop(Box::from_raw(data)) };
bail!(
"Failed clone to create new process: {:?}",
Errno::result(res)
)
}
pid => Ok(Pid::from_raw(pid)),
}
}

#[cfg(test)]
mod tests {
use super::*;
use anyhow::bail;
use nix::sys::wait;
use nix::unistd;

#[test]
Expand All @@ -115,7 +137,7 @@ mod tests {
// namespace is needed for the test to run without root
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
let pid = super::clone(
Box::new(|| {
Box::new(move || {
if cb().is_err() {
return -1;
}
Expand Down Expand Up @@ -165,4 +187,27 @@ mod tests {

bail!("Process didn't exit correctly")
}

fn clone_closure_ownership_test_payload() -> super::CloneCb {
// The vec should not be deallocated after this function returns. The
// ownership should correctly transfer to the closure returned, to be
// passed to the clone and new child process.
let numbers: Vec<i32> = (0..101).into_iter().collect();
Box::new(move || {
assert_eq!(numbers.iter().sum::<i32>(), 5050);
0
})
}

#[test]
fn test_clone_closure_ownership() -> Result<()> {
let flags = sched::CloneFlags::empty();

let pid = super::clone(clone_closure_ownership_test_payload(), flags)?;
let exit_status =
wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child");
assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));

Ok(())
}
}

0 comments on commit 307e44a

Please sign in to comment.