Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix how closure is transferred to the clone call. #173

Merged
merged 5 commits into from
Jul 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 52 additions & 33 deletions src/container/builder_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ pub(super) struct ContainerBuilderImpl {
pub use_systemd: bool,
/// Id of the container
pub container_id: String,
/// Directory where the state of the container will be stored
pub container_dir: PathBuf,
/// OCI complient runtime spec
pub spec: Spec,
/// Root filesystem of the container
Expand Down Expand Up @@ -66,19 +64,26 @@ impl ContainerBuilderImpl {

// create the parent and child process structure so the parent and child process can sync with each other
let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?;
let mut child = child::ChildProcess::new(parent_channel)?;

let cb = Box::new(|| {
if let Err(error) = container_init(
self.init,
self.rootless.clone(),
self.spec.clone(),
self.syscall.clone(),
self.rootfs.clone(),
self.console_socket.clone(),
self.notify_path.clone(),
&mut child,
) {
let child = child::ChildProcess::new(parent_channel)?;

// This init_args will be passed to the container init process,
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
let init_args = ContainerInitArgs {
init: self.init,
syscall: self.syscall.clone(),
spec: self.spec.clone(),
rootfs: self.rootfs.clone(),
console_socket: self.console_socket.clone(),
rootless: self.rootless.clone(),
notify_path: self.notify_path.clone(),
child: child,
};

// We have to box up this closure to correctly pass to the init function
// of the new process.
let cb = Box::new(move || {
if let Err(error) = container_init(init_args) {
log::debug!("failed to run container_init: {:?}", error);
return -1;
}
Expand Down Expand Up @@ -114,22 +119,36 @@ impl ContainerBuilderImpl {
}
}

fn container_init(
init: bool,
rootless: Option<Rootless>,
spec: Spec,
command: LinuxSyscall,
rootfs: PathBuf,
console_socket: Option<FileDescriptor>,
notify_name: PathBuf,
child: &mut child::ChildProcess,
) -> Result<()> {
struct ContainerInitArgs {
/// Flag indicating if an init or a tenant container should be created
pub init: bool,
/// Interface to operating system primitives
pub syscall: LinuxSyscall,
/// OCI complient runtime spec
pub spec: Spec,
/// Root filesystem of the container
pub rootfs: PathBuf,
/// Socket to communicate the file descriptor of the ptty
pub console_socket: Option<FileDescriptor>,
/// Options for rootless containers
pub rootless: Option<Rootless>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Pipe used to communicate with the child process
pub child: child::ChildProcess,
}

fn container_init(args: ContainerInitArgs) -> Result<()> {
let command = &args.syscall;
let spec = &args.spec;
let linux = &spec.linux.as_ref().context("no linux in spec")?;
let namespaces: Namespaces = linux.namespaces.clone().into();
// need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container
let mut notify_socket: NotifyListener = NotifyListener::new(&notify_name)?;
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
let proc = &spec.process.as_ref().context("no process in spec")?;
let rootfs = &args.rootfs;
let mut child = args.child;

// if Out-of-memory score adjustment is set in specification. set the score
// value for the current process check
Expand All @@ -146,7 +165,7 @@ fn container_init(
// namespace will be created, check
// https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
// information
if rootless.is_some() {
if args.rootless.is_some() {
// child needs to be dumpable, otherwise the non root parent is not
// allowed to write the uid/gid maps
prctl::set_dumpable(true).unwrap();
Expand All @@ -165,7 +184,7 @@ fn container_init(
.context("failed to become root")?;

// set up tty if specified
if let Some(csocketfd) = console_socket {
if let Some(csocketfd) = args.console_socket {
tty::setup_console(&csocketfd)?;
}

Expand All @@ -178,7 +197,7 @@ fn container_init(
let _ = prctl::set_no_new_privileges(true);
}

if init {
if args.init {
rootfs::prepare_rootfs(
&spec,
&rootfs,
Expand All @@ -190,14 +209,14 @@ fn container_init(

// change the root of filesystem of the process to the rootfs
command
.pivot_rootfs(&rootfs)
.with_context(|| format!("Failed to pivot root to {:?}", &rootfs))?;
.pivot_rootfs(rootfs)
.with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
}

command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?;
capabilities::reset_effective(&command)?;
capabilities::reset_effective(command)?;
if let Some(caps) = &proc.capabilities {
capabilities::drop_privileges(&caps, &command)?;
capabilities::drop_privileges(&caps, command)?;
}

// notify parents that the init process is ready to execute the payload.
Expand Down
1 change: 0 additions & 1 deletion src/container/init_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ impl InitContainerBuilder {
pid_file: self.base.pid_file,
console_socket: csocketfd,
use_systemd: self.use_systemd,
container_dir,
spec,
rootfs,
rootless,
Expand Down
1 change: 0 additions & 1 deletion src/container/tenant_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ impl TenantContainerBuilder {
pid_file: self.base.pid_file,
console_socket: csocketfd,
use_systemd,
container_dir,
spec,
rootfs,
rootless,
Expand Down
10 changes: 6 additions & 4 deletions src/process/child.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ pub struct ChildProcess {
poll: Option<Poll>,
}

// Note : The original youki process first forks into 'parent' (P) and 'child' (C1) process
// of which this represents the child (C1) process. The C1 then again forks into parent process which is C1,
// and Child (C2) process. C2 is called as init process as it will run the command of the container. But form
// a process point of view, init process is child of child process, which is child of original youki process.
// Note: The original Youki process "forks" a child process using clone(2). The
// child process will become the container init process, where it will set up
// namespaces, device mounts, and etc. for the container process. Finally, the
// container init process will run the actual container payload through exec
// call. The ChildProcess will be used to synchronize between the Youki main
// process and the child process (container init process).
impl ChildProcess {
/// create a new Child process structure
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
Expand Down
123 changes: 84 additions & 39 deletions src/process/fork.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use libc::c_int;
use libc::c_void;
use nix::errno::Errno;
use nix::sched;
use nix::sys;
use nix::sys::mman;
use nix::unistd::Pid;
use std::mem;
use std::ptr;

// The clone callback is used in clone call. It is a boxed closure and it needs
// to trasfer the ownership of related memory to the new process.
type CloneCb = Box<dyn FnOnce() -> isize + Send>;

/// clone uses syscall clone(2) to create a new process for the container init
/// process. Using clone syscall gives us better control over how to can create
/// the new container process, where we can enter into namespaces directly instead
/// of using unshare and fork. This call will only create one new process, instead
/// of two using fork.
pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
extern "C" fn callback(data: *mut sched::CloneCb) -> c_int {
let cb: &mut sched::CloneCb = unsafe { &mut *data };
(*cb)() as c_int
}

pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size: usize = unsafe {
Expand Down Expand Up @@ -60,45 +57,70 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<P
0,
)?
};
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
unsafe {
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?
};

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };

// Adds SIGCHLD flag to mimic the same behavior as fork.
let signal = sys::signal::Signal::SIGCHLD;
let combined = clone_flags.bits() | signal as c_int;
let res = unsafe {
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?;

// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = child_stack.add(default_stack_size);

// Using the clone syscall is one of the rare cases where we don't want
// rust to manage the child stack memory. Instead, we want to use
// c_void directly here. The nix::sched::clone wrapper doesn't provide
// the right interface and its interface can't be changed. Therefore,
// here we are using libc::clone syscall directly for better control.
// The child stack will be cleaned when exec is called or the child
// process terminates.
libc::clone(
mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
child_stack_top,
combined,
&mut cb as *mut _ as *mut c_void,
)
};
let pid = Errno::result(res).map(Pid::from_raw)?;
let combined = clone_flags.bits() | signal as libc::c_int;

// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistant with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to a Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure passed in.
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
}

Ok(pid)
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
match res {
-1 => {
// Since the clone call failed, the closure passed in didn't get
// consumed. To complete the circle, we can safely box up the
// closure again and let rust manage this memory for us.
unsafe { drop(Box::from_raw(data)) };
bail!(
"Failed clone to create new process: {:?}",
Errno::result(res)
)
}
pid => Ok(Pid::from_raw(pid)),
}
}

#[cfg(test)]
mod tests {
use super::*;
use anyhow::bail;
use nix::sys::wait;
use nix::unistd;

#[test]
Expand All @@ -115,7 +137,7 @@ mod tests {
// namespace is needed for the test to run without root
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
let pid = super::clone(
Box::new(|| {
Box::new(move || {
if cb().is_err() {
return -1;
}
Expand Down Expand Up @@ -165,4 +187,27 @@ mod tests {

bail!("Process didn't exit correctly")
}

fn clone_closure_ownership_test_payload() -> super::CloneCb {
// The vec should not be deallocated after this function returns. The
// ownership should correctly transfer to the closure returned, to be
// passed to the clone and new child process.
let numbers: Vec<i32> = (0..101).into_iter().collect();
Box::new(move || {
assert_eq!(numbers.iter().sum::<i32>(), 5050);
0
})
}

#[test]
fn test_clone_closure_ownership() -> Result<()> {
let flags = sched::CloneFlags::empty();

let pid = super::clone(clone_closure_ownership_test_payload(), flags)?;
let exit_status =
wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child");
assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));

Ok(())
}
}