diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index d705a8da3..2e68ba65a 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -30,8 +30,6 @@ pub(super) struct ContainerBuilderImpl { pub use_systemd: bool, /// Id of the container pub container_id: String, - /// Directory where the state of the container will be stored - pub container_dir: PathBuf, /// OCI complient runtime spec pub spec: Spec, /// Root filesystem of the container @@ -66,19 +64,26 @@ impl ContainerBuilderImpl { // create the parent and child process structure so the parent and child process can sync with each other let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?; - let mut child = child::ChildProcess::new(parent_channel)?; - - let cb = Box::new(|| { - if let Err(error) = container_init( - self.init, - self.rootless.clone(), - self.spec.clone(), - self.syscall.clone(), - self.rootfs.clone(), - self.console_socket.clone(), - self.notify_path.clone(), - &mut child, - ) { + let child = child::ChildProcess::new(parent_channel)?; + + // This init_args will be passed to the container init process, + // therefore we will have to move all the variable by value. Since self + // is a shared reference, we have to clone these variables here. + let init_args = ContainerInitArgs { + init: self.init, + syscall: self.syscall.clone(), + spec: self.spec.clone(), + rootfs: self.rootfs.clone(), + console_socket: self.console_socket.clone(), + rootless: self.rootless.clone(), + notify_path: self.notify_path.clone(), + child: child, + }; + + // We have to box up this closure to correctly pass to the init function + // of the new process. + let cb = Box::new(move || { + if let Err(error) = container_init(init_args) { log::debug!("failed to run container_init: {:?}", error); return -1; } @@ -114,22 +119,36 @@ impl ContainerBuilderImpl { } } -fn container_init( - init: bool, - rootless: Option, - spec: Spec, - command: LinuxSyscall, - rootfs: PathBuf, - console_socket: Option, - notify_name: PathBuf, - child: &mut child::ChildProcess, -) -> Result<()> { +struct ContainerInitArgs { + /// Flag indicating if an init or a tenant container should be created + pub init: bool, + /// Interface to operating system primitives + pub syscall: LinuxSyscall, + /// OCI complient runtime spec + pub spec: Spec, + /// Root filesystem of the container + pub rootfs: PathBuf, + /// Socket to communicate the file descriptor of the ptty + pub console_socket: Option, + /// Options for rootless containers + pub rootless: Option, + /// Path to the Unix Domain Socket to communicate container start + pub notify_path: PathBuf, + /// Pipe used to communicate with the child process + pub child: child::ChildProcess, +} + +fn container_init(args: ContainerInitArgs) -> Result<()> { + let command = &args.syscall; + let spec = &args.spec; let linux = &spec.linux.as_ref().context("no linux in spec")?; let namespaces: Namespaces = linux.namespaces.clone().into(); // need to create the notify socket before we pivot root, since the unix // domain socket used here is outside of the rootfs of container - let mut notify_socket: NotifyListener = NotifyListener::new(¬ify_name)?; + let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?; let proc = &spec.process.as_ref().context("no process in spec")?; + let rootfs = &args.rootfs; + let mut child = args.child; // if Out-of-memory score adjustment is set in specification. set the score // value for the current process check @@ -146,7 +165,7 @@ fn container_init( // namespace will be created, check // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more // information - if rootless.is_some() { + if args.rootless.is_some() { // child needs to be dumpable, otherwise the non root parent is not // allowed to write the uid/gid maps prctl::set_dumpable(true).unwrap(); @@ -165,7 +184,7 @@ fn container_init( .context("failed to become root")?; // set up tty if specified - if let Some(csocketfd) = console_socket { + if let Some(csocketfd) = args.console_socket { tty::setup_console(&csocketfd)?; } @@ -178,7 +197,7 @@ fn container_init( let _ = prctl::set_no_new_privileges(true); } - if init { + if args.init { rootfs::prepare_rootfs( &spec, &rootfs, @@ -190,14 +209,14 @@ fn container_init( // change the root of filesystem of the process to the rootfs command - .pivot_rootfs(&rootfs) - .with_context(|| format!("Failed to pivot root to {:?}", &rootfs))?; + .pivot_rootfs(rootfs) + .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?; } command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; - capabilities::reset_effective(&command)?; + capabilities::reset_effective(command)?; if let Some(caps) = &proc.capabilities { - capabilities::drop_privileges(&caps, &command)?; + capabilities::drop_privileges(&caps, command)?; } // notify parents that the init process is ready to execute the payload. diff --git a/src/container/init_builder.rs b/src/container/init_builder.rs index da7eb2c06..389a8935b 100644 --- a/src/container/init_builder.rs +++ b/src/container/init_builder.rs @@ -72,7 +72,6 @@ impl InitContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd: self.use_systemd, - container_dir, spec, rootfs, rootless, diff --git a/src/container/tenant_builder.rs b/src/container/tenant_builder.rs index 84d4bf8c5..98998e398 100644 --- a/src/container/tenant_builder.rs +++ b/src/container/tenant_builder.rs @@ -111,7 +111,6 @@ impl TenantContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd, - container_dir, spec, rootfs, rootless, diff --git a/src/process/child.rs b/src/process/child.rs index eb790226d..ed1b09723 100644 --- a/src/process/child.rs +++ b/src/process/child.rs @@ -17,10 +17,12 @@ pub struct ChildProcess { poll: Option, } -// Note : The original youki process first forks into 'parent' (P) and 'child' (C1) process -// of which this represents the child (C1) process. The C1 then again forks into parent process which is C1, -// and Child (C2) process. C2 is called as init process as it will run the command of the container. But form -// a process point of view, init process is child of child process, which is child of original youki process. +// Note: The original Youki process "forks" a child process using clone(2). The +// child process will become the container init process, where it will set up +// namespaces, device mounts, and etc. for the container process. Finally, the +// container init process will run the actual container payload through exec +// call. The ChildProcess will be used to synchronize between the Youki main +// process and the child process (container init process). impl ChildProcess { /// create a new Child process structure pub fn new(parent_channel: ParentChannel) -> Result { diff --git a/src/process/fork.rs b/src/process/fork.rs index c1c4ebbbd..576ac7c77 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,26 +1,23 @@ +use anyhow::bail; use anyhow::Context; use anyhow::Result; -use libc::c_int; -use libc::c_void; use nix::errno::Errno; use nix::sched; use nix::sys; use nix::sys::mman; use nix::unistd::Pid; -use std::mem; use std::ptr; +// The clone callback is used in clone call. It is a boxed closure and it needs +// to trasfer the ownership of related memory to the new process. +type CloneCb = Box isize + Send>; + /// clone uses syscall clone(2) to create a new process for the container init /// process. Using clone syscall gives us better control over how to can create /// the new container process, where we can enter into namespaces directly instead /// of using unshare and fork. This call will only create one new process, instead /// of two using fork. -pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { - extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { - let cb: &mut sched::CloneCb = unsafe { &mut *data }; - (*cb)() as c_int - } - +pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result { // Use sysconf to find the page size. If there is an error, we assume // the default 4K page size. let page_size: usize = unsafe { @@ -60,45 +57,70 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

isize>) -> i32), - child_stack_top, - combined, - &mut cb as *mut _ as *mut c_void, - ) - }; - let pid = Errno::result(res).map(Pid::from_raw)?; + let combined = clone_flags.bits() | signal as libc::c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistant with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(cb)); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to a Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure passed in. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() as i32 } + } - Ok(pid) + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) }; + match res { + -1 => { + // Since the clone call failed, the closure passed in didn't get + // consumed. To complete the circle, we can safely box up the + // closure again and let rust manage this memory for us. + unsafe { drop(Box::from_raw(data)) }; + bail!( + "Failed clone to create new process: {:?}", + Errno::result(res) + ) + } + pid => Ok(Pid::from_raw(pid)), + } } #[cfg(test)] mod tests { use super::*; use anyhow::bail; + use nix::sys::wait; use nix::unistd; #[test] @@ -115,7 +137,7 @@ mod tests { // namespace is needed for the test to run without root let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER; let pid = super::clone( - Box::new(|| { + Box::new(move || { if cb().is_err() { return -1; } @@ -165,4 +187,27 @@ mod tests { bail!("Process didn't exit correctly") } + + fn clone_closure_ownership_test_payload() -> super::CloneCb { + // The vec should not be deallocated after this function returns. The + // ownership should correctly transfer to the closure returned, to be + // passed to the clone and new child process. + let numbers: Vec = (0..101).into_iter().collect(); + Box::new(move || { + assert_eq!(numbers.iter().sum::(), 5050); + 0 + }) + } + + #[test] + fn test_clone_closure_ownership() -> Result<()> { + let flags = sched::CloneFlags::empty(); + + let pid = super::clone(clone_closure_ownership_test_payload(), flags)?; + let exit_status = + wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child"); + assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0)); + + Ok(()) + } }