From bd54128413339c9e60a2e16dcd48545a50951a6f Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 21:43:02 +0200 Subject: [PATCH 1/5] Refactor how the closure used for clone to create the new process. --- src/container/builder_impl.rs | 24 ++++++--- src/process/fork.rs | 99 +++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 47 deletions(-) diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index d705a8da3..adf1c48c8 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -68,15 +68,23 @@ impl ContainerBuilderImpl { let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?; let mut child = child::ChildProcess::new(parent_channel)?; - let cb = Box::new(|| { + let init = self.init; + let rootless = self.rootless.clone(); + let spec = self.spec.clone(); + let syscall = self.syscall.clone(); + let rootfs = self.rootfs.clone(); + let console_socket = self.console_socket.clone(); + let notify_path = self.notify_path.clone(); + + let cb = Box::new(move || { if let Err(error) = container_init( - self.init, - self.rootless.clone(), - self.spec.clone(), - self.syscall.clone(), - self.rootfs.clone(), - self.console_socket.clone(), - self.notify_path.clone(), + init, + rootless, + spec, + syscall, + rootfs, + console_socket, + notify_path, &mut child, ) { log::debug!("failed to run container_init: {:?}", error); diff --git a/src/process/fork.rs b/src/process/fork.rs index c1c4ebbbd..b7a4a760e 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -1,26 +1,23 @@ +use anyhow::bail; use anyhow::Context; use anyhow::Result; -use libc::c_int; -use libc::c_void; use nix::errno::Errno; use nix::sched; use nix::sys; use nix::sys::mman; use nix::unistd::Pid; -use std::mem; use std::ptr; +// The clone callback is used in clone call. It is a boxed closure and it needs +// to trasfer the ownership of related memory to the new process. +type CloneCb = Box isize + Send>; + /// clone uses syscall clone(2) to create a new process for the container init /// process. Using clone syscall gives us better control over how to can create /// the new container process, where we can enter into namespaces directly instead /// of using unshare and fork. This call will only create one new process, instead /// of two using fork. -pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result { - extern "C" fn callback(data: *mut sched::CloneCb) -> c_int { - let cb: &mut sched::CloneCb = unsafe { &mut *data }; - (*cb)() as c_int - } - +pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result { // Use sysconf to find the page size. If there is an error, we assume // the default 4K page size. let page_size: usize = unsafe { @@ -60,39 +57,63 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result

isize>) -> i32), - child_stack_top, - combined, - &mut cb as *mut _ as *mut c_void, - ) - }; - let pid = Errno::result(res).map(Pid::from_raw)?; + let combined = clone_flags.bits() | signal as libc::c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistant with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(cb)); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to a Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure passed in. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() as i32 } + } - Ok(pid) + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) }; + match res { + -1 => { + // Since the clone call failed, the closure passed in didn't get + // consumed. To complete the circle, we can safely box up the + // closure again and let rust manage this memory for us. + unsafe { drop(Box::from_raw(data)) }; + bail!( + "Failed clone to create new process: {:?}", + Errno::result(res) + ) + } + pid => Ok(Pid::from_raw(pid)), + } } #[cfg(test)] @@ -115,7 +136,7 @@ mod tests { // namespace is needed for the test to run without root let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER; let pid = super::clone( - Box::new(|| { + Box::new(move || { if cb().is_err() { return -1; } From f4027f583182b2d9ec30818cb381dcc2ba9bf10a Mon Sep 17 00:00:00 2001 From: yihuaf Date: Thu, 29 Jul 2021 21:55:54 +0200 Subject: [PATCH 2/5] adds unit test --- src/process/fork.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/process/fork.rs b/src/process/fork.rs index b7a4a760e..576ac7c77 100644 --- a/src/process/fork.rs +++ b/src/process/fork.rs @@ -120,6 +120,7 @@ pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result { mod tests { use super::*; use anyhow::bail; + use nix::sys::wait; use nix::unistd; #[test] @@ -186,4 +187,27 @@ mod tests { bail!("Process didn't exit correctly") } + + fn clone_closure_ownership_test_payload() -> super::CloneCb { + // The vec should not be deallocated after this function returns. The + // ownership should correctly transfer to the closure returned, to be + // passed to the clone and new child process. + let numbers: Vec = (0..101).into_iter().collect(); + Box::new(move || { + assert_eq!(numbers.iter().sum::(), 5050); + 0 + }) + } + + #[test] + fn test_clone_closure_ownership() -> Result<()> { + let flags = sched::CloneFlags::empty(); + + let pid = super::clone(clone_closure_ownership_test_payload(), flags)?; + let exit_status = + wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child"); + assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0)); + + Ok(()) + } } From 494646c0641364d89865c4203cac67bcf37b2760 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Fri, 30 Jul 2021 03:52:45 +0200 Subject: [PATCH 3/5] parent f4027f583182b2d9ec30818cb381dcc2ba9bf10a author yihuaf 1627609965 +0200 committer yihuaf 1627665696 +0200 Group the args of container_init into a struct --- src/container/builder_impl.rs | 88 ++++++++++++++++++++--------------- src/process/child.rs | 10 ++-- 2 files changed, 56 insertions(+), 42 deletions(-) diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index adf1c48c8..ed27fac60 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -21,6 +21,24 @@ use crate::{ use super::{Container, ContainerStatus}; +struct ContainerInitArgs { + /// Flag indicating if an init or a tenant container should be created + pub init: bool, + /// Interface to operating system primitives + pub syscall: LinuxSyscall, + /// OCI complient runtime spec + pub spec: Spec, + /// Root filesystem of the container + pub rootfs: PathBuf, + /// Socket to communicate the file descriptor of the ptty + pub console_socket: Option, + /// Options for rootless containers + pub rootless: Option, + /// Path to the Unix Domain Socket to communicate container start + pub notify_path: PathBuf, + /// Pipe used to communicate with the child process + pub child: child::ChildProcess, +} pub(super) struct ContainerBuilderImpl { /// Flag indicating if an init or a tenant container should be created pub init: bool, @@ -66,27 +84,26 @@ impl ContainerBuilderImpl { // create the parent and child process structure so the parent and child process can sync with each other let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?; - let mut child = child::ChildProcess::new(parent_channel)?; - - let init = self.init; - let rootless = self.rootless.clone(); - let spec = self.spec.clone(); - let syscall = self.syscall.clone(); - let rootfs = self.rootfs.clone(); - let console_socket = self.console_socket.clone(); - let notify_path = self.notify_path.clone(); - + let child = child::ChildProcess::new(parent_channel)?; + + // This init_args will be passed to the container init process, + // therefore we will have to move all the variable by value. Since self + // is a shared reference, we have to clone these variables here. + let init_args = ContainerInitArgs { + init: self.init, + syscall: self.syscall.clone(), + spec: self.spec.clone(), + rootfs: self.rootfs.clone(), + console_socket: self.console_socket.clone(), + rootless: self.rootless.clone(), + notify_path: self.notify_path.clone(), + child: child, + }; + + // We have to box up this closure to correctly pass to the init function + // of the new process. let cb = Box::new(move || { - if let Err(error) = container_init( - init, - rootless, - spec, - syscall, - rootfs, - console_socket, - notify_path, - &mut child, - ) { + if let Err(error) = container_init(init_args) { log::debug!("failed to run container_init: {:?}", error); return -1; } @@ -122,22 +139,17 @@ impl ContainerBuilderImpl { } } -fn container_init( - init: bool, - rootless: Option, - spec: Spec, - command: LinuxSyscall, - rootfs: PathBuf, - console_socket: Option, - notify_name: PathBuf, - child: &mut child::ChildProcess, -) -> Result<()> { +fn container_init(args: ContainerInitArgs) -> Result<()> { + let command = &args.syscall; + let spec = &args.spec; let linux = &spec.linux.as_ref().context("no linux in spec")?; let namespaces: Namespaces = linux.namespaces.clone().into(); // need to create the notify socket before we pivot root, since the unix // domain socket used here is outside of the rootfs of container - let mut notify_socket: NotifyListener = NotifyListener::new(¬ify_name)?; + let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?; let proc = &spec.process.as_ref().context("no process in spec")?; + let rootfs = &args.rootfs; + let mut child = args.child; // if Out-of-memory score adjustment is set in specification. set the score // value for the current process check @@ -154,7 +166,7 @@ fn container_init( // namespace will be created, check // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more // information - if rootless.is_some() { + if args.rootless.is_some() { // child needs to be dumpable, otherwise the non root parent is not // allowed to write the uid/gid maps prctl::set_dumpable(true).unwrap(); @@ -173,7 +185,7 @@ fn container_init( .context("failed to become root")?; // set up tty if specified - if let Some(csocketfd) = console_socket { + if let Some(csocketfd) = args.console_socket { tty::setup_console(&csocketfd)?; } @@ -186,7 +198,7 @@ fn container_init( let _ = prctl::set_no_new_privileges(true); } - if init { + if args.init { rootfs::prepare_rootfs( &spec, &rootfs, @@ -198,14 +210,14 @@ fn container_init( // change the root of filesystem of the process to the rootfs command - .pivot_rootfs(&rootfs) - .with_context(|| format!("Failed to pivot root to {:?}", &rootfs))?; + .pivot_rootfs(rootfs) + .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?; } command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; - capabilities::reset_effective(&command)?; + capabilities::reset_effective(command)?; if let Some(caps) = &proc.capabilities { - capabilities::drop_privileges(&caps, &command)?; + capabilities::drop_privileges(&caps, command)?; } // notify parents that the init process is ready to execute the payload. diff --git a/src/process/child.rs b/src/process/child.rs index eb790226d..ed1b09723 100644 --- a/src/process/child.rs +++ b/src/process/child.rs @@ -17,10 +17,12 @@ pub struct ChildProcess { poll: Option, } -// Note : The original youki process first forks into 'parent' (P) and 'child' (C1) process -// of which this represents the child (C1) process. The C1 then again forks into parent process which is C1, -// and Child (C2) process. C2 is called as init process as it will run the command of the container. But form -// a process point of view, init process is child of child process, which is child of original youki process. +// Note: The original Youki process "forks" a child process using clone(2). The +// child process will become the container init process, where it will set up +// namespaces, device mounts, and etc. for the container process. Finally, the +// container init process will run the actual container payload through exec +// call. The ChildProcess will be used to synchronize between the Youki main +// process and the child process (container init process). impl ChildProcess { /// create a new Child process structure pub fn new(parent_channel: ParentChannel) -> Result { From 682c1dd6cf47e6aaa5e0fef7624a7ed6bf38082e Mon Sep 17 00:00:00 2001 From: yihuaf Date: Fri, 30 Jul 2021 03:57:43 +0200 Subject: [PATCH 4/5] remove a warning where container_dir is unused. --- src/container/builder_impl.rs | 2 -- src/container/init_builder.rs | 1 - src/container/tenant_builder.rs | 1 - 3 files changed, 4 deletions(-) diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index ed27fac60..45a2616b4 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -48,8 +48,6 @@ pub(super) struct ContainerBuilderImpl { pub use_systemd: bool, /// Id of the container pub container_id: String, - /// Directory where the state of the container will be stored - pub container_dir: PathBuf, /// OCI complient runtime spec pub spec: Spec, /// Root filesystem of the container diff --git a/src/container/init_builder.rs b/src/container/init_builder.rs index da7eb2c06..389a8935b 100644 --- a/src/container/init_builder.rs +++ b/src/container/init_builder.rs @@ -72,7 +72,6 @@ impl InitContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd: self.use_systemd, - container_dir, spec, rootfs, rootless, diff --git a/src/container/tenant_builder.rs b/src/container/tenant_builder.rs index 84d4bf8c5..98998e398 100644 --- a/src/container/tenant_builder.rs +++ b/src/container/tenant_builder.rs @@ -111,7 +111,6 @@ impl TenantContainerBuilder { pid_file: self.base.pid_file, console_socket: csocketfd, use_systemd, - container_dir, spec, rootfs, rootless, From e0e9815a430d79b069fb2e311952cd4d8dac45c0 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Fri, 30 Jul 2021 19:32:26 +0200 Subject: [PATCH 5/5] Move ContainerInitArgs closer to where it is used --- src/container/builder_impl.rs | 37 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index 45a2616b4..2e68ba65a 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -21,24 +21,6 @@ use crate::{ use super::{Container, ContainerStatus}; -struct ContainerInitArgs { - /// Flag indicating if an init or a tenant container should be created - pub init: bool, - /// Interface to operating system primitives - pub syscall: LinuxSyscall, - /// OCI complient runtime spec - pub spec: Spec, - /// Root filesystem of the container - pub rootfs: PathBuf, - /// Socket to communicate the file descriptor of the ptty - pub console_socket: Option, - /// Options for rootless containers - pub rootless: Option, - /// Path to the Unix Domain Socket to communicate container start - pub notify_path: PathBuf, - /// Pipe used to communicate with the child process - pub child: child::ChildProcess, -} pub(super) struct ContainerBuilderImpl { /// Flag indicating if an init or a tenant container should be created pub init: bool, @@ -137,6 +119,25 @@ impl ContainerBuilderImpl { } } +struct ContainerInitArgs { + /// Flag indicating if an init or a tenant container should be created + pub init: bool, + /// Interface to operating system primitives + pub syscall: LinuxSyscall, + /// OCI complient runtime spec + pub spec: Spec, + /// Root filesystem of the container + pub rootfs: PathBuf, + /// Socket to communicate the file descriptor of the ptty + pub console_socket: Option, + /// Options for rootless containers + pub rootless: Option, + /// Path to the Unix Domain Socket to communicate container start + pub notify_path: PathBuf, + /// Pipe used to communicate with the child process + pub child: child::ChildProcess, +} + fn container_init(args: ContainerInitArgs) -> Result<()> { let command = &args.syscall; let spec = &args.spec;