diff --git a/Cargo.lock b/Cargo.lock index 6bc2437f81..07402f75f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -377,6 +377,16 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "clone3" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee4e061ea30800291ca09663878f3953840a69b08ce244b3e8b26e894d9f60f" +dependencies = [ + "bitflags", + "uapi", +] + [[package]] name = "cmake" version = "0.1.49" @@ -1610,6 +1620,7 @@ dependencies = [ "bitflags", "caps", "chrono", + "clone3", "crossbeam-channel", "fastrand", "futures", @@ -3114,6 +3125,32 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "uapi" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "019450240401d342e2a5bc47f7fbaeb002a38fe18197b83788750d7ffb143274" +dependencies = [ + "cc", + "cfg-if 0.1.10", + "libc", + "uapi-proc", +] + +[[package]] +name = "uapi-proc" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54de46f980cea7b2ae8d8f7f9f1c35cf7062c68343e99345ef73758f8e60975a" +dependencies = [ + "lazy_static", + "libc", + "proc-macro2", + "quote", + "regex", + "syn", +] + [[package]] name = "unicase" version = "2.6.0" diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 946fd22aa1..aacd84223a 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -48,6 +48,7 @@ wasmer-wasi = { version = "2.3.0", optional = true } wasmedge-sdk = { version = "0.7.1", optional = true } wasmtime = {version = "6.0.0", optional = true } wasmtime-wasi = {version = "6.0.0", optional = true } +clone3 = "0.2.3" [dev-dependencies] oci-spec = { version = "^0.6.0", features = ["proptests", "runtime"] } diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 12ba2dfcf1..3cd6ee75a6 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -1,37 +1,76 @@ -use anyhow::Result; -use nix::unistd; +use anyhow::{Context, Result}; +use libc::SIGCHLD; use nix::unistd::Pid; -// Execute the cb in another process. Make the fork works more like thread_spawn -// or clone, so it is easier to reason. Compared to clone call, fork is easier -// to use since fork will magically take care of all the variable copying. If -// using clone, we would have to manually make sure all the variables are -// correctly send to the new process, especially Rust borrow checker will be a -// lot of hassel to deal with every details. -pub fn container_fork Result>(cb: F) -> Result { - // here we return the child's pid in case of parent, the i32 in return signature, - // and for child, we run the callback function, and exit with the same exit code - // given by it. If there was any error when trying to run callback, exit with -1 - match unsafe { unistd::fork()? } { - unistd::ForkResult::Parent { child } => Ok(child), - unistd::ForkResult::Child => { +// Fork/Clone a sibling process that shares the same parent as the calling +// process. This is used to launch the container init process so the parent +// process of the calling process can receive ownership of the process. If we +// clone a child process as the init process, the calling process (likely the +// youki main process) will exit and the init process will be re-parented to the +// process 1 (system init process), which is not the right behavior of what we +// look for. +pub fn container_clone_sibling Result>(cb: F) -> Result { + let mut clone = clone3::Clone3::default(); + let mut pidfd = -1; + // Note: normally, an exit signal is required, but when using + // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. + // The older `clone` will not return EINVAL in this case. Instead it ignores + // the exit signal bits in the glibc wrapper. + clone.flag_pidfd(&mut pidfd).flag_parent(); + + container_clone(cb, clone).with_context(|| "failed to clone sibling process") +} + +// A simple clone wrapper to clone3 so we can share this logic in different +// fork/clone situations. We decided to minimally support kernel version >= 5.4, +// and `clone3` requires only kernel version >= 5.3. Therefore, we don't need to +// fall back to `clone` or `fork`. +fn container_clone Result>(cb:F, mut clone_cmd: clone3::Clone3) -> Result { + // Return the child's pid in case of parent/calling process, and for the + // cloned process, run the callback function, and exit with the same exit + // code returned by the callback. If there was any error when trying to run + // callback, exit with -1 + match unsafe { clone_cmd.call().with_context(|| "failed to run clone3")? } { + 0 => { + // Inside the cloned process let ret = match cb() { Err(error) => { - log::debug!("failed to run fork: {:?}", error); + log::debug!("failed to run child process in clone: {:?}", error); -1 } Ok(exit_code) => exit_code, }; std::process::exit(ret); } + pid => { + Ok(Pid::from_raw(pid)) + } } } +// Execute the cb in another process. Make the fork works more like thread_spawn +// or clone, so it is easier to reason. Compared to clone call, fork is easier +// to use since fork will magically take care of all the variable copying. If +// using clone, we would have to manually make sure all the variables are +// correctly send to the new process, especially Rust borrow checker will be a +// lot of hassel to deal with every details. +pub fn container_fork Result>(cb: F) -> Result { + // Using `clone3` to mimic the effect of `fork`. + let mut clone = clone3::Clone3::default(); + let mut pidfd = -1; + clone.flag_pidfd(&mut pidfd).exit_signal(SIGCHLD as u64); + + container_clone(cb, clone).with_context(|| "failed to fork process") +} + #[cfg(test)] mod test { + use crate::process::channel::channel; + use super::*; use anyhow::{bail, Result}; use nix::sys::wait::{waitpid, WaitStatus}; + use nix::unistd; #[test] fn test_container_fork() -> Result<()> { @@ -58,4 +97,55 @@ mod test { _ => bail!("test failed"), } } + + #[test] + fn test_container_clone_sibling() -> Result<()> { + // The `container_clone` will create the process as a sibling process + // (share the same parent) as the calling process. In Unix, a process + // can only wait on the immediate children process and can't wait the + // sibling process. Therefore, to test the logic, we will have to fork a + // process first and then let the forked process call `container_clone`. + // Then the testing process (the process where test is called), who are + // the parent to the forked process and the sibling process from + // `container_clone`, can wait on both process. + + // We need to use a channel so that the forked process can pass the pid + // of the sibling process to the testing process. + let (sender, receiver) = &mut channel::()?; + + match unsafe { unistd::fork()? } { + unistd::ForkResult::Parent { child } => { + let sibling_process_pid = + Pid::from_raw(receiver.recv().with_context(|| { + "failed to receive the sibling pid from forked process" + })?); + receiver.close()?; + match waitpid(sibling_process_pid, None).expect("wait pid failed.") { + WaitStatus::Exited(p, status) => { + assert_eq!(sibling_process_pid, p); + assert_eq!(status, 0); + } + _ => bail!("failed to wait on the sibling process"), + } + // After sibling process exits, we can wait on the forked process. + match waitpid(child, None).expect("wait pid failed.") { + WaitStatus::Exited(p, status) => { + assert_eq!(child, p); + assert_eq!(status, 0); + } + _ => bail!("failed to wait on the forked process"), + } + } + unistd::ForkResult::Child => { + // Inside the forked process. We call `container_clone` and pass + // the pid to the parent process. + let pid = container_clone_sibling(|| Ok(0))?; + sender.send(pid.as_raw())?; + sender.close()?; + std::process::exit(0); + } + }; + + Ok(()) + } }