diff --git a/src/commands/create.rs b/src/commands/create.rs index d3dc53233..cfe6eedba 100644 --- a/src/commands/create.rs +++ b/src/commands/create.rs @@ -19,6 +19,9 @@ pub struct Create { /// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal #[clap(short, long)] console_socket: Option, + /// Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) + #[clap(long, default_value = "0")] + preserve_fds: i32, /// name of the container instance to be started pub container_id: String, } @@ -35,12 +38,14 @@ impl Create { pid_file: Option, bundle: PathBuf, console_socket: Option, + preserve_fds: i32, ) -> Self { Self { pid_file, bundle, console_socket, container_id, + preserve_fds: preserve_fds, } } /// Starts a new container process @@ -49,6 +54,7 @@ impl Create { .with_pid_file(self.pid_file.as_ref()) .with_console_socket(self.console_socket.as_ref()) .with_root_path(root_path) + .with_preserved_fds(self.preserve_fds) .as_init(&self.bundle) .with_systemd(systemd_cgroup) .build() diff --git a/src/commands/run.rs b/src/commands/run.rs index 895a2b68f..6abd3e2d4 100644 --- a/src/commands/run.rs +++ b/src/commands/run.rs @@ -18,6 +18,9 @@ pub struct Run { /// Unix socket (file) path , which will receive file descriptor of the writing end of the pseudoterminal #[clap(short, long)] console_socket: Option, + /// Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) + #[clap(long, default_value = "0")] + preserve_fds: i32, /// name of the container instance to be started pub container_id: String, } @@ -29,6 +32,7 @@ impl Run { self.pid_file.clone(), self.bundle.clone(), self.console_socket.clone(), + self.preserve_fds, ) .exec(root_path.clone(), systemd_cgroup)?; diff --git a/src/container/builder.rs b/src/container/builder.rs index fb410fa23..4821f5267 100644 --- a/src/container/builder.rs +++ b/src/container/builder.rs @@ -14,6 +14,8 @@ pub struct 
ContainerBuilder { pub(super) pid_file: Option, /// Socket to communicate the file descriptor of the ptty pub(super) console_socket: Option, + /// File descriptors to be passed into the container process + pub(super) preserve_fds: i32, } /// Builder that can be used to configure the common properties of @@ -51,6 +53,7 @@ impl ContainerBuilder { syscall: LinuxSyscall, pid_file: None, console_socket: None, + preserve_fds: 0, } } @@ -129,4 +132,19 @@ impl ContainerBuilder { self.console_socket = path.map(|p| p.into()); self } + + /// Sets the number of additional file descriptors (beyond stdio) that + /// will be preserved and passed to the container process + /// # Example + /// + /// ```no_run + /// # use youki::container::builder::ContainerBuilder; + /// + /// ContainerBuilder::new("74f1a4cb3801".to_owned()) + /// .with_preserved_fds(5); + /// ``` + pub fn with_preserved_fds(mut self, preserved_fds: i32) -> Self { + self.preserve_fds = preserved_fds; + self + } } diff --git a/src/container/builder_impl.rs b/src/container/builder_impl.rs index 21fb101c4..2af258b63 100644 --- a/src/container/builder_impl.rs +++ b/src/container/builder_impl.rs @@ -1,22 +1,15 @@ -use std::{fs, io::Write, path::PathBuf}; - use anyhow::{Context, Result}; -use nix::{ - sched, - unistd::{Gid, Uid}, -}; use oci_spec::Spec; +use std::{fs, path::PathBuf}; use crate::{ - capabilities, cgroups, + cgroups, namespaces::Namespaces, - notify_socket::NotifyListener, - process::{child, fork, parent}, - rootfs, + process::{child, fork, init, parent}, rootless::Rootless, stdio::FileDescriptor, - syscall::{linux::LinuxSyscall, Syscall}, - tty, utils, + syscall::linux::LinuxSyscall, + utils, }; use super::{Container, ContainerStatus}; @@ -45,6 +38,8 @@ pub(super) struct ContainerBuilderImpl { pub notify_path: PathBuf, /// Container state pub container: Option, + /// File descriptors preserved/passed to the container init process. 
+ pub preserve_fds: i32, } impl ContainerBuilderImpl { @@ -69,7 +64,7 @@ impl ContainerBuilderImpl { // This init_args will be passed to the container init process, // therefore we will have to move all the variable by value. Since self // is a shared reference, we have to clone these variables here. - let init_args = ContainerInitArgs { + let init_args = init::ContainerInitArgs { init: self.init, syscall: self.syscall.clone(), spec: self.spec.clone(), @@ -77,13 +72,14 @@ impl ContainerBuilderImpl { console_socket: self.console_socket.clone(), rootless: self.rootless.clone(), notify_path: self.notify_path.clone(), + preserve_fds: self.preserve_fds, child, }; // We have to box up this closure to correctly pass to the init function // of the new process. let cb = Box::new(move || { - if let Err(error) = container_init(init_args) { + if let Err(error) = init::container_init(init_args) { log::debug!("failed to run container_init: {:?}", error); return -1; } @@ -118,118 +114,3 @@ impl ContainerBuilderImpl { Ok(()) } } - -struct ContainerInitArgs { - /// Flag indicating if an init or a tenant container should be created - pub init: bool, - /// Interface to operating system primitives - pub syscall: LinuxSyscall, - /// OCI complient runtime spec - pub spec: Spec, - /// Root filesystem of the container - pub rootfs: PathBuf, - /// Socket to communicate the file descriptor of the ptty - pub console_socket: Option, - /// Options for rootless containers - pub rootless: Option, - /// Path to the Unix Domain Socket to communicate container start - pub notify_path: PathBuf, - /// Pipe used to communicate with the child process - pub child: child::ChildProcess, -} - -fn container_init(args: ContainerInitArgs) -> Result<()> { - let command = &args.syscall; - let spec = &args.spec; - let linux = &spec.linux.as_ref().context("no linux in spec")?; - let namespaces: Namespaces = linux.namespaces.clone().into(); - // need to create the notify socket before we pivot root, since the unix 
- // domain socket used here is outside of the rootfs of container - let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?; - let proc = &spec.process.as_ref().context("no process in spec")?; - let rootfs = &args.rootfs; - let mut child = args.child; - - // if Out-of-memory score adjustment is set in specification. set the score - // value for the current process check - // https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more - // information - if let Some(ref resource) = linux.resources { - if let Some(oom_score_adj) = resource.oom_score_adj { - let mut f = fs::File::create("/proc/self/oom_score_adj")?; - f.write_all(oom_score_adj.to_string().as_bytes())?; - } - } - - // if new user is specified in specification, this will be true and new - // namespace will be created, check - // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more - // information - if args.rootless.is_some() { - // child needs to be dumpable, otherwise the non root parent is not - // allowed to write the uid/gid maps - prctl::set_dumpable(true).unwrap(); - child.request_identifier_mapping()?; - child.wait_for_mapping_ack()?; - prctl::set_dumpable(false).unwrap(); - } - - // set limits and namespaces to the process - for rlimit in proc.rlimits.iter() { - command.set_rlimit(rlimit).context("failed to set rlimit")?; - } - - command - .set_id(Uid::from_raw(0), Gid::from_raw(0)) - .context("failed to become root")?; - - // set up tty if specified - if let Some(csocketfd) = args.console_socket { - tty::setup_console(&csocketfd)?; - } - - // join existing namespaces - namespaces.apply_setns()?; - - command.set_hostname(&spec.hostname.as_ref().context("no hostname in spec")?)?; - - if proc.no_new_privileges { - let _ = prctl::set_no_new_privileges(true); - } - - if args.init { - rootfs::prepare_rootfs( - &spec, - &rootfs, - namespaces - .clone_flags - .contains(sched::CloneFlags::CLONE_NEWUSER), - ) - .with_context(|| "Failed to prepare 
rootfs")?; - - // change the root of filesystem of the process to the rootfs - command - .pivot_rootfs(rootfs) - .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?; - } - - command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; - capabilities::reset_effective(command)?; - if let Some(caps) = &proc.capabilities { - capabilities::drop_privileges(&caps, command)?; - } - - // notify parents that the init process is ready to execute the payload. - child.notify_parent()?; - - // listing on the notify socket for container start command - notify_socket.wait_for_container_start()?; - - let args: &Vec = &proc.args; - let envs: &Vec = &proc.env; - utils::do_exec(&args[0], args, envs)?; - - // After do_exec is called, the process is replaced with the container - // payload through execvp, so it should never reach here. - unreachable!(); -} diff --git a/src/container/init_builder.rs b/src/container/init_builder.rs index 389a8935b..dd824620e 100644 --- a/src/container/init_builder.rs +++ b/src/container/init_builder.rs @@ -77,6 +77,7 @@ impl InitContainerBuilder { rootless, notify_path, container: Some(container_state), + preserve_fds: self.base.preserve_fds, }; builder_impl.create()?; diff --git a/src/container/tenant_builder.rs b/src/container/tenant_builder.rs index 98998e398..a6464a9c9 100644 --- a/src/container/tenant_builder.rs +++ b/src/container/tenant_builder.rs @@ -116,6 +116,7 @@ impl TenantContainerBuilder { rootless, notify_path: notify_path.clone(), container: None, + preserve_fds: self.base.preserve_fds, }; builder_impl.create()?; diff --git a/src/process/init.rs b/src/process/init.rs new file mode 100644 index 000000000..0ae0c5ec0 --- /dev/null +++ b/src/process/init.rs @@ -0,0 +1,258 @@ +use anyhow::{bail, Context, Result}; +use nix::{ + fcntl, sched, sys, + unistd::{Gid, Uid}, +}; +use oci_spec::Spec; +use std::os::unix::io::AsRawFd; +use std::{fs, io::Write, path::Path, path::PathBuf}; + +use crate::{ + capabilities, + 
namespaces::Namespaces, + notify_socket::NotifyListener, + process::child, + rootfs, + rootless::Rootless, + stdio::FileDescriptor, + syscall::{linux::LinuxSyscall, Syscall}, + tty, utils, +}; + +// Make sure a given path is on procfs. This is to avoid the security risk that +// /proc path is mounted over. Ref: CVE-2019-16884 +fn ensure_procfs(path: &Path) -> Result<()> { + let procfs_fd = fs::File::open(path)?; + let fstat_info = sys::statfs::fstatfs(&procfs_fd.as_raw_fd())?; + + if fstat_info.filesystem_type() != sys::statfs::PROC_SUPER_MAGIC { + bail!(format!("{:?} is not on the procfs", path)); + } + + Ok(()) +} + +// Get a list of open fds for the calling process. +fn get_open_fds() -> Result> { + const PROCFS_FD_PATH: &str = "/proc/self/fd"; + ensure_procfs(Path::new(PROCFS_FD_PATH)) + .with_context(|| format!("{} is not the actual procfs", PROCFS_FD_PATH))?; + + let fds: Vec = fs::read_dir(PROCFS_FD_PATH)? + .filter_map(|entry| match entry { + Ok(entry) => Some(entry.path()), + Err(_) => None, + }) + .filter_map(|path| match path.file_name() { + Some(file_name) => Some(file_name.to_owned()), + None => None, + }) + .filter_map(|file_name| match file_name.to_str() { + Some(file_name) => Some(String::from(file_name)), + None => None, + }) + .filter_map(|file_name| -> Option { + // Convert the file name from string into i32. Since we are looking + // at /proc/self/fd, anything that's not a number (i32) can be + // ignored. We are only interested in opened fds. + match file_name.parse() { + Ok(fd) => Some(fd), + Err(_) => None, + } + }) + .collect(); + + Ok(fds) +} + +// Cleanup any extra file descriptors, so the new container process will not +// leak a file descriptor from before execve gets executed. The first 3 fd will +// stay open: stdin, stdout, and stderr. We would further preserve the next +// "preserve_fds" number of fds. Set the rest of fd with CLOEXEC flag, so they +// will be closed after execve into the container payload. 
We can't close the +// fds immediately since we at least still need them for the pipe used to wait on +// starting the container. +fn cleanup_file_descriptors(preserve_fds: i32) -> Result<()> { + let open_fds = get_open_fds().with_context(|| "Failed to obtain opened fds")?; + // Include stdin, stdout, and stderr for fd 0, 1, and 2 respectively. + let min_fd = preserve_fds + 3; + let to_be_cleaned_up_fds: Vec = open_fds + .iter() + .filter_map(|&fd| if fd >= min_fd { Some(fd) } else { None }) + .collect(); + + to_be_cleaned_up_fds.iter().for_each(|&fd| { + // Intentionally ignore errors here -- the cases where this might fail + // are basically file descriptors that have already been closed. + let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC)); + }); + + Ok(()) +} + +pub struct ContainerInitArgs { + /// Flag indicating if an init or a tenant container should be created + pub init: bool, + /// Interface to operating system primitives + pub syscall: LinuxSyscall, + /// OCI compliant runtime spec + pub spec: Spec, + /// Root filesystem of the container + pub rootfs: PathBuf, + /// Socket to communicate the file descriptor of the ptty + pub console_socket: Option, + /// Options for rootless containers + pub rootless: Option, + /// Path to the Unix Domain Socket to communicate container start + pub notify_path: PathBuf, + /// File descriptors preserved/passed to the container init process. 
+ pub preserve_fds: i32, + /// Pipe used to communicate with the child process + pub child: child::ChildProcess, +} + +pub fn container_init(args: ContainerInitArgs) -> Result<()> { + let command = &args.syscall; + let spec = &args.spec; + let linux = &spec.linux.as_ref().context("no linux in spec")?; + let namespaces: Namespaces = linux.namespaces.clone().into(); + // need to create the notify socket before we pivot root, since the unix + // domain socket used here is outside of the rootfs of container + let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?; + let proc = &spec.process.as_ref().context("no process in spec")?; + let rootfs = &args.rootfs; + let mut child = args.child; + + // if Out-of-memory score adjustment is set in specification. set the score + // value for the current process check + // https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more + // information + if let Some(ref resource) = linux.resources { + if let Some(oom_score_adj) = resource.oom_score_adj { + let mut f = fs::File::create("/proc/self/oom_score_adj")?; + f.write_all(oom_score_adj.to_string().as_bytes())?; + } + } + + // if new user is specified in specification, this will be true and new + // namespace will be created, check + // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more + // information + if args.rootless.is_some() { + // child needs to be dumpable, otherwise the non root parent is not + // allowed to write the uid/gid maps + prctl::set_dumpable(true).unwrap(); + child.request_identifier_mapping()?; + child.wait_for_mapping_ack()?; + prctl::set_dumpable(false).unwrap(); + } + + // set limits and namespaces to the process + for rlimit in proc.rlimits.iter() { + command.set_rlimit(rlimit).context("failed to set rlimit")?; + } + + command + .set_id(Uid::from_raw(0), Gid::from_raw(0)) + .context("failed to become root")?; + + // set up tty if specified + if let Some(csocketfd) = args.console_socket { + 
tty::setup_console(&csocketfd)?; + } + + // join existing namespaces + namespaces.apply_setns()?; + + command.set_hostname(&spec.hostname.as_ref().context("no hostname in spec")?)?; + + if proc.no_new_privileges { + let _ = prctl::set_no_new_privileges(true); + } + + if args.init { + rootfs::prepare_rootfs( + &spec, + &rootfs, + namespaces + .clone_flags + .contains(sched::CloneFlags::CLONE_NEWUSER), + ) + .with_context(|| "Failed to prepare rootfs")?; + + // change the root of filesystem of the process to the rootfs + command + .pivot_rootfs(rootfs) + .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?; + } + + command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?; + capabilities::reset_effective(command)?; + if let Some(caps) = &proc.capabilities { + capabilities::drop_privileges(&caps, command)?; + } + + // clean up and handle preserved fds. + cleanup_file_descriptors(args.preserve_fds).with_context(|| "Failed to clean up extra fds")?; + + // notify parents that the init process is ready to execute the payload. + child.notify_parent()?; + + // listening on the notify socket for container start command + notify_socket.wait_for_container_start()?; + + let args: &Vec = &proc.args; + let envs: &Vec = &proc.env; + utils::do_exec(&args[0], args, envs)?; + + // After do_exec is called, the process is replaced with the container + // payload through execvp, so it should never reach here. + unreachable!(); +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::{bail, Result}; + use nix::{fcntl, sys, unistd}; + use std::fs; + + #[test] + fn test_get_open_fds() -> Result<()> { + let file = fs::File::open("/dev/null")?; + let fd = file.as_raw_fd(); + let open_fds = super::get_open_fds()?; + + if !open_fds.iter().any(|&v| v == fd) { + bail!("Failed to find the opened dev null fds: {:?}", open_fds); + } + + // explicitly close the file before the test case returns. 
+ drop(file); + + // The stdio fds should also be contained in the list of opened fds. + if !vec![0, 1, 2] + .iter() + .all(|&stdio_fd| open_fds.iter().any(|&open_fd| open_fd == stdio_fd)) + { + bail!("Failed to find the stdio fds: {:?}", open_fds); + } + + Ok(()) + } + + #[test] + fn test_cleanup_file_descriptors() -> Result<()> { + // Open a fd without the CLOEXEC flag. Rust automatically adds the flag, + // so we use fcntl::open here for more control. + let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?; + cleanup_file_descriptors(fd - 1).with_context(|| "Failed to clean up the fds")?; + let fd_flag = fcntl::fcntl(fd, fcntl::F_GETFD)?; + if (fd_flag & fcntl::FdFlag::FD_CLOEXEC.bits()) != 0 { + bail!("CLOEXEC flag is not set correctly"); + } + + unistd::close(fd)?; + Ok(()) + } +} diff --git a/src/process/mod.rs b/src/process/mod.rs index 8fe35449c..c49c28d5f 100644 --- a/src/process/mod.rs +++ b/src/process/mod.rs @@ -5,6 +5,7 @@ use std::time::Duration; pub mod child; pub mod fork; +pub mod init; pub mod message; pub mod parent;