diff --git a/Cargo.lock b/Cargo.lock index 993f4be09..0bbe55ce9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1037,6 +1037,7 @@ name = "libcontainer" version = "0.0.2" dependencies = [ "anyhow", + "bitflags", "caps", "chrono", "crossbeam-channel", @@ -1059,6 +1060,7 @@ dependencies = [ "serde", "serde_json", "serial_test", + "syscalls", "wasmer", "wasmer-wasi", ] @@ -1417,6 +1419,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "paste" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5" + [[package]] name = "path-clean" version = "0.1.0" @@ -2126,6 +2134,16 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "syscalls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a837f64f72473acde47fbd13431e83fc9081e52ae49ea1430003831536ed04d7" +dependencies = [ + "cc", + "paste", +] + [[package]] name = "sysinfo" version = "0.23.5" diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 8bb8e25b7..3514bbf32 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -18,8 +18,8 @@ wasm-wasmer = ["wasmer", "wasmer-wasi"] [dependencies] anyhow = "1.0" +bitflags = "1.3.2" caps = "0.5.3" - chrono = { version="0.4", features = ["serde"] } crossbeam-channel = "0.5" dbus = "0.9.5" @@ -37,6 +37,7 @@ libcgroups = { version = "0.0.2", path = "../libcgroups" } libseccomp = { version = "0.2.3" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +syscalls = "0.5.0" rust-criu = { git = "https://github.com/checkpoint-restore/rust-criu", version = "0.1.0" } wasmer = { version = "2.2.0", optional = true } wasmer-wasi = { version = "2.1.1", optional = true } diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 84586524a..0fab7469e 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -12,10 +12,7 @@ use nix::sched::CloneFlags; use nix::sys::stat::Mode; use nix::unistd::setsid; -use nix::{ - fcntl, - unistd::{self, Gid, Uid}, -}; +use nix::unistd::{self, Gid, Uid}; use oci_spec::runtime::{LinuxNamespaceType, Spec, User}; use std::collections::HashMap; use std::os::unix::io::AsRawFd; @@ -24,58 +21,6 @@ use std::{ path::{Path, PathBuf}, }; -// Get a list of open fds for the calling process. -fn get_open_fds() -> Result> { - const PROCFS_FD_PATH: &str = "/proc/self/fd"; - utils::ensure_procfs(Path::new(PROCFS_FD_PATH)) - .with_context(|| format!("{} is not the actual procfs", PROCFS_FD_PATH))?; - - let fds: Vec = fs::read_dir(PROCFS_FD_PATH)? - .filter_map(|entry| match entry { - Ok(entry) => Some(entry.path()), - Err(_) => None, - }) - .filter_map(|path| path.file_name().map(|file_name| file_name.to_owned())) - .filter_map(|file_name| file_name.to_str().map(String::from)) - .filter_map(|file_name| -> Option { - // Convert the file name from string into i32. Since we are looking - // at /proc//fd, anything that's not a number (i32) can be - // ignored. We are only interested in opened fds. - match file_name.parse() { - Ok(fd) => Some(fd), - Err(_) => None, - } - }) - .collect(); - - Ok(fds) -} - -// Cleanup any extra file descriptors, so the new container process will not -// leak a file descriptor from before execve gets executed. The first 3 fd will -// stay open: stdio, stdout, and stderr. We would further preserve the next -// "preserve_fds" number of fds. Set the rest of fd with CLOEXEC flag, so they -// will be closed after execve into the container payload. We can't close the -// fds immediately since we at least still need it for the pipe used to wait on -// starting the container. -fn cleanup_file_descriptors(preserve_fds: i32) -> Result<()> { - let open_fds = get_open_fds().with_context(|| "Failed to obtain opened fds")?; - // Include stdin, stdout, and stderr for fd 0, 1, and 2 respectively. - let min_fd = preserve_fds + 3; - let to_be_cleaned_up_fds: Vec = open_fds - .iter() - .filter_map(|&fd| if fd >= min_fd { Some(fd) } else { None }) - .collect(); - - to_be_cleaned_up_fds.iter().for_each(|&fd| { - // Intentionally ignore errors here -- the cases where this might fail - // are basically file descriptors that have already been closed. - let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC)); - }); - - Ok(()) -} - fn sysctl(kernel_params: &HashMap) -> Result<()> { let sys = PathBuf::from("/proc/sys"); for (kernel_param, value) in kernel_params { @@ -346,23 +291,6 @@ pub fn container_init_process( ) .context("failed to configure uid and gid")?; - // Without no new privileges, seccomp is a privileged operation. We have to - // do this before dropping capabilities. Otherwise, we should do it later, - // as close to exec as possible. - if let Some(seccomp) = linux.seccomp() { - if proc.no_new_privileges().is_none() { - let notify_fd = - seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; - sync_seccomp(notify_fd, main_sender, init_receiver) - .context("failed to sync seccomp")?; - } - } - - capabilities::reset_effective(syscall).context("Failed to reset effective capabilities")?; - if let Some(caps) = proc.capabilities() { - capabilities::drop_privileges(caps, syscall).context("Failed to drop capabilities")?; - } - // Take care of LISTEN_FDS used for systemd-active-socket. If the value is // not 0, then we have to preserve those fds as well, and set up the correct // environment variables. @@ -403,9 +331,33 @@ pub fn container_init_process( } }; - // Clean up and handle preserved fds. We only mark the fd as CLOSEXEC, so we - // don't have to worry about when the fd will be closed. - cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?; + // Cleanup any extra file descriptors, so the new container process will not + // leak a file descriptor from before execve gets executed. The first 3 fd will + // stay open: stdio, stdout, and stderr. We would further preserve the next + // "preserve_fds" number of fds. Set the rest of fd with CLOEXEC flag, so they + // will be closed after execve into the container payload. We can't close the + // fds immediately since we at least still need it for the pipe used to wait on + // starting the container. + syscall + .close_range(preserve_fds) + .with_context(|| "failed to clean up extra fds")?; + + // Without no new privileges, seccomp is a privileged operation. We have to + // do this before dropping capabilities. Otherwise, we should do it later, + // as close to exec as possible. + if let Some(seccomp) = linux.seccomp() { + if proc.no_new_privileges().is_none() { + let notify_fd = + seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; + sync_seccomp(notify_fd, main_sender, init_receiver) + .context("failed to sync seccomp")?; + } + } + + capabilities::reset_effective(syscall).context("Failed to reset effective capabilities")?; + if let Some(caps) = proc.capabilities() { + capabilities::drop_privileges(caps, syscall).context("Failed to drop capabilities")?; + } // Change directory to process.cwd if process.cwd is not empty if do_chdir { @@ -556,56 +508,10 @@ mod tests { syscall::create_syscall, test::{ArgName, MountArgs, TestHelperSyscall}, }; - use nix::{fcntl, sys, unistd}; + use nix::unistd; use oci_spec::runtime::{LinuxNamespaceBuilder, SpecBuilder, UserBuilder}; use serial_test::serial; - use std::{fs, os::unix::prelude::AsRawFd}; - - // Note: We have to run these tests here as serial. The main issue is that - // these tests has a dependency on the system state. The - // cleanup_file_descriptors test is especially evil when running with other - // tests because it would ran around close down different fds. - - #[test] - #[serial] - fn test_get_open_fds() -> Result<()> { - let file = fs::File::open("/dev/null")?; - let fd = file.as_raw_fd(); - let open_fds = super::get_open_fds()?; - - if !open_fds.iter().any(|&v| v == fd) { - bail!("Failed to find the opened dev null fds: {:?}", open_fds); - } - - // explicitly close the file before the test case returns. - drop(file); - - // The stdio fds should also be contained in the list of opened fds. - if !vec![0, 1, 2] - .iter() - .all(|&stdio_fd| open_fds.iter().any(|&open_fd| open_fd == stdio_fd)) - { - bail!("Failed to find the stdio fds: {:?}", open_fds); - } - - Ok(()) - } - - #[test] - #[serial] - fn test_cleanup_file_descriptors() -> Result<()> { - // Open a fd without the CLOEXEC flag. Rust automatically adds the flag, - // so we use fcntl::open here for more control. - let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?; - cleanup_file_descriptors(fd - 1).with_context(|| "Failed to clean up the fds")?; - let fd_flag = fcntl::fcntl(fd, fcntl::F_GETFD)?; - if (fd_flag & fcntl::FdFlag::FD_CLOEXEC.bits()) != 0 { - bail!("CLOEXEC flag is not set correctly"); - } - - unistd::close(fd)?; - Ok(()) - } + use std::fs; #[test] fn test_readonly_path() -> Result<()> { diff --git a/crates/libcontainer/src/syscall/linux.rs b/crates/libcontainer/src/syscall/linux.rs index cf90a855c..1d859ba06 100644 --- a/crates/libcontainer/src/syscall/linux.rs +++ b/crates/libcontainer/src/syscall/linux.rs @@ -1,14 +1,16 @@ //! Implements Command trait for Linux systems #[cfg_attr(coverage, no_coverage)] use std::ffi::{CStr, OsStr}; +use std::fs; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::symlink; use std::sync::Arc; use std::{any::Any, mem, path::Path, ptr}; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use caps::{CapSet, CapsHashSet}; use libc::{c_char, uid_t}; +use nix::fcntl; use nix::{ errno::Errno, fcntl::{open, OFlag}, @@ -18,11 +20,13 @@ use nix::{ unistd, unistd::{chown, fchdir, pivot_root, setgroups, sethostname, Gid, Uid}, }; +use syscalls::{syscall, SYS_close_range}; use oci_spec::runtime::LinuxRlimit; use super::Syscall; -use crate::capabilities; +use crate::syscall::syscall::CloseRange; +use crate::{capabilities, utils}; /// Empty structure to implement Command trait for #[derive(Clone)] @@ -41,6 +45,51 @@ impl LinuxSyscall { let name: Arc = Self::from_raw_buf(passwd.pw_name); name } + + fn emulate_close_range(preserve_fds: i32) -> Result<()> { + let open_fds = Self::get_open_fds().with_context(|| "failed to obtain opened fds")?; + // Include stdin, stdout, and stderr for fd 0, 1, and 2 respectively. + let min_fd = preserve_fds + 3; + let to_be_cleaned_up_fds: Vec = open_fds + .iter() + .filter_map(|&fd| if fd >= min_fd { Some(fd) } else { None }) + .collect(); + + to_be_cleaned_up_fds.iter().for_each(|&fd| { + // Intentionally ignore errors here -- the cases where this might fail + // are basically file descriptors that have already been closed. + let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC)); + }); + + Ok(()) + } + + // Get a list of open fds for the calling process. + fn get_open_fds() -> Result> { + const PROCFS_FD_PATH: &str = "/proc/self/fd"; + utils::ensure_procfs(Path::new(PROCFS_FD_PATH)) + .with_context(|| format!("{} is not the actual procfs", PROCFS_FD_PATH))?; + + let fds: Vec = fs::read_dir(PROCFS_FD_PATH)? + .filter_map(|entry| match entry { + Ok(entry) => Some(entry.path()), + Err(_) => None, + }) + .filter_map(|path| path.file_name().map(|file_name| file_name.to_owned())) + .filter_map(|file_name| file_name.to_str().map(String::from)) + .filter_map(|file_name| -> Option { + // Convert the file name from string into i32. Since we are looking + // at /proc//fd, anything that's not a number (i32) can be + // ignored. We are only interested in opened fds. + match file_name.parse() { + Ok(fd) => Some(fd), + Err(_) => None, + } + }) + .collect(); + + Ok(fds) + } } impl Syscall for LinuxSyscall { @@ -244,4 +293,103 @@ impl Syscall for LinuxSyscall { Err(e) => Err(anyhow!(e)), } } + + fn close_range(&self, preserve_fds: i32) -> Result<()> { + let result = unsafe { + syscall!( + SYS_close_range, + 3 + preserve_fds as usize, + usize::MAX, + CloseRange::CLOEXEC.bits() + ) + }; + + match result { + Ok(_) => Ok(()), + Err(e) if e == syscalls::Errno::ENOSYS || e == syscalls::Errno::EINVAL => { + // close_range was introduced in kernel 5.9 and CLOSEEXEC was introduced in + // kernel 5.11. If the kernel is older we emulate close_range in userspace. + Self::emulate_close_range(preserve_fds) + } + Err(e) => bail!(e), + } + } +} + +#[cfg(test)] +mod tests { + // Note: We have to run these tests here as serial. The main issue is that + // these tests has a dependency on the system state. The + // cleanup_file_descriptors test is especially evil when running with other + // tests because it would ran around close down different fds. + + use std::{fs, os::unix::prelude::AsRawFd}; + + use anyhow::{bail, Context, Result}; + use nix::{fcntl, sys, unistd}; + use serial_test::serial; + + use crate::syscall::Syscall; + + use super::LinuxSyscall; + + #[test] + #[serial] + fn test_get_open_fds() -> Result<()> { + let file = fs::File::open("/dev/null")?; + let fd = file.as_raw_fd(); + let open_fds = LinuxSyscall::get_open_fds()?; + + if !open_fds.iter().any(|&v| v == fd) { + bail!("failed to find the opened dev null fds: {:?}", open_fds); + } + + // explicitly close the file before the test case returns. + drop(file); + + // The stdio fds should also be contained in the list of opened fds. + if !vec![0, 1, 2] + .iter() + .all(|&stdio_fd| open_fds.iter().any(|&open_fd| open_fd == stdio_fd)) + { + bail!("failed to find the stdio fds: {:?}", open_fds); + } + + Ok(()) + } + + #[test] + #[serial] + fn test_close_range_userspace() -> Result<()> { + // Open a fd without the CLOEXEC flag. Rust automatically adds the flag, + // so we use fcntl::open here for more control. + let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?; + LinuxSyscall::emulate_close_range(0).context("failed to clean up the fds")?; + + let fd_flag = fcntl::fcntl(fd, fcntl::F_GETFD)?; + if (fd_flag & fcntl::FdFlag::FD_CLOEXEC.bits()) == 0 { + bail!("CLOEXEC flag is not set correctly"); + } + + unistd::close(fd)?; + Ok(()) + } + + #[test] + #[serial] + fn test_close_range_native() -> Result<()> { + let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?; + let syscall = LinuxSyscall {}; + syscall + .close_range(0) + .context("failed to clean up the fds")?; + + let fd_flag = fcntl::fcntl(fd, fcntl::F_GETFD)?; + if (fd_flag & fcntl::FdFlag::FD_CLOEXEC.bits()) == 0 { + bail!("CLOEXEC flag is not set correctly"); + } + + unistd::close(fd)?; + Ok(()) + } } diff --git a/crates/libcontainer/src/syscall/syscall.rs b/crates/libcontainer/src/syscall/syscall.rs index 6fd4e0d33..ddde6fc7a 100644 --- a/crates/libcontainer/src/syscall/syscall.rs +++ b/crates/libcontainer/src/syscall/syscall.rs @@ -4,6 +4,7 @@ use std::{any::Any, ffi::OsStr, path::Path, sync::Arc}; use anyhow::Result; +use bitflags::bitflags; use caps::{CapSet, CapsHashSet}; use nix::{ mount::MsFlags, @@ -41,6 +42,7 @@ pub trait Syscall { fn mknod(&self, path: &Path, kind: SFlag, perm: Mode, dev: u64) -> Result<()>; fn chown(&self, path: &Path, owner: Option, group: Option) -> Result<()>; fn set_groups(&self, groups: &[Gid]) -> Result<()>; + fn close_range(&self, preserve_fds: i32) -> Result<()>; } pub fn create_syscall() -> Box { @@ -50,3 +52,10 @@ pub fn create_syscall() -> Box { Box::new(LinuxSyscall) } } + +bitflags! { +pub struct CloseRange : usize { + const NONE = 0b00000000; + const UNSHARE = 0b00000010; + const CLOEXEC = 0b00000100; +}} diff --git a/crates/libcontainer/src/syscall/test.rs b/crates/libcontainer/src/syscall/test.rs index 1ea8bae38..6952f8195 100644 --- a/crates/libcontainer/src/syscall/test.rs +++ b/crates/libcontainer/src/syscall/test.rs @@ -227,6 +227,10 @@ impl Syscall for TestHelperSyscall { fn set_groups(&self, groups: &[Gid]) -> anyhow::Result<()> { self.mocks.act(ArgName::Groups, Box::new(groups.to_vec())) } + + fn close_range(&self, _: i32) -> anyhow::Result<()> { + todo!() + } } impl TestHelperSyscall {