From 7d4bf0f3c6daa7394bb528dd29a98b5678783ae1 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sat, 24 Aug 2024 21:47:48 +0100 Subject: [PATCH] Fix preserve-fds flag, ensure fd closing happens as late as possible --- .../src/process/container_init_process.rs | 41 ++++++++++--------- .../src/process/container_main_process.rs | 9 ---- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 64182c1c7b..3da4df72ea 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -501,18 +501,6 @@ pub fn container_init_process( } }; - // Cleanup any extra file descriptors, so the new container process will not - // leak a file descriptor from before execve gets executed. The first 3 fd will - // stay open: stdio, stdout, and stderr. We would further preserve the next - // "preserve_fds" number of fds. Set the rest of fd with CLOEXEC flag, so they - // will be closed after execve into the container payload. We can't close the - // fds immediately since we at least still need it for the pipe used to wait on - // starting the container. - syscall.close_range(preserve_fds).map_err(|err| { - tracing::error!(?err, "failed to cleanup extra fds"); - InitProcessError::SyscallOther(err) - })?; - // Without no new privileges, seccomp is a privileged operation. We have to // do this before dropping capabilities. Otherwise, we should do it later, // as close to exec as possible. @@ -560,6 +548,28 @@ pub fn container_init_process( err })?; + // add HOME into envs if not exists + if !envs.contains_key("HOME") { + if let Some(dir_home) = utils::get_user_home(proc.user().uid()) { + envs.insert("HOME".to_owned(), dir_home.to_string_lossy().to_string()); + } + } + + // Cleanup any extra file descriptors, so the new container process will not + // leak a file descriptor from before execve gets executed. The first 3 fd will + // stay open: stdio, stdout, and stderr. We would further preserve the next + // "preserve_fds" number of fds. Set the rest of fd with CLOEXEC flag, so they + // will be closed after execve into the container payload. We can't close the + // fds immediately since we at least still need it for the pipe used to wait on + // starting the container. + // + // Note: this should happen very late, in order to avoid accidentally leaking FDs + // Please refer to https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv for more details. + syscall.close_range(preserve_fds).map_err(|err| { + tracing::error!(?err, "failed to cleanup extra fds"); + InitProcessError::SyscallOther(err) + })?; + // Initialize seccomp profile right before we are ready to execute the // payload so as few syscalls will happen between here and payload exec. The // notify socket will still need network related syscalls. @@ -581,13 +591,6 @@ pub fn container_init_process( tracing::warn!("seccomp not available, unable to set seccomp privileges!") } - // add HOME into envs if not exists - if !envs.contains_key("HOME") { - if let Some(dir_home) = utils::get_user_home(proc.user().uid()) { - envs.insert("HOME".to_owned(), dir_home.to_string_lossy().to_string()); - } - } - args.executor.validate(spec)?; args.executor.setup_envs(envs)?; diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index fd5dfcb1c7..7739a89dac 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -75,15 +75,6 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo }) }; - // Before starting the intermediate process, mark all non-stdio open files as O_CLOEXEC - // to ensure we don't leak any file descriptors to the intermediate process. - // Please refer to https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv for more details. - let syscall = container_args.syscall.create_syscall(); - syscall.close_range(0).map_err(|err| { - tracing::error!(?err, "failed to cleanup extra fds"); - ProcessError::SyscallOther(err) - })?; - let intermediate_pid = fork::container_clone(cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err)