From 66be626d7dea54feb620de96592efd24d2f45bd3 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 12 Dec 2024 23:22:06 +0100 Subject: [PATCH] linux: fix a hang if there are no reads from the tty avoid a "crun exec" hang when the the other end of the terminal stopped reading. That happened because `copy_fd_to_fd` tried to write everything that it has received from the source fd, so it would hang the current process. Prevent that using non blocking file descriptors and using epoll to detect when the file descriptor is available for write. Fixes: https://issues.redhat.com/browse/OCPBUGS-45632 Signed-off-by: Giuseppe Scrivano --- src/libcrun/container.c | 83 +++++++++++++++++++---------- src/libcrun/utils.c | 112 ++++++++++++++++++++++++++++++++++++++++ src/libcrun/utils.h | 33 ++++++++++++ 3 files changed, 201 insertions(+), 27 deletions(-) diff --git a/src/libcrun/container.c b/src/libcrun/container.c index d06d1c20d..ecefc7e76 100644 --- a/src/libcrun/container.c +++ b/src/libcrun/container.c @@ -1968,14 +1968,21 @@ struct wait_for_process_args static int wait_for_process (struct wait_for_process_args *args, libcrun_error_t *err) { + cleanup_channel_fd_pair struct channel_fd_pair *from_terminal = NULL; + cleanup_channel_fd_pair struct channel_fd_pair *to_terminal = NULL; + int ret, container_exit_code = 0, last_process; + cleanup_close int terminal_fd_from = -1; + cleanup_close int terminal_fd_to = -1; + const size_t max_events = 10; cleanup_close int epollfd = -1; cleanup_close int signalfd = -1; - int ret, container_exit_code = 0, last_process; sigset_t mask; - int in_fds[10]; - int in_levelfds[10]; - int in_levelfds_len = 0; + int in_fds[max_events]; int in_fds_len = 0; + int out_fds[max_events]; + int out_fds_len = 0; + size_t i; + cleanup_seccomp_notify_context struct seccomp_notify_context_s *seccomp_notify_ctx = NULL; container_exit_code = 0; @@ -1983,6 +1990,12 @@ wait_for_process (struct wait_for_process_args *args, libcrun_error_t *err) if (args == NULL || args->context == NULL) return crun_make_error (err, 0, "internal error: context is empty"); + for (i = 0; i < max_events; i++) + { + in_fds[i] = -1; + out_fds[i] = -1; + } + if (args->context->pid_file) { char buf[32]; @@ -2054,41 +2067,71 @@ wait_for_process (struct wait_for_process_args *args, libcrun_error_t *err) in_fds[in_fds_len++] = args->seccomp_notify_fd; } + if (args->terminal_fd >= 0) + { + /* The terminal_fd is dup()ed so that it can be registered with + epoll multiple times using different masks. */ + terminal_fd_from = dup (args->terminal_fd); + if (UNLIKELY (terminal_fd_from < 0)) + return crun_make_error (err, errno, "dup terminal fd"); + terminal_fd_to = dup (args->terminal_fd); + if (UNLIKELY (terminal_fd_to < 0)) + return crun_make_error (err, errno, "dup terminal fd"); + + int i, non_blocking_fds[] = { terminal_fd_from, terminal_fd_to, 0, 1, -1 }; + for (i = 0; non_blocking_fds[i] >= 0; i++) + { + ret = set_blocking_fd (non_blocking_fds[i], false, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + from_terminal = channel_fd_pair_new (terminal_fd_from, 1, BUFSIZ); + to_terminal = channel_fd_pair_new (0, terminal_fd_to, BUFSIZ); + } + in_fds[in_fds_len++] = signalfd; if (args->notify_socket >= 0) in_fds[in_fds_len++] = args->notify_socket; if (args->terminal_fd >= 0) { in_fds[in_fds_len++] = 0; - in_levelfds[in_levelfds_len++] = args->terminal_fd; + out_fds[out_fds_len++] = terminal_fd_to; + + in_fds[in_fds_len++] = terminal_fd_from; + out_fds[out_fds_len++] = 1; } - in_fds[in_fds_len++] = -1; - in_levelfds[in_levelfds_len++] = -1; - epollfd = epoll_helper (in_fds, in_levelfds, NULL, NULL, err); + epollfd = epoll_helper (in_fds, NULL, out_fds, NULL, err); if (UNLIKELY (epollfd < 0)) return epollfd; while (1) { + struct epoll_event events[max_events]; struct signalfd_siginfo si; struct winsize ws; - ssize_t res; - struct epoll_event events[10]; int i, nr_events; + ssize_t res; - nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, 10, -1)); + nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, max_events, -1)); if (UNLIKELY (nr_events < 0)) return crun_make_error (err, errno, "epoll_wait"); for (i = 0; i < nr_events; i++) { - if (events[i].data.fd == 0) + if (events[i].data.fd == 0 || events[i].data.fd == terminal_fd_to) { - ret = copy_from_fd_to_fd (0, args->terminal_fd, 0, err); + ret = channel_fd_pair_process (to_terminal, epollfd, err); if (UNLIKELY (ret < 0)) return crun_error_wrap (err, "copy to terminal fd"); } + else if (events[i].data.fd == 1 || events[i].data.fd == terminal_fd_from) + { + ret = channel_fd_pair_process (from_terminal, epollfd, err); + if (UNLIKELY (ret < 0)) + return crun_error_wrap (err, "copy from terminal fd"); + } else if (events[i].data.fd == args->seccomp_notify_fd) { ret = libcrun_seccomp_notify_plugins (seccomp_notify_ctx, @@ -2096,20 +2139,6 @@ wait_for_process (struct wait_for_process_args *args, libcrun_error_t *err) if (UNLIKELY (ret < 0)) return ret; } - else if (events[i].data.fd == args->terminal_fd) - { - ret = set_blocking_fd (args->terminal_fd, false, err); - if (UNLIKELY (ret < 0)) - return crun_error_wrap (err, "set terminal fd not blocking"); - - ret = copy_from_fd_to_fd (args->terminal_fd, 1, 1, err); - if (UNLIKELY (ret < 0)) - return crun_error_wrap (err, "copy from terminal fd"); - - ret = set_blocking_fd (args->terminal_fd, true, err); - if (UNLIKELY (ret < 0)) - return crun_error_wrap (err, "set terminal fd blocking"); - } else if (events[i].data.fd == args->notify_socket) { ret = handle_notify_socket (args->notify_socket, err); diff --git a/src/libcrun/utils.c b/src/libcrun/utils.c index 901123efd..5d0550b1c 100644 --- a/src/libcrun/utils.c +++ b/src/libcrun/utils.c @@ -19,6 +19,7 @@ #define _GNU_SOURCE #include #include "utils.h" +#include "ring_buffer.h" #include #include #include @@ -1241,6 +1242,25 @@ create_signalfd (sigset_t *mask, libcrun_error_t *err) return ret; } +static int +epoll_helper_toggle (int epollfd, int fd, int events, libcrun_error_t *err) +{ + struct epoll_event ev = {}; + bool add = events != 0; + int ret; + + ev.events = events; + ev.data.fd = fd; + ret = epoll_ctl (epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_DEL, fd, &ev); + if (UNLIKELY (ret < 0)) + { + if (errno == EEXIST || errno == ENOENT) + return 0; + return crun_make_error (err, errno, "epoll_ctl `%s` `%d`", add ? "add" : "del", fd); + } + return 0; +} + int epoll_helper (int *in_fds, int *in_levelfds, int *out_fds, int *out_levelfds, libcrun_error_t *err) { @@ -2650,3 +2670,95 @@ cpuset_string_to_bitmask (const char *str, char **out, size_t *out_size, libcrun invalid_input: return crun_make_error (err, 0, "cannot parse input `%s`", str); } + +struct channel_fd_pair +{ + struct ring_buffer rb; + + int in_fd; + int out_fd; + + int infd_epoll_events; + int outfd_epoll_events; +}; + +struct channel_fd_pair * +channel_fd_pair_new (int in_fd, int out_fd, size_t size) +{ + struct channel_fd_pair *channel = xmalloc (sizeof (struct channel_fd_pair)); + channel->in_fd = in_fd; + channel->out_fd = out_fd; + ring_buffer_make (&channel->rb, size); + channel->infd_epoll_events = -1; + channel->outfd_epoll_events = -1; + return channel; +} + +void +channel_fd_pair_free (struct channel_fd_pair *channel) +{ + if (channel == NULL) + return; + + ring_buffer_free (&channel->rb); + free (channel); +} + +int +channel_fd_pair_process (struct channel_fd_pair *channel, int epollfd, libcrun_error_t *err) +{ + bool is_input_eagain = false, is_output_eagain = false, repeat; + int ret, i; + + /* This function is called from an epoll loop. Use a hard limit to avoid infinite loops + and prevent other events from being processed. */ + for (i = 0, repeat = true; i < 1000 && repeat; i++) + { + repeat = false; + if (ring_buffer_get_space_available (&(channel->rb)) >= channel->rb.size / 2) + { + ret = ring_buffer_read (&(channel->rb), channel->in_fd, &is_input_eagain, err); + if (UNLIKELY (ret < 0)) + return ret; + if (ret > 0) + repeat = true; + } + if (ring_buffer_get_data_available (&(channel->rb)) > 0) + { + ret = ring_buffer_write (&(channel->rb), channel->out_fd, &is_output_eagain, err); + if (UNLIKELY (ret < 0)) + return ret; + if (ret > 0) + repeat = true; + } + } + + if (epollfd >= 0) + { + size_t available = ring_buffer_get_space_available (&(channel->rb)); + size_t used = ring_buffer_get_data_available (&(channel->rb)); + int events; + + /* If there is space available in the buffer, we want to read more. */ + events = (available > 0) ? (EPOLLIN | (is_input_eagain ? EPOLLET : 0)) : 0; + if (events != channel->infd_epoll_events) + { + ret = epoll_helper_toggle (epollfd, channel->in_fd, events, err); + if (UNLIKELY (ret < 0)) + return ret; + channel->infd_epoll_events = events; + } + + /* If there is data available in the buffer, we want to write as soon as + it is possible. */ + events = (used > 0) ? (EPOLLOUT | (is_output_eagain ? EPOLLET : 0)) : 0; + if (events != channel->outfd_epoll_events) + { + ret = epoll_helper_toggle (epollfd, channel->out_fd, events, err); + if (UNLIKELY (ret < 0)) + return ret; + channel->outfd_epoll_events = events; + } + } + return 0; +} diff --git a/src/libcrun/utils.h b/src/libcrun/utils.h index 92c6a43ff..f41dd5aed 100644 --- a/src/libcrun/utils.h +++ b/src/libcrun/utils.h @@ -475,4 +475,37 @@ validate_options (unsigned int specified_options, unsigned int supported_options extern int cpuset_string_to_bitmask (const char *str, char **out, size_t *out_size, libcrun_error_t *err); +/* + * A channel_fd_pair takes care of copying data between two file descriptors. + * The two file descriptors are expected to be set to non-blocking mode. + * The channel_fd_pair will buffer data read from the input file descriptor and + * write it to the output file descriptor. If the output file descriptor is not + * ready to accept the data, the channel_fd_pair will buffer the data until it + * can be written. + */ +struct channel_fd_pair; + +struct channel_fd_pair *channel_fd_pair_new (int in_fd, int out_fd, size_t size); + +void channel_fd_pair_free (struct channel_fd_pair *channel); + +/* Process the data in the channel_fd_pair. This function will read data from + * the input file descriptor and write it to the output file descriptor. If + * the output file descriptor is not ready to accept the data, the data will be + * buffered. If epollfd is provided, the in_fd and out_fd will be registered + * and unregistered as necessary. + */ +int channel_fd_pair_process (struct channel_fd_pair *channel, int epollfd, libcrun_error_t *err); + +static inline void +cleanup_channel_fd_pairp (void *p) +{ + struct channel_fd_pair **pp = (struct channel_fd_pair **) p; + if (*pp == NULL) + return; + + channel_fd_pair_free (*pp); +} +#define cleanup_channel_fd_pair __attribute__ ((cleanup (cleanup_channel_fd_pairp))) + #endif