From 70ecb8cdb60c7ee44df6e317fc355063ec25db4e Mon Sep 17 00:00:00 2001 From: Joseph Gooch Date: Wed, 24 Jun 2020 13:46:16 +0000 Subject: [PATCH] Refactor I/O and add SD_NOTIFY proxy support Refactored all the conn_sock functionality to be more generic. It can deal with different types of sockets, stream vs dgram, and reuses all the same callbacks, shutdown and async functionality. Conmon creates a notify socket which podman bind-mounts into the container, and passes in via the spec's environment variables. Conmon relays the READY=1 signal. This is similar to what runc and crun do, but doing it in conmon and NOT passing NOTIFY_SOCKET to the OCI runtime allows us to start up properly without runc and crun blocking on the "start" command. It would also be trivial to add more proxied sockets, i.e. the /dev/log proof of concept I did would now be super easy, if we wanted to revisit that. Signed-off-by: Joseph Gooch --- src/conmon.c | 9 ++ src/conn_sock.c | 282 ++++++++++++++++++++++++++++++++++++++---------- src/conn_sock.h | 37 ++++++- src/ctr_stdio.c | 4 +- 4 files changed, 270 insertions(+), 62 deletions(-) diff --git a/src/conmon.c b/src/conmon.c index 65ac45d7..0c611717 100644 --- a/src/conmon.c +++ b/src/conmon.c @@ -84,6 +84,15 @@ int main(int argc, char *argv[]) exit(0); } + if (getenv("NOTIFY_SOCKET") != NULL) { + setup_notify_socket(getenv("NOTIFY_SOCKET")); + int r = unsetenv("NOTIFY_SOCKET"); + if (r < 0) { + nwarnf("Cannot unset NOTIFY_SOCKET %d", r) + } + } + + /* Environment variables */ sync_pipe_fd = get_pipe_fd_from_env("_OCI_SYNCPIPE"); diff --git a/src/conn_sock.c b/src/conn_sock.c index 3a09eba6..3a0800b3 100644 --- a/src/conn_sock.c +++ b/src/conn_sock.c @@ -13,14 +13,46 @@ #include #include -static gboolean attach_cb(int fd, G_GNUC_UNUSED GIOCondition condition, G_GNUC_UNUSED gpointer user_data); -static gboolean conn_sock_cb(int fd, GIOCondition condition, gpointer user_data); -static gboolean read_conn_sock(struct conn_sock_s *sock); -static gboolean terminate_conn_sock(struct conn_sock_s *sock); -void conn_sock_shutdown(struct conn_sock_s *sock, int how); -static void sock_try_write_to_masterfd_stdin(struct conn_sock_s *sock); -static gboolean masterfd_write_cb(G_GNUC_UNUSED int fd, G_GNUC_UNUSED GIOCondition condition, G_GNUC_UNUSED gpointer user_data); - +static gboolean attach_cb(int fd, G_GNUC_UNUSED GIOCondition condition, gpointer user_data); +static gboolean rmt_sock_cb(int fd, GIOCondition condition, gpointer user_data); +static void init_rmt_sock(struct rmt_sock_s *sock, struct rmt_sock_s *src); +static gboolean read_rmt_sock(struct rmt_sock_s *sock); +static gboolean terminate_rmt_sock(struct rmt_sock_s *sock); +void rmt_sock_shutdown(struct rmt_sock_s *sock, int how); +static void schedule_lcl_sock_write(struct lcl_sock_s *lcl_sock); +static void sock_try_write_to_lcl_sock(struct rmt_sock_s *sock); +static gboolean lcl_sock_write_cb(G_GNUC_UNUSED int fd, G_GNUC_UNUSED GIOCondition condition, G_GNUC_UNUSED gpointer user_data); + +static struct lcl_sock_s lcl_masterfd_stdin = {&masterfd_stdin, true, NULL, "container stdin", NULL}; +struct rmt_sock_s rmt_attach_sock = { + SOCK_TYPE_CONSOLE, /* sock_type */ + -1, /* fd */ + &lcl_masterfd_stdin, /* dest */ + true, /* listening */ + false, /* data_ready */ + true, /* readable */ + true, /* writable */ + 0, /* remaining */ + 0, /* off */ + {0} /* buf */ +}; +static int lcl_notify_host_fd = -1; +static struct sockaddr_un lcl_notify_host_addr = {0}; +static struct lcl_sock_s lcl_notify_host = {&lcl_notify_host_fd, false, NULL, "host notify socket", &lcl_notify_host_addr}; +struct rmt_sock_s rmt_notify_sock = { + SOCK_TYPE_NOTIFY, /* sock_type */ + -1, /* fd */ + &lcl_notify_host, /* dest */ + false, /* listening */ + false, /* data_ready */ + true, /* readable */ + false, /* writable */ + 0, /* remaining */ + 0, /* off */ + {0} /* buf */ +}; + +/* External */ char *setup_console_socket(void) { struct sockaddr_un addr = {0}; @@ -99,6 +131,8 @@ char *setup_attach_socket(void) if (attach_socket_fd == -1) pexit("Failed to create attach socket"); + rmt_attach_sock.fd = attach_socket_fd; + if (fchmod(attach_socket_fd, 0700)) pexit("Failed to change attach socket permissions"); @@ -111,51 +145,127 @@ char *setup_attach_socket(void) if (listen(attach_socket_fd, 10) == -1) pexitf("Failed to listen on attach socket: %s", attach_sock_path); - g_unix_fd_add(attach_socket_fd, G_IO_IN, attach_cb, NULL); + g_unix_fd_add(attach_socket_fd, G_IO_IN, attach_cb, &rmt_attach_sock); return attach_symlink_dir_path; } -static gboolean attach_cb(int fd, G_GNUC_UNUSED GIOCondition condition, G_GNUC_UNUSED gpointer user_data) +void setup_notify_socket(char *socket_path) { - int conn_fd = accept(fd, NULL, NULL); - if (conn_fd == -1) { + int notify_socket_fd = -1; + struct sockaddr_un notify_addr = {0}; + notify_addr.sun_family = AF_UNIX; + + /* Connect to Host socket */ + if (lcl_notify_host_fd < 0) { + lcl_notify_host_fd = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); + if (lcl_notify_host_fd == -1) { + pexit("Failed to create notify socket"); + } + lcl_notify_host_addr.sun_family = AF_UNIX; + strncpy(lcl_notify_host_addr.sun_path, socket_path, sizeof(lcl_notify_host_addr.sun_path) - 1); + } + + /* + * Create a symlink so we don't exceed unix domain socket + * path length limit. + */ + _cleanup_free_ char *notify_symlink_dir_path = g_build_filename(opt_socket_path, opt_cuuid, NULL); + if (unlink(notify_symlink_dir_path) == -1 && errno != ENOENT) + pexit("Failed to remove existing symlink for notify socket directory"); + + /* + * This is to address a corner case where the symlink path length can end up being + * the same as the socket. When it happens, the symlink prevents the socket from being + * be created. This could still be a problem with other containers, but it is safe + * to assume the CUUIDs don't change length in the same directory. As a workaround, + * in such case, make the symlink one char shorter. + */ + if (strlen(notify_symlink_dir_path) == (sizeof(notify_addr.sun_path) - 1)) + notify_symlink_dir_path[sizeof(notify_addr.sun_path) - 2] = '\0'; + + if (symlink(opt_bundle_path, notify_symlink_dir_path) == -1) + pexit("Failed to create symlink for notify socket"); + + _cleanup_free_ char *notify_sock_path = g_build_filename(opt_socket_path, opt_cuuid, "notify/notify.sock", NULL); + ninfof("notify sock path: %s", notify_sock_path); + + strncpy(notify_addr.sun_path, notify_sock_path, sizeof(notify_addr.sun_path) - 1); + ninfof("addr{sun_family=AF_UNIX, sun_path=%s}", notify_addr.sun_path); + + /* + * We make the socket non-blocking to avoid a race where client aborts connection + * before the server gets a chance to call accept. In that scenario, the server + * accept blocks till a new client connection comes in. + */ + notify_socket_fd = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0); + if (notify_socket_fd == -1) + pexit("Failed to create notify socket"); + + rmt_notify_sock.fd = notify_socket_fd; + + if (unlink(notify_addr.sun_path) == -1 && errno != ENOENT) + pexitf("Failed to remove existing notify socket: %s", notify_addr.sun_path); + + if (bind(notify_socket_fd, (struct sockaddr *)¬ify_addr, sizeof(struct sockaddr_un)) == -1) + pexitf("Failed to bind notify socket: %s", notify_sock_path); + + if (fchmod(notify_socket_fd, 0777)) + pexit("Failed to change notify socket permissions"); + + g_unix_fd_add(notify_socket_fd, G_IO_IN | G_IO_HUP | G_IO_ERR, rmt_sock_cb, &rmt_notify_sock); +} + + +void schedule_master_stdin_write() +{ + schedule_lcl_sock_write(&lcl_masterfd_stdin); +} + +/* Internal */ +static gboolean attach_cb(int fd, G_GNUC_UNUSED GIOCondition condition, gpointer user_data) +{ + struct rmt_sock_s *srcsock = (struct rmt_sock_s *)user_data; + int new_fd = accept(fd, NULL, NULL); + if (new_fd == -1) { if (errno != EWOULDBLOCK) nwarn("Failed to accept client connection on attach socket"); } else { - struct conn_sock_s *conn_sock; - if (conn_socks == NULL) { - conn_socks = g_ptr_array_new_with_free_func(free); + struct rmt_sock_s *rmt_sock; + if (srcsock->dest->readers == NULL) { + srcsock->dest->readers = g_ptr_array_new_with_free_func(free); } - conn_sock = malloc(sizeof(*conn_sock)); - if (conn_sock == NULL) { + rmt_sock = malloc(sizeof(*rmt_sock)); + if (rmt_sock == NULL) { pexit("Failed to allocate memory"); } - conn_sock->fd = conn_fd; - conn_sock->readable = true; - conn_sock->writable = true; - conn_sock->off = 0; - conn_sock->remaining = 0; - conn_sock->data_ready = false; - g_unix_fd_add(conn_sock->fd, G_IO_IN | G_IO_HUP | G_IO_ERR, conn_sock_cb, conn_sock); - g_ptr_array_add(conn_socks, conn_sock); - ninfof("Accepted connection %d", conn_sock->fd); + init_rmt_sock(rmt_sock, srcsock); + rmt_sock->fd = new_fd; + g_unix_fd_add(rmt_sock->fd, G_IO_IN | G_IO_HUP | G_IO_ERR, rmt_sock_cb, rmt_sock); + g_ptr_array_add(rmt_sock->dest->readers, rmt_sock); + if (SOCK_IS_CONSOLE(srcsock->sock_type)) { + if (conn_socks == NULL) { + conn_socks = g_ptr_array_new(); + } + g_ptr_array_add(conn_socks, rmt_sock); + } + ninfof("Accepted%s connection %d", SOCK_IS_CONSOLE(srcsock->sock_type) ? " console" : "", rmt_sock->fd); } return G_SOURCE_CONTINUE; } -static gboolean conn_sock_cb(G_GNUC_UNUSED int fd, GIOCondition condition, gpointer user_data) +static gboolean rmt_sock_cb(G_GNUC_UNUSED int fd, GIOCondition condition, gpointer user_data) { - struct conn_sock_s *sock = (struct conn_sock_s *)user_data; + struct rmt_sock_s *sock = (struct rmt_sock_s *)user_data; if (condition & G_IO_IN) - return read_conn_sock(sock); + return read_rmt_sock(sock); - return terminate_conn_sock(sock); + return terminate_rmt_sock(sock); } -static gboolean read_conn_sock(struct conn_sock_s *sock) +static gboolean read_rmt_sock(struct rmt_sock_s *sock) { ssize_t num_read; @@ -165,40 +275,63 @@ static gboolean read_conn_sock(struct conn_sock_s *sock) return G_SOURCE_REMOVE; } - num_read = read(sock->fd, sock->buf, CONN_SOCK_BUF_SIZE); + if (SOCK_IS_STREAM(sock->sock_type)) { + num_read = read(sock->fd, sock->buf, CONN_SOCK_BUF_SIZE); + } else { + num_read = recvfrom(sock->fd, sock->buf, CONN_SOCK_BUF_SIZE - 1, 0, NULL, NULL); + } + if (num_read < 0) return G_SOURCE_CONTINUE; if (num_read == 0) - return terminate_conn_sock(sock); + return terminate_rmt_sock(sock); /* num_read > 0 */ sock->remaining = num_read; sock->off = 0; - sock_try_write_to_masterfd_stdin(sock); + if (SOCK_IS_NOTIFY(sock->sock_type)) { + /* Do what OCI runtime does - only pass READY=1 */ + sock->buf[num_read] = '\0'; + if (strstr(sock->buf, "READY=1")) { + strncpy(sock->buf, "READY=1", 8); + sock->remaining = 7; + } else { + sock->remaining = 0; + } + } + + if (sock->remaining) + sock_try_write_to_lcl_sock(sock); - /* Not everything was written to stdin, let's wait for the fd to be ready. */ + /* Not everything was written, let's wait for the fd to be ready. */ if (sock->remaining) - schedule_master_stdin_write(); + schedule_lcl_sock_write(sock->dest); return G_SOURCE_CONTINUE; } -static gboolean terminate_conn_sock(struct conn_sock_s *sock) +static gboolean terminate_rmt_sock(struct rmt_sock_s *sock) { - conn_sock_shutdown(sock, SHUT_RD); - if (masterfd_stdin >= 0 && opt_stdin) { - if (!opt_leave_stdin_open) { - close(masterfd_stdin); - masterfd_stdin = -1; + rmt_sock_shutdown(sock, SHUT_RD); + if (SOCK_IS_CONSOLE(sock->sock_type)) { + if (conn_socks == NULL || conn_socks->len == 0) { + if (*(lcl_masterfd_stdin.fd) >= 0 && opt_stdin) { + if (!opt_leave_stdin_open) { + close(*(lcl_masterfd_stdin.fd)); + *(lcl_masterfd_stdin.fd) = -1; + } else { + ninfo("Not closing input"); + } + } } else { - ninfo("Not closing input"); + ninfo("Not closing input - still have open sockets"); } } return G_SOURCE_REMOVE; } -void conn_sock_shutdown(struct conn_sock_s *sock, int how) +void rmt_sock_shutdown(struct rmt_sock_s *sock, int how) { if (sock->fd == -1) return; @@ -216,55 +349,88 @@ void conn_sock_shutdown(struct conn_sock_s *sock, int how) break; } if (!sock->writable && !sock->readable) { + ndebugf("Closing %d", sock->fd); close(sock->fd); sock->fd = -1; - g_ptr_array_remove(conn_socks, sock); + /* conn_socks doesn't free, but readers will */ + if (SOCK_IS_CONSOLE(sock->sock_type) && conn_socks != NULL) { + g_ptr_array_remove(conn_socks, sock); + } + if (sock->dest->readers != NULL) { + g_ptr_array_remove(sock->dest->readers, sock); + } } } -static void write_to_masterfd_stdin(gpointer data, gpointer user_data) +static void write_to_lcl_sock(gpointer data, gpointer user_data) { - struct conn_sock_s *sock = (struct conn_sock_s *)data; + struct rmt_sock_s *sock = (struct rmt_sock_s *)data; bool *has_data = user_data; - sock_try_write_to_masterfd_stdin(sock); + sock_try_write_to_lcl_sock(sock); if (sock->remaining) *has_data = true; else if (sock->data_ready) { sock->data_ready = false; - g_unix_fd_add(sock->fd, G_IO_IN | G_IO_HUP | G_IO_ERR, conn_sock_cb, sock); + g_unix_fd_add(sock->fd, G_IO_IN | G_IO_HUP | G_IO_ERR, rmt_sock_cb, sock); } } -static void sock_try_write_to_masterfd_stdin(struct conn_sock_s *sock) +static void sock_try_write_to_lcl_sock(struct rmt_sock_s *sock) { - if (!sock->remaining || masterfd_stdin < 0) + struct lcl_sock_s *lcl_sock = sock->dest; + ssize_t w = 0; + + if (!sock->remaining || *(lcl_sock->fd) < 0) return; - ssize_t w = write(masterfd_stdin, sock->buf + sock->off, sock->remaining); + if (lcl_sock->is_stream) { + w = write(*(lcl_sock->fd), sock->buf + sock->off, sock->remaining); + } else { + w = sendto(*(lcl_sock->fd), sock->buf + sock->off, sock->remaining, MSG_DONTWAIT | MSG_NOSIGNAL, + (struct sockaddr *)lcl_sock->addr, sizeof(*(lcl_sock->addr))); + } if (w < 0) { - nwarn("Failed to write to container stdin"); + nwarnf("Failed to write %s", lcl_sock->label); } else { sock->off += w; sock->remaining -= w; } } -static gboolean masterfd_write_cb(G_GNUC_UNUSED int fd, G_GNUC_UNUSED GIOCondition condition, G_GNUC_UNUSED gpointer user_data) +static gboolean lcl_sock_write_cb(G_GNUC_UNUSED int fd, G_GNUC_UNUSED GIOCondition condition, gpointer user_data) { + struct lcl_sock_s *lcl_sock = (struct lcl_sock_s *)user_data; bool has_data = FALSE; - if (masterfd_stdin < 0) + if (*(lcl_sock->fd) < 0) return G_SOURCE_REMOVE; - g_ptr_array_foreach(conn_socks, write_to_masterfd_stdin, &has_data); + g_ptr_array_foreach(lcl_sock->readers, write_to_lcl_sock, &has_data); if (has_data) return G_SOURCE_CONTINUE; return G_SOURCE_REMOVE; } -void schedule_master_stdin_write() +static void schedule_lcl_sock_write(struct lcl_sock_s *lcl_sock) { - g_unix_fd_add(masterfd_stdin, G_IO_OUT, masterfd_write_cb, NULL); + if (*(lcl_sock->fd) < 0) + return; + + g_unix_fd_add(*(lcl_sock->fd), G_IO_OUT, lcl_sock_write_cb, lcl_sock); +} + +static void init_rmt_sock(struct rmt_sock_s *sock, struct rmt_sock_s *src) +{ + sock->off = 0; + sock->remaining = 0; + sock->data_ready = false; + sock->listening = false; + if (src) { + sock->readable = src->readable; + sock->writable = src->writable; + sock->dest = src->dest; + sock->sock_type = src->sock_type; + } } diff --git a/src/conn_sock.h b/src/conn_sock.h index 24286f78..5baaa3fb 100644 --- a/src/conn_sock.h +++ b/src/conn_sock.h @@ -4,9 +4,33 @@ #include /* gboolean */ #include "config.h" /* CONN_SOCK_BUF_SIZE */ +#define SOCK_TYPE_CONSOLE 1 +#define SOCK_TYPE_NOTIFY 2 +#define SOCK_IS_CONSOLE(sock_type) ((sock_type) == SOCK_TYPE_CONSOLE) +#define SOCK_IS_NOTIFY(sock_type) ((sock_type) == SOCK_TYPE_NOTIFY) +#define SOCK_IS_STREAM(sock_type) ((sock_type) == SOCK_TYPE_CONSOLE) +#define SOCK_IS_DGRAM(sock_type) ((sock_type) != SOCK_TYPE_CONSOLE) + /* Used for attach */ -struct conn_sock_s { +/* I'm not sure on the nomenclature here. + in_sock and out_sock doesn't seem right, because ctr_stdio + breaks encapsulation and writes directly to the console + sockets. + In most cases in conn_sock.c, this struct is "INPUT" and + the next one is "OUTPUT". Really it's this struct is "one" + and the next is "many", but I don't want the same fd in + two different sockets. + + rmt seems to indicate "Remote User" i.e. attached console, or + "container's /dev/log" or "container's /run/notify" + lcl seems to incidate "A socket we own, locally" i.e. masterfd_stdin + or "host /dev/log" or "host /run/systemd/notify" + */ +struct rmt_sock_s { + int sock_type; int fd; + struct lcl_sock_s *dest; + gboolean listening; gboolean data_ready; gboolean readable; gboolean writable; @@ -15,9 +39,18 @@ struct conn_sock_s { char buf[CONN_SOCK_BUF_SIZE]; }; +struct lcl_sock_s { + int *fd; + gboolean is_stream; + GPtrArray *readers; + char *label; + struct sockaddr_un *addr; +}; + char *setup_console_socket(void); char *setup_attach_socket(void); -void conn_sock_shutdown(struct conn_sock_s *sock, int how); +void setup_notify_socket(char *); +void rmt_sock_shutdown(struct rmt_sock_s *sock, int how); void schedule_master_stdin_write(); #endif // CONN_SOCK_H diff --git a/src/ctr_stdio.c b/src/ctr_stdio.c index 67fac5f0..a5e3f4ab 100644 --- a/src/ctr_stdio.c +++ b/src/ctr_stdio.c @@ -134,11 +134,11 @@ static bool read_stdio(int fd, stdpipe_t pipe, gboolean *eof) real_buf[0] = pipe; for (i = conn_socks->len; i > 0; i--) { - struct conn_sock_s *conn_sock = g_ptr_array_index(conn_socks, i - 1); + struct rmt_sock_s *conn_sock = g_ptr_array_index(conn_socks, i - 1); if (conn_sock->writable && write_all(conn_sock->fd, real_buf, num_read + 1) < 0) { nwarn("Failed to write to socket"); - conn_sock_shutdown(conn_sock, SHUT_WR); + rmt_sock_shutdown(conn_sock, SHUT_WR); } } return true;