Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow CRIU to support restoring into an existing PID namespace #1056

Merged
merged 5 commits into from
Jul 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Documentation/criu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,12 @@ In other words, do not use it unless really needed.
Tell *criu* that one end of a pair of UNIX sockets (created by
*socketpair*(2)) with the given _id_ is OK to be disconnected.

*--external* **pid[**__inode__**]:**__name__::
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
Mark a PID namespace as external. This can later be used to restore
a process into an existing PID namespace. The label 'name' can be
used to assign another PID namespace during restore with the help
of *--inherit-fd*.

*--freeze-cgroup*::
Use cgroup freezer to collect processes.

Expand Down Expand Up @@ -397,7 +403,7 @@ Restores previously checkpointed processes.
Inherit a file descriptor. This option lets *criu* use an already opened
file descriptor 'N' for restoring a file identified by 'resource'.
This option can be used to restore an external resource dumped
with the help of *--external* *file*, *tty*, and *unix* options.
with the help of *--external* *file*, *tty*, *pid* and *unix* options.
+
The 'resource' argument can be one of the following:
+
Expand Down
124 changes: 101 additions & 23 deletions criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -1335,9 +1335,32 @@ static bool needs_prep_creds(struct pstree_item *item)
return (!item->parent && ((root_ns_mask & CLONE_NEWUSER) || getuid()));
}

static int write_ns_last_pid(pid_t pid)
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
{
char buf[32];
int len;
int fd;

fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
if (fd < 0)
return -1;

len = snprintf(buf, sizeof(buf), "%d", pid - 1);
if (write(fd, buf, len) != len) {
pr_perror("%d: Write %s to %s", pid, buf,
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
LAST_PID_PATH);
close(fd);
return -1;
}
close(fd);
return 0;
}

static inline int fork_with_pid(struct pstree_item *item)
{
unsigned long clone_flags;
struct cr_clone_arg ca;
struct ns_id *pid_ns = NULL;
int ret = -1;
pid_t pid = vpid(item);

Expand Down Expand Up @@ -1385,36 +1408,91 @@ static inline int fork_with_pid(struct pstree_item *item)

pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags);

if (!(ca.clone_flags & CLONE_NEWPID)) {
char buf[32];
int len;
int fd = -1;
if (ca.item->ids)
pid_ns = lookup_ns_by_id(ca.item->ids->pid_ns_id, &pid_ns_desc);

if (!kdat.has_clone3_set_tid) {
fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
if (fd < 0)
goto err;
clone_flags = ca.clone_flags;
if (pid_ns && pid_ns->ext_key) {
int fd;

/* Not possible to restore into an empty PID namespace. */
BUG_ON(pid == INIT_PID);
adrianreber marked this conversation as resolved.
Show resolved Hide resolved

/*
* Restoring into an existing namespace means that CLONE_NEWPID
* needs to be removed during clone() as the process will be
* created in the correct PID namespace thanks to switch_ns_by_fd().
*/
clone_flags &= ~CLONE_NEWPID;

fd = inherit_fd_lookup_id(pid_ns->ext_key);
if (fd < 0) {
pr_err("Unable to find an external pidns: %s\n", pid_ns->ext_key);
return -1;
}

ret = switch_ns_by_fd(fd, &pid_ns_desc, NULL);
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
close(fd);

/*
* If a process without a PID namespace is restored into
* a PID namespace this tells CRIU to still handle the
* process as if using CLONE_NEWPID.
*/
root_ns_mask |= CLONE_NEWPID;
rsti(item)->clone_flags |= CLONE_NEWPID;
}

if (!(clone_flags & CLONE_NEWPID)) {
pid_t helper_pid = -1;

lock_last_pid();

if (!kdat.has_clone3_set_tid) {
len = snprintf(buf, sizeof(buf), "%d", pid - 1);
if (write(fd, buf, len) != len) {
pr_perror("%d: Write %s to %s", pid, buf,
LAST_PID_PATH);
close(fd);
goto err_unlock;
if (pid_ns && pid_ns->ext_key) {
/*
* Restoring into another namespace requires a helper
* to write to LAST_PID_PATH. Using clone3() this is
* so much easier and simpler. As long as CRIU supports
* clone() this is needed.
*/
helper_pid = fork();
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
if (helper_pid < 0) {
pr_perror("Cannot fork ns_last_pid writer");
goto err_unlock;
}
}
if (helper_pid <= 0) {
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
ret = write_ns_last_pid(pid);
if (ret == -1 && helper_pid == -1)
goto err_unlock;
if (helper_pid == 0)
exit(ret);
}
if (helper_pid > 0) {
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
/* We forked and this is the parent. */
int status;

ret = waitpid(-1, &status, 0);
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
if (ret < 0) {
pr_perror("Cannot wait for ns_last_pid writer");
goto err_unlock;
}
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
pr_err("Writing ns_last_pid failed with error %d\n", status);
ret = -1;
goto err_unlock;
}
}
close(fd);
}
} else {
BUG_ON(pid != INIT_PID);
if (!(pid_ns && pid_ns->ext_key))
BUG_ON(pid != INIT_PID);
adrianreber marked this conversation as resolved.
Show resolved Hide resolved
}

if (kdat.has_clone3_set_tid) {
ret = clone3_with_pid_noasan(restore_task_with_children,
&ca, (ca.clone_flags &
&ca, (clone_flags &
~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)),
SIGCHLD, pid);
} else {
Expand All @@ -1432,7 +1510,7 @@ static inline int fork_with_pid(struct pstree_item *item)
*/
close_pid_proc();
ret = clone_noasan(restore_task_with_children,
(ca.clone_flags &
(clone_flags &
~(CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)) | SIGCHLD,
&ca);
}
Expand All @@ -1452,9 +1530,9 @@ static inline int fork_with_pid(struct pstree_item *item)
}

err_unlock:
if (!(ca.clone_flags & CLONE_NEWPID))
if (!(clone_flags & CLONE_NEWPID))
unlock_last_pid();
err:

if (ca.core)
core_entry__free_unpacked(ca.core, NULL);
return ret;
Expand Down Expand Up @@ -2149,6 +2227,9 @@ static int restore_root_task(struct pstree_item *init)
* this later.
*/

if (prepare_namespace_before_tasks())
return -1;

if (vpid(init) == INIT_PID) {
if (!(root_ns_mask & CLONE_NEWPID)) {
pr_err("This process tree can only be restored "
Expand All @@ -2165,9 +2246,6 @@ static int restore_root_task(struct pstree_item *init)
if (prepare_userns_hook())
return -1;
adrianreber marked this conversation as resolved.
Show resolved Hide resolved

if (prepare_namespace_before_tasks())
return -1;

__restore_switch_stage_nw(CR_STATE_ROOT_TASK);

ret = fork_with_pid(init);
Expand Down
1 change: 1 addition & 0 deletions criu/image-desc.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
FD_ENTRY(NETNF_EXP, "netns-exp-%u"),
FD_ENTRY(FILES, "files"),
FD_ENTRY(TIMENS, "timens-%u"),
FD_ENTRY(PIDNS, "pidns-%u"),

[CR_FD_STATS] = {
.fmt = "stats-%s",
Expand Down
1 change: 1 addition & 0 deletions criu/include/image-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ enum {
CR_FD_MNTS,
CR_FD_USERNS,
CR_FD_TIMENS,
CR_FD_PIDNS,

_CR_FD_IPCNS_FROM,
CR_FD_IPC_VAR,
Expand Down
1 change: 1 addition & 0 deletions criu/include/magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
#define FILES_MAGIC 0x56303138 /* Toropets */
#define MEMFD_INODE_MAGIC 0x48453499 /* Dnipro */
#define TIMENS_MAGIC 0x43114433 /* Beslan */
#define PIDNS_MAGIC 0x12345678
adrianreber marked this conversation as resolved.
Show resolved Hide resolved

#define IFADDR_MAGIC RAW_IMAGE_MAGIC
#define ROUTE_MAGIC RAW_IMAGE_MAGIC
Expand Down
1 change: 1 addition & 0 deletions criu/include/protobuf-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ enum {
PB_MEMFD_FILE,
PB_MEMFD_INODE, /* 60 */
PB_TIMENS,
PB_PIDNS,
PB_REMOTE_IMAGE, /* Header for images sent from proxy to cache.*/
PB_LOCAL_IMAGE, /* Header for reading/writing images from/to proxy or cache. */
PB_LOCAL_IMAGE_REPLY, /* Header for reading/writing images reply. */
Expand Down
108 changes: 106 additions & 2 deletions criu/namespaces.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "util.h"
#include "images/ns.pb-c.h"
#include "images/userns.pb-c.h"
#include "images/pidns.pb-c.h"

static struct ns_desc *ns_desc_array[] = {
&net_ns_desc,
Expand All @@ -46,6 +47,8 @@ static struct ns_desc *ns_desc_array[] = {

static unsigned int join_ns_flags;

static int collect_pid_namespaces(bool);

int check_namespace_opts(void)
{
errno = EINVAL;
Expand Down Expand Up @@ -1078,8 +1081,24 @@ int dump_namespaces(struct pstree_item *item, unsigned int ns_flags)
pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->ns[0].virt, ns_pid->real);

if ((ns_flags & CLONE_NEWPID) && ns_pid->ns[0].virt != 1) {
pr_err("Can't dump a pid namespace without the process init\n");
return -1;
char *val = NULL;
for (ns = ns_ids; ns; ns = ns->next) {
if (ns->nd->cflag == CLONE_NEWPID) {
char id[64];
snprintf(id, sizeof(id), "pid[%u]", ns->kid);
val = external_lookup_by_key(id);
if (IS_ERR_OR_NULL(val)) {
val = NULL;
continue;
}
if (val)
break;
}
}
if (!val) {
pr_err("Can't dump a pid namespace without the process init\n");
return -1;
}
}

for (ns = ns_ids; ns; ns = ns->next) {
Expand Down Expand Up @@ -1563,6 +1582,10 @@ int collect_namespaces(bool for_dump)
if (ret < 0)
return ret;

ret = collect_pid_namespaces(for_dump);
if (ret < 0)
return ret;

return 0;
}

Expand Down Expand Up @@ -1754,6 +1777,44 @@ int prepare_namespace(struct pstree_item *item, unsigned long clone_flags)
return ret;
}

static int read_pid_ns_img(void)
{
struct ns_id *ns;
PidnsEntry *e;

for (ns = ns_ids; ns != NULL; ns = ns->next) {
struct cr_img *img;
int ret;

if (ns->nd != &pid_ns_desc)
continue;

img = open_image(CR_FD_PIDNS, O_RSTR, ns->id);
if (!img)
return -1;

ret = pb_read_one_eof(img, &e, PB_PIDNS);
close_image(img);
if (ret < 0) {
pr_err("Can not read pidns object\n");
return -1;
}
if (ret > 0) {
ns->ext_key = e->ext_key;
/*
* Restoring into an existing PID namespace. This disables
* the check to require a PID 1 when restoring a process
* which used to be in a PID namespace.
* To keep the PID namespace code paths enabled this bit
* will be set after having clone()ed the process.
*/
root_ns_mask &= ~CLONE_NEWPID;
}
}

return 0;
}

int prepare_namespace_before_tasks(void)
{
if (start_usernsd())
Expand All @@ -1771,6 +1832,9 @@ int prepare_namespace_before_tasks(void)
if (read_net_ns_img())
goto err_img;

if (read_pid_ns_img())
goto err_img;

return 0;

err_img:
Expand All @@ -1788,3 +1852,43 @@ int prepare_namespace_before_tasks(void)

/*
 * Namespace descriptors for the PID and user namespaces, each built by
 * NS_DESC_ENTRY() from the matching CLONE_NEW* flag and a short name.
 */
struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");

/*
 * walk_namespaces() callback used while dumping: if the user marked
 * this PID namespace as external, remember the label in ns->ext_key and
 * write it to the namespace's CR_FD_PIDNS image.
 *
 * @ns:   PID namespace being visited.
 * @oarg: unused.
 *
 * Returns 0 when the namespace is not external (no image is written),
 * -1 when the image cannot be opened, otherwise the result of
 * pb_write_one().
 */
static int collect_pid_ns(struct ns_id *ns, void *oarg)
{
	PidnsEntry e = PIDNS_ENTRY__INIT;
	struct cr_img *img;
	int ret;
	char id[64], *val;

	pr_info("Collecting pidns %d/%d\n", ns->id, ns->ns_pid);

	snprintf(id, sizeof(id), "pid[%u]", ns->kid);
	val = external_lookup_by_key(id);
	/*
	 * Treat both an error pointer and a NULL result as "not external",
	 * the same way dump_namespaces() does. A NULL label would otherwise
	 * produce a pidns image without ext_key, which restore cannot map
	 * to an inherited fd.
	 */
	if (IS_ERR_OR_NULL(val))
		return 0;

	/*
	 * Only if the user marked the PID namespace as external
	 * via --external pid[<inode>]:<label> the pidns
	 * image is written.
	 */

	pr_debug("The %s pidns is external\n", id);
	ns->ext_key = e.ext_key = val;

	img = open_image(CR_FD_PIDNS, O_DUMP, ns->id);
	if (!img)
		return -1;
	ret = pb_write_one(img, &e, PB_PIDNS);
	close_image(img);

	return ret;
}

/*
 * Run collect_pid_ns() over the PID namespaces via walk_namespaces(),
 * but only when dumping; for any other caller this does nothing and
 * returns 0.
 */
static int collect_pid_namespaces(bool for_dump)
{
	return for_dump ? walk_namespaces(&pid_ns_desc, collect_pid_ns, NULL) : 0;
}
1 change: 1 addition & 0 deletions criu/protobuf-desc.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "images/timer.pb-c.h"
#include "images/utsns.pb-c.h"
#include "images/timens.pb-c.h"
#include "images/pidns.pb-c.h"
#include "images/ipc-var.pb-c.h"
#include "images/ipc-shm.pb-c.h"
#include "images/ipc-msg.pb-c.h"
Expand Down
Loading