diff --git a/main.c b/main.c index e020640..3e74c20 100644 --- a/main.c +++ b/main.c @@ -300,6 +300,8 @@ static void usage(const char *argv0) "default=%s)\n", DEFAULT_NETNS_TYPE); printf("--userns-path=PATH specify user namespace path\n"); + printf("--create-sandbox create a new mount namespace and drop all " + "capabilities except CAP_NET_BIND_SERVICE\n"); printf("-a, --api-socket=PATH specify API socket path\n"); printf("-6, --enable-ipv6 enable IPv6 (experimental)\n"); printf("-h, --help show this help and exit\n"); @@ -329,6 +331,7 @@ struct options { char *netns_type; // argv[1] char *netns_path; // --netns-path char *userns_path; // --userns-path + bool create_sandbox; // --create-sandbox }; static void options_init(struct options *options) @@ -381,6 +384,7 @@ static void parse_args(int argc, char *const argv[], struct options *options) #define DISABLE_HOST_LOOPBACK -43 #define NETNS_TYPE -44 #define USERNS_PATH -45 +#define CREATE_SANDBOX -46 #define _DEPRECATED_NO_HOST_LOOPBACK \ -10043 // deprecated in favor of disable-host-loopback const struct option longopts[] = { @@ -395,6 +399,7 @@ static void parse_args(int argc, char *const argv[], struct options *options) { "userns-path", required_argument, NULL, USERNS_PATH }, { "api-socket", required_argument, NULL, 'a' }, { "enable-ipv6", no_argument, NULL, '6' }, + { "create-sandbox", no_argument, NULL, CREATE_SANDBOX }, { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'v' }, { 0, 0, 0, 0 }, @@ -445,6 +450,10 @@ static void parse_args(int argc, char *const argv[], struct options *options) case DISABLE_HOST_LOOPBACK: options->disable_host_loopback = true; break; + case CREATE_SANDBOX: + printf("WARNING: Support for sandboxing is experimental\n"); + options->create_sandbox = true; + break; case NETNS_TYPE: optarg_netns_type = optarg; break; @@ -493,6 +502,7 @@ static void parse_args(int argc, char *const argv[], struct options *options) #undef NETNS_TYPE #undef USERNS_PATH #undef 
_DEPRECATED_NO_HOST_LOOPBACK +#undef CREATE_SANDBOX if (argc - optind < 2) { goto error; } @@ -625,6 +635,7 @@ static int slirp4netns_config_from_options(struct slirp4netns_config *cfg, } cfg->enable_ipv6 = cfg->enable_ipv6; cfg->disable_host_loopback = opt->disable_host_loopback; + cfg->create_sandbox = opt->create_sandbox; finish: return rc; } diff --git a/slirp4netns.1.md b/slirp4netns.1.md index 528d0a5..2df6aa7 100644 --- a/slirp4netns.1.md +++ b/slirp4netns.1.md @@ -62,6 +62,11 @@ specify network namespace type ([path|pid], default=pid) **--userns-path=PATH** (since v0.4.0) specify user namespace path +**--create-sandbox** (since v0.4.0) +when running as a root (either on the host, or in a user namespace), create +a new mount namespace where only /etc and /run are mounted from the host and +all the capabilities except `CAP_NET_BIND_SERVICE` are dropped. + **-h**, **--help** show help and exit diff --git a/slirp4netns.c b/slirp4netns.c index 84229a0..8c0740f 100644 --- a/slirp4netns.c +++ b/slirp4netns.c @@ -5,6 +5,11 @@ #include #include #include +#include +#include +#include +#include +#include #include @@ -279,6 +284,158 @@ Slirp *create_slirp(void *opaque, struct slirp4netns_config *s4nn) return slirp; } +static int add_mount(const char *from, const char *to) +{ + int ret; + + ret = mount(from, to, "", + MS_BIND | MS_REC | MS_SLAVE | MS_NOSUID | MS_NODEV | MS_NOEXEC, + NULL); + if (ret < 0) { + fprintf(stderr, "cannot bind mount %s to %s\n", from, to); + return ret; + } + ret = mount(from, to, "", + MS_REMOUNT | MS_BIND | MS_RDONLY | MS_NOSUID | MS_NODEV | + MS_NOEXEC, + NULL); + if (ret < 0) { + fprintf(stderr, "cannot remount ro %s\n", to); + return ret; + } + return 0; +} + +/* lock down the process doing the following: + - create a new mount namespace + - bind mount /etc and /run from the host + - pivot_root in the new tmpfs. + - drop all capabilities except CAP_NET_BIND_SERVICE. 
+*/
+static int create_sandbox(void)
+{
+    int ret, i;
+    struct __user_cap_header_struct hdr = { _LINUX_CAPABILITY_VERSION_3, 0 };
+    struct __user_cap_data_struct data[2] = { { 0 } };
+
+    ret = unshare(CLONE_NEWNS);
+    if (ret < 0) {
+        fprintf(stderr, "cannot unshare new mount namespace\n");
+        return ret;
+    }
+    ret = mount("", "/", "", MS_PRIVATE, NULL);
+    if (ret < 0) {
+        fprintf(stderr, "cannot remount / private\n");
+        return ret;
+    }
+
+    ret = mount("tmpfs", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV | MS_NOEXEC,
+                "size=1k"); /* this tmpfs becomes the new root below */
+    if (ret < 0) {
+        fprintf(stderr, "cannot mount tmpfs on /tmp\n");
+        return ret;
+    }
+
+    ret = mkdir("/tmp/etc", 0755);
+    if (ret < 0) {
+        fprintf(stderr, "cannot mkdir /etc\n");
+        return ret;
+    }
+
+    ret = mkdir("/tmp/old", 0755); /* pivot_root target for the old root */
+    if (ret < 0) {
+        fprintf(stderr, "cannot mkdir /old\n");
+        return ret;
+    }
+
+    ret = mkdir("/tmp/run", 0755);
+    if (ret < 0) {
+        fprintf(stderr, "cannot mkdir /run\n");
+        return ret;
+    }
+
+    ret = add_mount("/etc", "/tmp/etc");
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = add_mount("/run", "/tmp/run");
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = chdir("/tmp");
+    if (ret < 0) {
+        fprintf(stderr, "cannot chdir to /tmp\n");
+        return ret;
+    }
+
+    ret = syscall(__NR_pivot_root, ".", "old");
+    if (ret < 0) {
+        fprintf(stderr, "cannot pivot_root to /tmp\n");
+        return ret;
+    }
+
+    ret = chdir("/");
+    if (ret < 0) {
+        fprintf(stderr, "cannot chdir to /\n");
+        return ret;
+    }
+
+    ret = umount2("/old", MNT_DETACH); /* detach the previous root */
+    if (ret < 0) {
+        fprintf(stderr, "cannot umount /old\n");
+        return ret;
+    }
+
+    ret = rmdir("/old");
+    if (ret < 0) {
+        fprintf(stderr, "cannot rmdir /old\n");
+        return ret;
+    }
+
+    ret = mount("tmpfs", "/", "tmpfs", MS_REMOUNT | MS_RDONLY, "size=0k");
+    if (ret < 0) {
+        fprintf(stderr, "cannot remount / read-only\n");
+        return ret;
+    }
+
+    ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+    if (ret < 0) {
+        fprintf(stderr, "prctl(PR_SET_NO_NEW_PRIVS)\n");
+        return ret;
+    }
+
+    ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
+    if (ret < 0) {
+        fprintf(stderr, "prctl(PR_CAP_AMBIENT_CLEAR_ALL)\n");
+        return ret;
+    }
+    for (i = 0;; i++) { /* no upper bound: the EINVAL check below ends it */
+        if (i == CAP_NET_BIND_SERVICE) /* keep it in the bounding set */
+            continue;
+        ret = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
+        if (ret < 0) {
+            if (errno == EINVAL) /* i is past the last known capability */
+                break;
+            fprintf(stderr, "prctl(PR_CAPBSET_DROP)\n");
+            return ret;
+        }
+    }
+
+    memset(&data, 0, sizeof(data));
+    data[0].effective |= 1 << CAP_NET_BIND_SERVICE;
+    data[0].permitted |= 1 << CAP_NET_BIND_SERVICE;
+    data[0].inheritable |= 1 << CAP_NET_BIND_SERVICE;
+    ret = capset(&hdr, data);
+    if (ret < 0) {
+        fprintf(stderr, "capset(0)\n");
+        return ret;
+    }
+
+    return 0;
+}
+
 #define ETH_BUF_SIZE (65536)
 
 int do_slirp(int tapfd, int readyfd, int exitfd, const char *api_socket,
@@ -331,6 +488,10 @@ int do_slirp(int tapfd, int readyfd, int exitfd, const char *api_socket,
         pollfds_apifd_idx = n_fds - 1;
     }
     signal(SIGPIPE, SIG_IGN);
+    if (cfg->create_sandbox && create_sandbox() < 0) {
+        fprintf(stderr, "create_sandbox failed\n");
+        goto err;
+    }
     if (readyfd >= 0) {
         int rc = -1;
         do
diff --git a/slirp4netns.h b/slirp4netns.h
index 4b47e8f..ea67f77 100644
--- a/slirp4netns.h
+++ b/slirp4netns.h
@@ -13,6 +13,7 @@ struct slirp4netns_config {
     struct in_addr recommended_vguest; // 10.0.2.100 (slirp itself is unaware of vguest)
     bool enable_ipv6;
     bool disable_host_loopback;
+    bool create_sandbox;
 };
 int do_slirp(int tapfd, int readyfd, int exitfd, const char *api_socket,
              struct slirp4netns_config *cfg);
diff --git a/tests/test-slirp4netns.sh b/tests/test-slirp4netns.sh
index ab55771..f137dbb 100755
--- a/tests/test-slirp4netns.sh
+++ b/tests/test-slirp4netns.sh
@@ -10,10 +10,23 @@ child=$!
 
 wait_for_network_namespace $child
 
-slirp4netns $child tun11 &
+nsenter --preserve-credentials -U --target=$child slirp4netns --ready-fd=3 --create-sandbox $child tun11 3>ready.file &
 slirp_pid=$!
 
-wait_for_network_device $child tun11 +# Wait until the sandbox is created +wait_for_file_content 1 ready.file +rm ready.file + +# Check that slirp4netns kept cap_net_bind_service and dropped cap_sys_admin +getpcaps $slirp_pid 2>&1 | tail -n1 > slirp.caps +grep cap_net_bind_service slirp.caps +grep -v cap_sys_admin slirp.caps +rm slirp.caps +test -e /proc/$slirp_pid/root/etc +test -e /proc/$slirp_pid/root/run +test \! -e /proc/$slirp_pid/root/home +test \! -e /proc/$slirp_pid/root/root +test \! -e /proc/$slirp_pid/root/var function cleanup { kill -9 $child $slirp_pid