-
Notifications
You must be signed in to change notification settings - Fork 321
/
linux.c
5350 lines (4451 loc) · 158 KB
/
linux.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* crun - OCI runtime written in C
*
* Copyright (C) 2017, 2018, 2019 Giuseppe Scrivano <[email protected]>
* crun is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* crun is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with crun. If not, see <http://www.gnu.org/licenses/>.
*/
#define _GNU_SOURCE
#include <config.h>
#include "linux.h"
#include "utils.h"
#include <string.h>
#include <sched.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mount.h>
#ifdef HAVE_FSCONFIG_CMD_CREATE_LINUX_MOUNT_H
# include <linux/mount.h>
#endif
#if defined HAVE_FSCONFIG_CMD_CREATE_LINUX_MOUNT_H || defined HAVE_FSCONFIG_CMD_CREATE_SYS_MOUNT_H
# define HAVE_NEW_MOUNT_API
#endif
#include <sys/prctl.h>
#ifdef HAVE_CAP
# include <sys/capability.h>
#endif
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <grp.h>
#include <signal.h>
#include "terminal.h"
#include "cgroup.h"
#include "cgroup-utils.h"
#include "status.h"
#include "criu.h"
#include "scheduler.h"
#include <sys/socket.h>
#include <libgen.h>
#include <sys/wait.h>
#include <sys/vfs.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/personality.h>
#include <net/if.h>
#include <sys/xattr.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sched.h>
#include <yajl/yajl_tree.h>
#include <yajl/yajl_gen.h>
#include "mount_flags.h"
/* Cast a C string to `const unsigned char *`.  Presumably meant for the
   yajl API included above -- confirm against the call sites.  */
#define YAJL_STR(x) ((const unsigned char *) (x))
/* Fallback definitions: every constant below is defined here only when
   the system headers included above did not already provide it.  */
#ifndef RLIMIT_RTTIME
# define RLIMIT_RTTIME 15
#endif
#ifndef OPEN_TREE_CLONE
# define OPEN_TREE_CLONE 1
#endif
#ifndef OPEN_TREE_CLOEXEC
# define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif
#ifndef MOVE_MOUNT_F_EMPTY_PATH
# define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
#endif
#ifndef MOVE_MOUNT_T_EMPTY_PATH
# define MOVE_MOUNT_T_EMPTY_PATH 0x00000040
#endif
/* One deferred remount request.  Entries are chained through `next`,
   created by make_remount() and released by free_remount().  */
struct remount_s
{
/* Next request in the list, or NULL for the last one.  */
struct remount_s *next;
/* Heap copy of the target path (xstrdup'ed by make_remount()).  */
char *target;
/* Descriptor for the target; closed by free_remount() when >= 0.  */
int targetfd;
/* Flags handed to mount(2) by do_remount().  */
unsigned long flags;
/* Heap copy of the mount data string, or NULL when none was given.  */
char *data;
};
/* Per-container state private to this file.  It is allocated lazily by
   get_private_data() and destroyed by cleanup_private_data().  */
struct private_data_s
{
/* Pending remounts; finalize_mounts() processes the list and resets it
   to NULL.  */
struct remount_s *remounts;
/* Filled by libcrun_run_linux_container(). Useful to query what
namespaces are available. */
int unshare_flags;
#if CLONE_NEWCGROUP
int unshare_cgroupns;
#endif
/* Heap strings, freed by cleanup_private_data().  */
char *host_notify_socket_path;
char *container_notify_socket_path;
bool mount_dev_from_host;
unsigned long rootfs_propagation;
bool deny_setgroups;
/* rootfs, rootfsfd and rootfs_len are passed together to safe_openat()
   by open_mount_target().  `rootfs` is not freed by
   cleanup_private_data().  */
const char *rootfs;
/* Initialised to -1; closed by cleanup_private_data() when >= 0.  */
int rootfsfd;
size_t rootfs_len;
/* Initialised to -1 by get_private_data().  */
int notify_socket_tree_fd;
/* Descriptor maps, released with cleanup_close_mapp() in
   cleanup_private_data().  */
struct libcrun_fd_map *mount_fds;
struct libcrun_fd_map *dev_fds;
/* Used to save stdin, stdout, stderr during checkpointing to descriptors.json
* and needed during restore. */
char *external_descriptors;
};
struct linux_namespace_s
{
const char *name;
const char *ns_file;
int value;
};
/* Destructor for the `struct private_data_s` attached to a container.
   Closes the rootfs descriptor and the two descriptor maps when they are
   set, then frees the owned strings and the structure itself.  */
static void
cleanup_private_data (void *private_data)
{
  struct private_data_s *priv = private_data;

  if (priv->rootfsfd >= 0)
    TEMP_FAILURE_RETRY (close (priv->rootfsfd));

  if (priv->mount_fds != NULL)
    cleanup_close_mapp (&priv->mount_fds);

  if (priv->dev_fds != NULL)
    cleanup_close_mapp (&priv->dev_fds);

  free (priv->external_descriptors);
  free (priv->container_notify_socket_path);
  free (priv->host_notify_socket_path);
  free (priv);
}
/* Return the private data of CONTAINER, creating it on first use.  A
   freshly created structure is zero-filled except for the two file
   descriptors, which start at -1, and cleanup_private_data is registered
   as its destructor.  */
static struct private_data_s *
get_private_data (struct libcrun_container_s *container)
{
  struct private_data_s *priv;

  if (container->private_data != NULL)
    return container->private_data;

  priv = xmalloc0 (sizeof (*priv));
  priv->rootfsfd = -1;
  priv->notify_socket_tree_fd = -1;

  container->cleanup_private_data = cleanup_private_data;
  container->private_data = priv;

  return container->private_data;
}
/* Fallbacks for constants that the system headers may not define.  The
   two CLONE_* flags fall back to 0 so that the `#if CLONE_NEWCGROUP` and
   `#if CLONE_NEWTIME` checks in this file compile the feature out.  */
#ifndef CLONE_NEWTIME
# define CLONE_NEWTIME 0
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0
#endif
#ifndef AT_RECURSIVE
# define AT_RECURSIVE 0x8000
#endif
/* Table of the namespace types known to this file.  Each row is
   { name, ns_file, CLONE_NEW* flag }; the table ends with a row whose
   name is NULL.  The cgroup and time rows are present only when the
   corresponding CLONE_* constant is non-zero.  */
static struct linux_namespace_s namespaces[] = { { "mount", "mnt", CLONE_NEWNS },
{ "network", "net", CLONE_NEWNET },
{ "ipc", "ipc", CLONE_NEWIPC },
{ "pid", "pid", CLONE_NEWPID },
{ "uts", "uts", CLONE_NEWUTS },
{ "user", "user", CLONE_NEWUSER },
#if CLONE_NEWCGROUP
{ "cgroup", "cgroup", CLONE_NEWCGROUP },
#endif
#if CLONE_NEWTIME
{ "time", "time", CLONE_NEWTIME },
#endif
{ NULL, NULL, 0 } };
/* Hand back the value stored in *OLD and leave -1 behind.  Used to move
   a file descriptor out of a variable so that it is not closed again.  */
static int
get_and_reset (int *old)
{
  const int previous = *old;

  *old = -1;

  return previous;
}
/* Look NAME up in the `namespaces` table.  Returns the CLONE_NEW* flag of
   the matching entry, or -1 when no entry has that name.  */
int
libcrun_find_namespace (const char *name)
{
  size_t idx;

  for (idx = 0; namespaces[idx].name != NULL; idx++)
    {
      if (strcmp (namespaces[idx].name, name) != 0)
        continue;

      return namespaces[idx].value;
    }

  return -1;
}
/* NOTE(review): #ifndef only detects macros.  If the system headers
   provide __aligned_u64 as a typedef, this fallback is still defined --
   confirm that this is intended.  */
#ifndef __aligned_u64
# define __aligned_u64 uint64_t __attribute__ ((aligned (8)))
#endif
#ifndef CLONE_INTO_CGROUP
# define CLONE_INTO_CGROUP 0x200000000ULL
#endif
/* Argument block handed to the clone3 system call by syscall_clone3(),
   together with its size.  The field order is presumably meant to mirror
   the kernel's `struct clone_args` -- confirm against <linux/sched.h>
   before changing it.  */
struct _clone3_args
{
__aligned_u64 flags;
__aligned_u64 pidfd;
__aligned_u64 child_tid;
__aligned_u64 parent_tid;
__aligned_u64 exit_signal;
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
};
/* Invoke the clone3 system call directly, passing ARGS and its size.
   Returns the syscall result as an int.  When the build headers do not
   define __NR_clone3 the call fails with errno set to ENOSYS.  */
static int
syscall_clone3 (struct _clone3_args *args)
{
#ifndef __NR_clone3
  (void) args;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_clone3, args, sizeof (*args));

  return (int) rc;
#endif
}
/* Invoke the fsopen system call directly.  Returns the syscall result as
   an int.  When the build headers do not define __NR_fsopen the call
   fails with errno set to ENOSYS.  */
static int
syscall_fsopen (const char *fs_name, unsigned int flags)
{
#ifndef __NR_fsopen
  (void) fs_name;
  (void) flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_fsopen, fs_name, flags);

  return (int) rc;
#endif
}
/* Invoke the fsmount system call directly.  Returns the syscall result
   as an int.  When the build headers do not define __NR_fsmount the call
   fails with errno set to ENOSYS.  */
static int
syscall_fsmount (int fsfd, unsigned int flags, unsigned int attr_flags)
{
#ifndef __NR_fsmount
  (void) fsfd;
  (void) flags;
  (void) attr_flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_fsmount, fsfd, flags, attr_flags);

  return (int) rc;
#endif
}
/* Invoke the fsconfig system call directly.  Returns the syscall result
   as an int.  When the build headers do not define __NR_fsconfig the
   call fails with errno set to ENOSYS.  */
static int
syscall_fsconfig (int fsfd, unsigned int cmd, const char *key, const void *val, int aux)
{
#ifndef __NR_fsconfig
  (void) fsfd;
  (void) cmd;
  (void) key;
  (void) val;
  (void) aux;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_fsconfig, fsfd, cmd, key, val, aux);

  return (int) rc;
#endif
}
/* Invoke the move_mount system call directly.  Returns the syscall
   result as an int.  When the build headers do not define
   __NR_move_mount the call fails with errno set to ENOSYS.  */
static int
syscall_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, unsigned int flags)
{
#ifndef __NR_move_mount
  (void) from_dfd;
  (void) from_pathname;
  (void) to_dfd;
  (void) to_pathname;
  (void) flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);

  return (int) rc;
#endif
}
/* Invoke the open_tree system call directly.  Returns the syscall result
   as an int.  When the build headers do not define __NR_open_tree the
   call fails with errno set to ENOSYS.

   The function is deliberately marked as possibly unused: at the moment
   it is only left unreferenced when building without systemd, and
   guarding the whole definition with HAVE_SYSTEMD was not considered
   worthwhile.  */
__attribute__ ((unused)) static int
syscall_open_tree (int dfd, const char *pathname, unsigned int flags)
{
#ifndef __NR_open_tree
  (void) dfd;
  (void) pathname;
  (void) flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_open_tree, dfd, pathname, flags);

  return (int) rc;
#endif
}
/* Attribute block handed to the mount_setattr system call by
   syscall_mount_setattr(), together with its size.  The layout is
   presumably meant to match the kernel's `struct mount_attr` -- confirm
   against <linux/mount.h> before changing it.  */
struct mount_attr_s
{
/* MOUNT_ATTR_* bits to set.  */
uint64_t attr_set;
/* MOUNT_ATTR_* bits to clear.  */
uint64_t attr_clr;
uint64_t propagation;
/* User namespace descriptor, used together with MOUNT_ATTR_IDMAP.  */
uint64_t userns_fd;
};
/* Fallbacks for headers that do not define the MOUNT_ATTR_* constants.  */
#ifndef MOUNT_ATTR_RDONLY
# define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */
#endif
#ifndef MOUNT_ATTR_IDMAP
# define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
#endif
/* Invoke the mount_setattr system call directly, passing ATTR and its
   size.  Returns the syscall result as an int.  When the build headers
   do not define __NR_mount_setattr the call fails with errno set to
   ENOSYS.  */
static int
syscall_mount_setattr (int dfd, const char *path, unsigned int flags,
                       struct mount_attr_s *attr)
{
#ifndef __NR_mount_setattr
  (void) dfd;
  (void) path;
  (void) flags;
  (void) attr;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_mount_setattr, dfd, path, flags, attr, sizeof (*attr));

  return (int) rc;
#endif
}
/* Issue keyctl(KEYCTL_JOIN_SESSION_KEYRING, NAME) through the raw
   system call interface and return its result as an int.  */
static int
syscall_keyctl_join (const char *name)
{
#define KEYCTL_JOIN_SESSION_KEYRING 0x1
  long rc = syscall (__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, name, 0);

  return (int) rc;
}
/* Invoke the pidfd_open system call directly.  Returns the syscall
   result as an int.  When the build headers do not define
   __NR_pidfd_open the call fails with errno set to ENOSYS.  */
static int
syscall_pidfd_open (pid_t pid, unsigned int flags)
{
#ifndef __NR_pidfd_open
  (void) pid;
  (void) flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_pidfd_open, pid, flags);

  return (int) rc;
#endif
}
/* Invoke the pidfd_send_signal system call directly.  Returns the
   syscall result as an int.  When the build headers do not define
   __NR_pidfd_send_signal the call fails with errno set to ENOSYS.  */
static int
syscall_pidfd_send_signal (int pidfd, int sig, siginfo_t *info, unsigned int flags)
{
#ifndef __NR_pidfd_send_signal
  (void) pidfd;
  (void) sig;
  (void) info;
  (void) flags;

  errno = ENOSYS;
  return -1;
#else
  long rc = syscall (__NR_pidfd_send_signal, pidfd, sig, info, flags);

  return (int) rc;
#endif
}
/* Apply mount attributes to the mount referenced by TARGETFD, using
   AT_RECURSIVE | AT_EMPTY_PATH.  The bits in CLEAR are removed and the
   bits in SET are added.  TARGET is used only for the error message.
   Returns 0 on success; on failure ERR is filled and the error code is
   returned.  */
static int
do_mount_setattr (const char *target, int targetfd, uint64_t clear, uint64_t set, libcrun_error_t *err)
{
  struct mount_attr_s attr = {
    .attr_set = set,
    .attr_clr = clear,
  };

  if (UNLIKELY (syscall_mount_setattr (targetfd, "", AT_RECURSIVE | AT_EMPTY_PATH, &attr) < 0))
    return crun_make_error (err, errno, "mount_setattr `/%s`", target);

  return 0;
}
/* Create a detached clone of the mount tree at SRC (resolved relative to
   DIRFD) with open_tree, optionally making the clone read-only.

   RECURSIVE adds AT_RECURSIVE to both the open_tree and the
   mount_setattr calls.  RDONLY requests the read-only attribute.

   Returns the new file descriptor on success; ownership passes to the
   caller.  On failure ERR is filled and the error code is returned.  */
static int
get_bind_mount (int dirfd, const char *src, bool recursive, bool rdonly, libcrun_error_t *err)
{
  cleanup_close int open_tree_fd = -1;
  struct mount_attr_s attr = {
    0,
  };
  int recursive_flag = (recursive ? AT_RECURSIVE : 0);
  int ret;

  /* mount_setattr takes MOUNT_ATTR_* bits, not the MS_* flags of
     mount(2).  */
  if (rdonly)
    attr.attr_set = MOUNT_ATTR_RDONLY;

  open_tree_fd = syscall_open_tree (dirfd, src,
                                    AT_NO_AUTOMOUNT | OPEN_TREE_CLOEXEC
                                        | OPEN_TREE_CLONE | recursive_flag);
  if (UNLIKELY (open_tree_fd < 0))
    return crun_make_error (err, errno, "open_tree `%s`", src);

  ret = syscall_mount_setattr (open_tree_fd, "", AT_EMPTY_PATH | recursive_flag, &attr);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, errno, "mount_setattr `%s`", src);

  /* Move the descriptor out so that cleanup_close does not close it.  */
  return get_and_reset (&open_tree_fd);
}
/* Parse the mappings of an idmapped mount option and convert them to the
   textual form written to a uid_map/gid_map file.

   OPTION is a sequence of entries.  Each entry may start with `#`, may
   then carry `@` to mark a relative mapping, and is followed by three
   decimal numbers (file system id, host id, size); a `-` between the
   numbers is skipped.  For a relative mapping the host id is shifted by
   the offset of the user namespace mapping in DEF that contains the file
   system id (the uid mappings when IS_UIDS is true, the gid mappings
   otherwise).

   On success returns 0, stores a newly allocated NUL-terminated buffer
   with one "fs host size\n" line per entry in *OUT (owned by the caller)
   and its length, without the NUL, in *LEN.  On failure ERR is filled
   and the error code is returned.  */
int
parse_idmapped_mount_option (runtime_spec_schema_config_schema *def, bool is_uids, char *option, char **out, size_t *len, libcrun_error_t *err)
{
  /* Room needed for one formatted line: three longs, two spaces, the
     newline and the terminating NUL.  */
  const size_t max_line_len = 64;
  size_t written = 0, allocated = 256;
  cleanup_free char *mappings = NULL;
  const char *it;

  mappings = xmalloc (allocated);

  for (it = option; *it;)
    {
      bool relative = false;
      long value[3];
      size_t i;
      int n;

      if (*it == '#')
        it++;

      if (*it == '@')
        {
          relative = true;
          it++;
        }

      /* read a triplet: file system id - host id - size.  */
      for (i = 0; i < 3; i++)
        {
          char *endptr = NULL;

          if (i > 0 && *it == '-')
            it++;

          /* errno is not meaningful here: no call has failed, the input
             is simply too short.  */
          if (*it == '\0')
            return crun_make_error (err, 0, "invalid mapping specified `%s`", option);

          errno = 0;
          value[i] = strtol (it, &endptr, 10);
          if (errno || endptr == it)
            return crun_make_error (err, errno, "invalid mapping specified `%s`", option);

          it = endptr;
        }

      if (relative)
        {
          runtime_spec_schema_defs_id_mapping **ns_mappings;
          size_t ns_mappings_len;

          if (def == NULL
              || def->linux == NULL
              || (is_uids && def->linux->uid_mappings_len == 0)
              || (! is_uids && def->linux->gid_mappings_len == 0))
            return crun_make_error (err, 0, "specified a relative mapping without user namespace mappings");

          ns_mappings_len = (is_uids ? def->linux->uid_mappings_len : def->linux->gid_mappings_len);
          ns_mappings = is_uids ? def->linux->uid_mappings : def->linux->gid_mappings;

          /* Widen the ids to a signed 64-bit type before doing any
             arithmetic.  With unsigned 32-bit fields the range end could
             wrap around, and a host id smaller than the container id
             would turn the shift into a huge positive number.  */
          for (i = 0; i < ns_mappings_len; i++)
            {
              long long first = (long long) ns_mappings[i]->container_id;
              long long past_end = first + (long long) ns_mappings[i]->size;

              if (value[0] >= first && value[0] < past_end)
                break;
            }
          if (i == ns_mappings_len)
            return crun_make_error (err, 0, "could not find a user namespace mapping for the relative mapping `%s`", option);

          value[1] += (long) ((long long) ns_mappings[i]->host_id - (long long) ns_mappings[i]->container_id);
        }

      if (written > allocated - max_line_len)
        {
          allocated += 256;
          mappings = xrealloc (mappings, allocated);
        }

      n = snprintf (mappings + written, allocated - written, "%ld %ld %ld\n", value[0], value[1], value[2]);
      if (UNLIKELY (n < 0 || (size_t) n >= allocated - written))
        return crun_make_error (err, 0, "cannot format the mapping `%s`", option);

      written += (size_t) n;
    }

  *(mappings + written) = '\0';

  *len = written;
  *out = mappings;
  /* Ownership moved to the caller; keep cleanup_free from releasing it.  */
  mappings = NULL;

  return 0;
}
/* Format MAPPINGS_LEN id mappings as "container host size\n" lines, the
   form written to a uid_map/gid_map file.

   Returns a newly allocated, NUL-terminated buffer owned by the caller
   and stores the number of characters produced (without the NUL) in
   *WRITTEN.  With MAPPINGS_LEN == 0 the result is an empty string.  */
static char *
format_mount_mappings (runtime_spec_schema_defs_id_mapping **mappings, size_t mappings_len, size_t *written)
{
  /* 64 is more than enough room to print 3 uint32. */
  const size_t max_len_mapping = 64;
  char *ret;
  size_t s;

  *written = 0;

  ret = xmalloc (max_len_mapping * mappings_len + 1);
  /* Nothing is printed when there are no mappings, so terminate the
     buffer explicitly.  */
  ret[0] = '\0';

  for (s = 0; s < mappings_len; s++)
    {
      int len;

      len = snprintf (ret + *written, max_len_mapping, "%" PRIu32 " %" PRIu32 " %" PRIu32 "\n",
                      mappings[s]->container_id,
                      mappings[s]->host_id,
                      mappings[s]->size);
      /* A negative result signals an output error; do not let it corrupt
         the running length.  */
      if (len > 0)
        *written += (size_t) len;
    }

  return ret;
}
static char *
format_mount_mapping (uint32_t container_id, uint32_t host_id, uint32_t size, size_t *written)
{
runtime_spec_schema_defs_id_mapping mapping = {
.container_id = container_id,
.host_id = host_id,
.size = size,
};
runtime_spec_schema_defs_id_mapping *mappings[] = {
&mapping,
NULL,
};
return format_mount_mappings (mappings, 1, written);
}
/* Tell whether the uid and gid mappings of the mount MNT are identical,
   entry by entry and in the same order, to the user namespace mappings
   of the container definition DEF.  When DEF has no `linux` section the
   answer is true only if MNT has no mappings at all.  */
static bool
has_same_mappings (runtime_spec_schema_config_schema *def, runtime_spec_schema_defs_mount *mnt)
{
  size_t idx;

  if (def->linux == NULL)
    return mnt->uid_mappings_len == 0 && mnt->gid_mappings_len == 0;

  if (mnt->uid_mappings_len != def->linux->uid_mappings_len
      || mnt->gid_mappings_len != def->linux->gid_mappings_len)
    return false;

  for (idx = 0; idx < mnt->uid_mappings_len; idx++)
    {
      const runtime_spec_schema_defs_id_mapping *mine = mnt->uid_mappings[idx];
      const runtime_spec_schema_defs_id_mapping *theirs = def->linux->uid_mappings[idx];

      if (mine->container_id != theirs->container_id
          || mine->host_id != theirs->host_id
          || mine->size != theirs->size)
        return false;
    }

  for (idx = 0; idx < mnt->gid_mappings_len; idx++)
    {
      const runtime_spec_schema_defs_id_mapping *mine = mnt->gid_mappings[idx];
      const runtime_spec_schema_defs_id_mapping *theirs = def->linux->gid_mappings[idx];

      if (mine->container_id != theirs->container_id
          || mine->host_id != theirs->host_id
          || mine->size != theirs->size)
        return false;
    }

  return true;
}
/* Create, when needed, a helper process living in a new user namespace
   whose uid/gid maps describe the idmapped mount MNT.

   A namespace is needed when MNT carries OCI uid mappings that differ
   from those of DEF, or, when MNT has no uid mappings, when OPTIONS
   (the text after `idmap=`) is not NULL.

   Returns 0 on success.  *PID_OUT is written only when a process was
   created; the caller then owns that process.  On failure ERR is filled
   and a negative error code is returned.

   NOTE(review): the declared return type is pid_t, but every return
   statement yields a status code, never a pid.  */
static pid_t
maybe_create_userns_for_idmapped_mount (runtime_spec_schema_config_schema *def,
runtime_spec_schema_defs_mount *mnt,
const char *options, pid_t *pid_out,
libcrun_error_t *err)
{
/* cleanup_pid presumably terminates and reaps the process when the
   variable goes out of scope on an error path -- confirm in utils.h.  */
cleanup_pid pid_t pid = -1;
char proc_file[64];
bool need_new_userns = mnt->uid_mappings_len ? ! has_same_mappings (def, mnt) : options != NULL;
if (! need_new_userns)
return 0;
pid = syscall_clone (CLONE_NEWUSER | SIGCHLD, NULL);
if (UNLIKELY (pid < 0))
return crun_make_error (err, errno, "clone");
if (pid == 0)
{
/* Child: ask to be killed when the parent dies, then just sleep.  The
   process exists only to keep the user namespace alive.  */
prctl (PR_SET_PDEATHSIG, SIGKILL);
while (1)
pause ();
/* Not reached: the loop above never terminates.  */
_exit (EXIT_SUCCESS);
}
if (mnt->uid_mappings_len)
{
/* The mappings come from the OCI configuration of the mount.  */
cleanup_free char *uid_map = NULL;
cleanup_free char *gid_map = NULL;
size_t written = 0;
int ret;
uid_map = format_mount_mappings (mnt->uid_mappings, mnt->uid_mappings_len, &written);
sprintf (proc_file, "/proc/%d/uid_map", pid);
ret = write_file (proc_file, uid_map, written, err);
if (UNLIKELY (ret < 0))
return ret;
gid_map = format_mount_mappings (mnt->gid_mappings, mnt->gid_mappings_len, &written);
sprintf (proc_file, "/proc/%d/gid_map", pid);
ret = write_file (proc_file, gid_map, written, err);
if (UNLIKELY (ret < 0))
return ret;
}
else
{
cleanup_free char *dup_options = NULL;
char *option, *saveptr = NULL;
if (! options)
return crun_make_error (err, 0, "internal error: no mappings found");
/* strtok_r modifies its input, so work on a private copy.  */
dup_options = xstrdup (options);
/* If there are no OCI mappings specified, then parse the annotation. */
for (option = strtok_r (dup_options, ";", &saveptr); option; option = strtok_r (NULL, ";", &saveptr))
{
cleanup_free char *mappings = NULL;
bool is_uids = false;
size_t len = 0;
int ret;
/* Each `;`-separated item must be `uids=...` or `gids=...`; the prefix
   selects which map file of the helper process is written.  */
if (has_prefix (option, "uids="))
{
is_uids = true;
sprintf (proc_file, "/proc/%d/uid_map", pid);
}
else if (has_prefix (option, "gids="))
sprintf (proc_file, "/proc/%d/gid_map", pid);
else
return crun_make_error (err, 0, "invalid option `%s` specified", option);
ret = parse_idmapped_mount_option (def, is_uids, option + 5 /* strlen ("uids="), strlen ("gids=")*/, &mappings, &len, err);
if (UNLIKELY (ret < 0))
return ret;
ret = write_file (proc_file, mappings, len, err);
if (UNLIKELY (ret < 0))
return ret;
}
}
/* Success: hand the process over to the caller and disarm the local
   cleanup by resetting pid.  */
*pid_out = pid;
pid = -1;
return 0;
}
/* Return the first option of MNT that starts with "idmap", or NULL when
   the mount has no such option.  The returned pointer refers to the
   string stored in MNT; it is not a copy.  */
static char *
get_idmapped_option (runtime_spec_schema_defs_mount *mnt)
{
  size_t idx;

  for (idx = 0; idx < mnt->options_len; idx++)
    {
      if (! has_prefix (mnt->options[idx], "idmap"))
        continue;

      return mnt->options[idx];
    }

  return NULL;
}
/* Prepare an idmapped clone of the source of MNT when the mount asks for
   id mappings, either through OCI uid/gid mappings or through an `idmap`
   mount option.

   *OUT_FD is set to -1 first.  When no mapping is requested the function
   returns 0 and leaves it at -1.  Otherwise the mount source is cloned
   with open_tree, MOUNT_ATTR_IDMAP is applied using the user namespace
   of PID, or of a helper process created for the purpose, and the
   resulting descriptor is stored in *OUT_FD.

   Returns 0 on success; on failure ERR is filled and the error code is
   returned.  */
static int
maybe_get_idmapped_mount (runtime_spec_schema_config_schema *def, runtime_spec_schema_defs_mount *mnt, pid_t pid, int *out_fd, libcrun_error_t *err)
{
  cleanup_close int tree_fd = -1;
  cleanup_pid pid_t helper_pid = -1;
  struct mount_attr_s attr = {
    0,
  };
  const char *idmap_option = "";
  cleanup_close int userns_fd = -1;
  const char *mapping_spec;
  const char *equal_sign;
  char userns_path[64];
  int ret;

  *out_fd = -1;

  /* The mount option is looked at only when there are no OCI mappings.  */
  if (mnt->uid_mappings_len == 0 && mnt->gid_mappings_len == 0)
    {
      idmap_option = get_idmapped_option (mnt);
      if (idmap_option == NULL)
        return 0;
    }

  if ((mnt->uid_mappings == NULL) != (mnt->gid_mappings == NULL))
    return crun_make_error (err, 0, "invalid mappings specified for the mount on `%s`", mnt->destination);

  /* Whatever follows the `=` describes the mappings; an empty value is
     treated like a missing one.  */
  equal_sign = strchr (idmap_option, '=');
  if (equal_sign != NULL && equal_sign[1] != '\0')
    mapping_spec = equal_sign + 1;
  else
    mapping_spec = NULL;

  ret = maybe_create_userns_for_idmapped_mount (def, mnt, mapping_spec, &helper_pid, err);
  if (UNLIKELY (ret < 0))
    return ret;

  /* Prefer the namespace of the helper process when one was created.  */
  sprintf (userns_path, "/proc/%d/ns/user", helper_pid > 0 ? helper_pid : pid);

  userns_fd = open (userns_path, O_RDONLY);
  if (UNLIKELY (userns_fd < 0))
    return crun_make_error (err, errno, "open `%s`", userns_path);

  tree_fd = syscall_open_tree (-1, mnt->source,
                               AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
  if (UNLIKELY (tree_fd < 0))
    return crun_make_error (err, errno, "open `%s`", mnt->source);

  attr.userns_fd = userns_fd;
  attr.attr_set = MOUNT_ATTR_IDMAP;

  ret = syscall_mount_setattr (tree_fd, "", AT_EMPTY_PATH, &attr);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, errno, "mount_setattr `%s`", mnt->source);

  *out_fd = get_and_reset (&tree_fd);

  return 0;
}
/* Join a new session keyring called NAME.

   When LABEL is given it is first written to /proc/self/attr/keycreate;
   a missing file is ignored.  After the keyring was requested, a
   best-effort empty write to the same descriptor resets the label.

   Returns the result of the keyctl call on success, 0 (after a warning)
   when keyctl reports ENOSYS, and otherwise the error code with ERR
   filled.  */
int
libcrun_create_keyring (const char *name, const char *label, libcrun_error_t *err)
{
  const char *const keycreate = "/proc/self/attr/keycreate";
  cleanup_close int labelfd = -1;
  bool label_set = false;
  int ret;

  if (label)
    {
      labelfd = open (keycreate, O_WRONLY | O_CLOEXEC);
      if (labelfd >= 0)
        {
          ret = write (labelfd, label, strlen (label));
          if (UNLIKELY (ret < 0))
            return crun_make_error (err, errno, "write to `%s`", keycreate);

          label_set = true;
        }
      else if (errno != ENOENT)
        return crun_make_error (err, errno, "open `%s`", keycreate);
    }

  ret = syscall_keyctl_join (name);
  if (UNLIKELY (ret < 0))
    {
      if (errno == ENOSYS)
        {
          libcrun_warning ("could not create a new keyring: keyctl_join is not supported");
          ret = 0;
        }
      else
        ret = crun_make_error (err, errno, "create keyring `%s`", name);
    }

  /* Best effort attempt to reset the SELinux label used for new keyrings.  */
  if (label_set)
    (void) write (labelfd, "", 0);

  return ret;
}
/* Store in *UID and *GID the user and group configured for the container
   process in DEF.  Both default to 0 when DEF has no process or no user
   section.  */
static void
get_uid_gid_from_def (runtime_spec_schema_config_schema *def, uid_t *uid, gid_t *gid)
{
  *uid = 0;
  *gid = 0;

  if (def->process == NULL || def->process->user == NULL)
    return;

  /* A configured value of 0 equals the default, so a plain copy gives
     the same result as copying non-zero values only.  */
  *uid = def->process->user->uid;
  *gid = def->process->user->gid;
}
/* Translate the mount option NAME into mount flags.

   The option is looked up with libcrun_str2mount_flags().  When FOUND is
   not NULL it receives 1 if the option is known and 0 otherwise; for an
   unknown option the function returns 0.

   For a known option the return value is CURRENT_FLAGS with the flags of
   the option added, or removed when the option is a clearing one.  The
   option's extra flags are OR-ed into *EXTRA_FLAGS, and for options
   marked OPTION_RECURSIVE the flags are also OR-ed into *REC_CLEAR or
   *REC_SET.  Each of these output pointers may be NULL.  */
static unsigned long
get_mount_flags (const char *name, int current_flags, int *found, unsigned long *extra_flags, uint64_t *rec_clear, uint64_t *rec_set)
{
  const struct propagation_flags_s *entry = libcrun_str2mount_flags (name);

  if (found != NULL)
    *found = (entry != NULL) ? 1 : 0;

  if (entry == NULL)
    return 0;

  if (entry->extra_flags & OPTION_RECURSIVE)
    {
      uint64_t *recursive_dest = entry->clear ? rec_clear : rec_set;

      if (recursive_dest != NULL)
        *recursive_dest |= entry->flags;
    }

  if (extra_flags != NULL)
    *extra_flags |= entry->extra_flags;

  if (! entry->clear)
    return current_flags | entry->flags;

  return current_flags & ~entry->flags;
}
/* Like get_mount_flags(), but an option that is not a known mount flag
   is appended to the comma separated data string *OPTION instead.

   Returns the updated flags for a known option.  For an unknown one it
   returns 0 after replacing *OPTION with a newly allocated string; the
   previous string is freed.  */
static unsigned long
get_mount_flags_or_option (const char *name, int current_flags, unsigned long *extra_flags, char **option, uint64_t *rec_clear, uint64_t *rec_set)
{
  unsigned long updated_flags;
  char *previous;
  int known;

  updated_flags = get_mount_flags (name, current_flags, &known, extra_flags, rec_clear, rec_set);
  if (known)
    return updated_flags;

  previous = *option;
  if (previous != NULL && previous[0] != '\0')
    xasprintf (option, "%s,%s", previous, name);
  else
    *option = xstrdup (name);

  /* The old string was either copied into the new one or was empty.  */
  free (previous);

  return 0;
}
/* Invoke the pivot_root system call directly with NEW_ROOT and PUT_OLD
   and return its result.  */
int
pivot_root (const char *new_root, const char *put_old)
{
  long rc = syscall (__NR_pivot_root, new_root, put_old);

  return (int) rc;
}
/* Release one remount entry: its strings, its target descriptor when
   valid, and the entry itself.  The `next` pointer is not followed.  */
static void
free_remount (struct remount_s *r)
{
  const int fd = r->targetfd;

  free (r->target);
  free (r->data);

  if (fd >= 0)
    close (fd);

  free (r);
}
/* Allocate a remount entry placed in front of NEXT.  TARGET and DATA are
   copied (DATA may be NULL); TARGETFD and FLAGS are stored as given.
   The entry is meant to be released with free_remount().  */
static struct remount_s *
make_remount (int targetfd, const char *target, unsigned long flags, const char *data, struct remount_s *next)
{
  struct remount_s *entry = xmalloc (sizeof (*entry));

  entry->next = next;
  entry->targetfd = targetfd;
  entry->flags = flags;
  entry->target = xstrdup (target);
  entry->data = NULL;
  if (data != NULL)
    entry->data = xstrdup (data);

  return entry;
}
/* Remount TARGET with FLAGS and DATA.

   When TARGETFD is valid the mount is addressed through its
   /proc/self/fd path; TARGET is then used only for the error message.

   If the first attempt fails, the flags currently set on the file system
   are read with statfs and the remount is retried with the existing
   nosuid/nodev/noexec flags added.  If that fails as well and the file
   system is read-only, it is retried once more with MS_RDONLY added.

   Returns 0 on success.  If no attempt succeeds, ERR is filled with the
   errno of the last failed mount call and the error code is returned.  */
static int
do_remount (int targetfd, const char *target, unsigned long flags, const char *data, libcrun_error_t *err)
{
  proc_fd_path_t target_buffer;
  const char *real_target = target;
  unsigned long remount_flags;
  struct statfs sfs;
  int saved_errno;
  int ret;

  if (targetfd >= 0)
    {
      get_proc_self_fd_path (target_buffer, targetfd);
      real_target = target_buffer;
    }

  /* Older kernels (seen on 4.18) fail with EINVAL if data is set when
     setting MS_RDONLY.
     NOTE(review): this test is true when either flag is set, so data is
     dropped for every MS_REMOUNT call -- confirm that this is wanted.  */
  if (flags & (MS_REMOUNT | MS_RDONLY))
    data = NULL;

  ret = mount (NULL, real_target, NULL, flags, data);
  if (LIKELY (ret >= 0))
    return 0;

  /* Remember why the mount failed.  The statfs call below overwrites
     both ret and errno, and the failure must still be reported when no
     retry is possible.  */
  saved_errno = errno;

  ret = statfs (real_target, &sfs);
  if (UNLIKELY (ret < 0))
    return crun_make_error (err, errno, "statfs `%s`", real_target);

  remount_flags = sfs.f_flags & (MS_NOSUID | MS_NODEV | MS_NOEXEC);

  if ((flags | remount_flags) != flags)
    {
      ret = mount (NULL, real_target, NULL, flags | remount_flags, data);
      if (LIKELY (ret == 0))
        return 0;

      saved_errno = errno;

      /* If it still fails and MS_RDONLY is present in the mount, try adding it.  */
      if (sfs.f_flags & MS_RDONLY)
        {
          remount_flags = sfs.f_flags & (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RDONLY);

          ret = mount (NULL, real_target, NULL, flags | remount_flags, data);
          if (ret == 0)
            return 0;

          saved_errno = errno;
        }
    }

  return crun_make_error (err, saved_errno, "remount `%s`", target);
}
/* Perform the remounts queued in the private data of CONTAINER, in list
   order, and release the whole list.

   Processing stops at the first remount that fails; the remaining
   entries are freed without being applied.  The list head is reset to
   NULL in every case.  Returns the result of the last do_remount() call
   that was made, or 0 for an empty list.  */
static int
finalize_mounts (libcrun_container_t *container, libcrun_error_t *err)
{
  struct remount_s *cur = get_private_data (container)->remounts;
  bool failed = false;
  int ret = 0;

  while (cur != NULL)
    {
      struct remount_s *following = cur->next;

      if (! failed)
        {
          ret = do_remount (cur->targetfd, cur->target, cur->flags, cur->data, err);
          failed = ret < 0;
        }

      free_remount (cur);
      cur = following;
    }

  get_private_data (container)->remounts = NULL;

  return ret;
}
/* Open TARGET_REL inside the container rootfs with O_PATH | O_CLOEXEC,
   using safe_openat() and the rootfs descriptor kept in the private data
   of CONTAINER.  Fails with "invalid rootfs state" when that descriptor
   is not set.  Returns the value of safe_openat() otherwise.  */
static int
open_mount_target (libcrun_container_t *container, const char *target_rel, libcrun_error_t *err)
{
  struct private_data_s *priv = get_private_data (container);

  if (priv->rootfsfd < 0)
    return crun_make_error (err, 0, "invalid rootfs state");

  return safe_openat (priv->rootfsfd, priv->rootfs, priv->rootfs_len, target_rel, O_PATH | O_CLOEXEC, 0, err);
}
/* Attempt to open a mount of the specified type. */
static int
fsopen_mount (const char *type, const char *labeltype, const char *label)
{
#ifdef HAVE_NEW_MOUNT_API
cleanup_close int fsfd = -1;
int ret;
fsfd = syscall_fsopen (type, FSOPEN_CLOEXEC);
if (UNLIKELY (fsfd < 0))