在 kernel.md
有稍微 trace eBPF 的運作機制,而這邊會做更詳細的追蹤,使用的 kernel 版本為 5.13.11。
首先關於 bpf 的操作是透過 nr 321 的 syscall 來執行,下列為傳入的參數:
%rax | System call | %rdi | %rsi | %rdx |
---|---|---|---|---|
321 | sys_bpf | int cmd | union bpf_attr *attr | unsigned int size |
之後會到 kernel 的 bpf syscall handler 來執行 (src):
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
return -EPERM;
/* 傳入的 size 可能跟 sizeof(attr) 不相同;若 size 較大,要檢查多出來的尾端部分是否全為 0 */
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
/* size 為 sizeof(attr) or size 之中較小的值 */
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
/**
* https://elixir.bootlin.com/linux/v5.13.11/source/security/security.c#L2566
* 定義一連串 bpf cmd 檢測
*/
err = security_bpf(cmd, &attr, size);
if (err < 0)
return err;
/* 根據 user 傳入的 cmd 做對應的行為,像是 create map、update map 等等 */
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
break;
...
default:
err = -EINVAL;
break;
}
return err;
}
-
這邊的 call flow 是從
__x64_sys_bpf
-->__do_sys_bpf
,所以該 macro 展開後應該會是__do_sys_bpf
,不過__x64_sys_bpf
似乎也沒做什麼事 -
copy_from_user()
是從 user 提供的 ptr copy 資料回 kernel mode:static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (likely(check_copy_size(to, n, false))) n = _copy_from_user(to, from, n); return n; }
會先執行
check_copy_size()
檢查資料的合法性,合法的定義如下:- not bogus address
- fully contained by stack (or stack frame, when available)
- fully within SLAB object (or object whitelist area, when available)
- not in kernel text
-
雖說
security_bpf()
看起來是檢測 bpf 的安全性,過程中會從security_hook_heads+1800
嘗試拿出 function pointer 去執行 (?),但如果沒有特別設置,則什麼都不會做:int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) { return call_int_hook(bpf, 0, cmd, attr, size); }
map_create()
為 bpf 在建立 kernel 跟 userland 共享的記憶體空間,之後透過 map_lookup_elem()
來從 userland 取得 kernel space 最新資料,或是透過 map_update_elem()
將 userland 的資料更新上去。
首先建立時會參照 user 傳入的 union bpf_attr
(size: 0x78):
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
/* 沒特別設定的話,userland 傳入的應該只會有前 5 個 */
__u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* BPF_MAP_CREATE related
* flags defined above.
*/
/* ------------------------------------------ */
__u32 inner_map_fd; /* fd pointing to the inner map */
__u32 numa_node; /* numa node (effective only if
* BPF_F_NUMA_NODE is set).
*/
char map_name[BPF_OBJ_NAME_LEN];
__u32 map_ifindex; /* ifindex of netdev to create on */
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_type_id; /* BTF type_id of the key */
__u32 btf_value_type_id; /* BTF type_id of the value */
__u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
* struct stored as the
* map value
*/
};
...
而 map_create()
的程式碼如下:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
/* 不確定什麼時候會進入這兩個 condition,不過我看 attr 只有前面 4~5 個 member 被初始化而已 */
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
return -EINVAL;
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
return -EINVAL;
}
/* 檢測 flag,bpf 的 fd 也有分 RD, WR, RDWR */
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
/**
* numa - Non-Uniform Memory Access
* 應該是跟硬體架構有關的判斷式,不過 usermode 似乎可以透過 BPF_F_NUMA_NODE 此 flag 來選擇 numa node
*/
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr); /* 根據傳入的 attr 新增一個 map */
if (IS_ERR(map))
return PTR_ERR(map);
/* 複製 attr 的 map_name 到 map->name */
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
if (err < 0)
goto free_map;
/* 設置與 lock 相關的 member */
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
* to figure out the corresponding kernel's
* counter part. Thus, attr->btf_fd has
* to be valid also.
*/
attr->btf_vmlinux_value_type_id) {
struct btf *btf;
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
if (btf_is_kernel(btf)) {
btf_put(btf);
err = -EACCES;
goto free_map;
}
map->btf = btf;
if (attr->btf_value_type_id) {
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
}
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
map->btf_vmlinux_value_type_id =
attr->btf_vmlinux_value_type_id;
}
/* 這種 security_* 的 function 都是在 security/security.c 定義,會看有沒有 pre-define hook 可以呼叫
如果沒有的話就什麼事情也不做
*/
err = security_bpf_map_alloc(map);
if (err)
goto free_map;
/* published the map to the userspace */
/* 背後是用 idr 做的,而 idr 的機制我不是很了解,只查到是用 radix-tree 去 implement int->ptr
* 用途像是 device name 等等
*/
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
/* 會 assign 新的 fd 給 map,正常的話會是一個新的 fd */
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
bpf_map_put_with_uref(map);
return err;
}
/* 正常離開 */
return err;
free_map_sec:
security_bpf_map_free(map);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
return err;
}
-
find_and_alloc_map()
會根據attr->map_type
來建立 map (src):static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; u32 type = attr->map_type; struct bpf_map *map; int err; /* bpf_map_types 一共有 29 個,如 fixed_percpu_data, htab_map_ops, array_map_ops,... 等等 */ if (type >= ARRAY_SIZE(bpf_map_types)) return ERR_PTR(-EINVAL); type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); /* 取得 type index (e.g. array = 2) */ ops = bpf_map_types[type]; /* 取得對應 type 的 op function table */ if (!ops) return ERR_PTR(-EINVAL); /** * 同類的 ops function 都定義在同個檔案,如 array 就是在 https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/arraymap.c */ if (ops->map_alloc_check) { /* 檢查 attr 中關於 map 的資料是否為 array specified */ err = ops->map_alloc_check(attr); if (err) return ERR_PTR(err); } if (attr->map_ifindex) ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); /* array 的話會 call array_map_alloc() */ if (IS_ERR(map)) return map; /* 到這,map 已經根據 attr 初始化完成,並且也檢測了合法性,最後只需要在 assign op function 以及 type */ map->ops = ops; map->map_type = type; return map; }
array_map_alloc()
做了以下事情:- 透過
round_up()
bitwise 的操作,將attr->max_entries
擴展到 >=attr->max_entries
的二冪次值 array_size += (u64) max_entries * elem_size
為 struct 需要的大小,array_size 為sizeof(struct bpf_array)
,而max_entries
為 extend 後的 entry 數量,elem_size 為 user 傳入的attr->value_size
bpf_map_area_alloc
以array_size
為參數,間接呼叫area = kmalloc_node(size, ...)
來建立- 用
kmalloc_node()
跟kmalloc()
的差別在於 bpf 可能會需要選擇特定的 NUMA node,不過如果沒特別指定的話 (node
傳入 -1) 其實沒什麼差別
- 用
bpf_map_init_from_attr()
會根據傳入的 attr 來初始化 map,只是做一些簡單的 assign 而已
- 透過
bpf_prog_load
會接收使用者傳入的 attr,並且 emulate 執行 insn 看是否合法 (是否有 infinite loop 等等),可說是 bpf 最關鍵的一個 function。
透過 switch (cmd)
會進入 bpf_prog_load()
(src),而上半部分的程式碼在做 bpf_prog
的初始化,以及建構整個執行環境:
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
int err;
char license[128];
bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
/* 只有這些 flag 是被允許的 */
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
!bpf_capable())
return -EPERM;
/* copy eBPF program license from user space */
/* 把 license copy 到 kernel */
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
/* license_is_gpl_compatible() 有列出一連串相容的 license */
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
/**
* 沒有 bpf_capable() 權限的使用者只能載入 BPF_PROG_TYPE_SOCKET_FILTER 或是 BPF_PROG_TYPE_CGROUP_SKB
* enum bpf_prog_type 裡面的其他 type 則需要 bpf_capable() 才能載入
*/
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!bpf_capable())
return -EPERM;
if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
*/
/* 用來 attach 舊的 prog_fd ? */
if (attr->attach_prog_fd) {
dst_prog = bpf_prog_get(attr->attach_prog_fd);
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
if (IS_ERR(attach_btf))
return -EINVAL;
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
return -ENOTSUPP;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
if (IS_ERR(attach_btf))
return PTR_ERR(attach_btf);
if (!attach_btf)
return -EINVAL;
btf_get(attach_btf);
}
/**
* bpf_prog_load_fixup_attach_type():
* Sets expected_attach_type in @attr if prog type requires it but has
* some attach types that have to be backward compatible
* 不過只有 BPF_PROG_TYPE_CGROUP_SOCK 可以設 expected_attach_type 成 BPF_CGROUP_INET_SOCK_CREATE
*/
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
/**
* 只有在 bpf_prog_load_check_attach() return EINVAL (invalid value) 時會進入
* 上述 function 在一一檢查 prog_type 與 expected_attach_type 的關係
*/
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -EINVAL;
}
/* plain bpf_prog allocation */
/**
* 取得 struct bpf_prog + insn length 大小的 space 作為第一個參數
* 之後執行 bpf_prog_alloc 來 allocate bpf_prog
*/
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -ENOMEM;
}
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
err = -EFAULT;
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
prog->orig_prog = NULL;
prog->jited = 0;
/* 已經有一個 reference */
atomic64_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog_sec;
}
/* find program type: socket_filter vs tracing_filter */
/**
* 找到對應 prog type 的 type index / ops,並 assign 給 prog->aux->ops 以及 prog->type
* 就 BPF_PROG_TYPE_SOCKET_FILTER 來說,
* ops 會是 sk_filter_prog_ops
* type 會是 BPF_PROG_TYPE_SOCKET_FILTER
*/
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog_sec;
prog->aux->load_time = ktime_get_boottime_ns();
/* copy prog name from user mode */
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
goto free_prog_sec;
...
-
bpf_prog_alloc
相關 function 有使用到GFP
prefix 的 flag,而GFP
本身為 Get Free Pages =__get_free_pages
-
指定 allocate memory 時的行為,e.g.
GFP_ATOMIC
為在 allocate page 時不會有 context-switch -
背後會透過
__vmalloc
來建立存放 bpf_prog -
程式碼 (src):
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { /* 上述說明的 GFP flag */ gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; int cpu; /* 建立一個沒有設 stats 的 prog struct */ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) return NULL; /* 這邊才設 stats */ /* 可是回傳的不是一般的 address,而是 e.g 0x607ff0c030a0 */ prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } /* bpf 來說不怎麼重要 ? */ for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; }
過程中呼叫的
bpf_prog_alloc_no_stats()
為主要 allocate 的部分:struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; /* 將 size 向上對齊到 PAGE_SIZE 的倍數 (以 page 為單位,不是 2 的冪次) */ size = round_up(size, PAGE_SIZE); /* 沒有很大的 prog 都會直接分一個 page 給他 (0x1000) */ fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); if (aux == NULL) { vfree(fp); return NULL; } fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); if (!fp->active) { vfree(fp); kfree(aux); return NULL; } fp->pages = size / PAGE_SIZE; /* 佔了幾個 page */ fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); /* init lock */ mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->dst_mutex); return fp; }
-
一共分配兩塊 memory:
bpf_prog
以及aux
-
ebpf_jit_enabled()
會影響到是否可以用 JIT 優化,還是只能都用 interpreterstatic inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_jit_is_ebpf(void) { /* 要從編譯 kernel 的時候設定 */ # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } /* bpf_jit_enable 在 kernel/bpf/core.c */ #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
-
-
最後還會有一個
struct bpf_prog_stats
,還蠻小的 (src):struct bpf_prog_stats { u64 cnt; u64 nsecs; u64 misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64));
-
到這邊為止,可以看一下 bpf_prog
目前長的樣子:
{
pages = 1,
jited = 0,
jit_requested = 1,
gpl_compatible = 1,
cb_access = 0,
dst_needed = 0,
blinded = 0,
is_func = 0,
kprobe_override = 0,
has_callchain_buf = 0,
enforce_expected_attach_type = 0,
call_get_stack = 0,
type = BPF_PROG_TYPE_SOCKET_FILTER,
expected_attach_type = BPF_CGROUP_INET_INGRESS,
len = 37,
jited_len = 0,
tag = "\000\000\000\000\000\000\000",
stats = 0x607ff0c030a0,
active = 0x607ff0c03094,
bpf_func = 0x0 <fixed_percpu_data>,
aux = 0xffff888006783400,
orig_prog = 0x0 <fixed_percpu_data>,
insns = 0xffffc9000006d048,
insnsi = 0xffffc9000006d048
}
bpf_prog_load()
的下半部分執行 verifier,檢測 insn 本身是否合法:
...
/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
/* Upon success of bpf_prog_alloc_id(), the BPF prog is
* effectively publicly exposed. However, retrieving via
* bpf_prog_get_fd_by_id() will take another reference,
* therefore it cannot be gone underneath us.
*
* Only for the time /after/ successful bpf_prog_new_fd()
* and before returning to userspace, we might just hold
* one reference and any parallel close on that fd could
* rip everything out. Hence, below notifications must
* happen before bpf_prog_new_fd().
*
* Also, any failure handling from this point onwards must
* be using bpf_prog_put() given the program is exposed.
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);
err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;
free_used_maps:
/* In case we have subprogs, we need to wait for a grace
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
free_prog_sec:
free_uid(prog->aux->user);
security_bpf_prog_free(prog->aux);
free_prog:
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
return err;
}
其中 bpf_check()
是關鍵。
verifier 使用到的 struct 可以在 include/linux/bpf_verifier.h
,主要有兩個 bpf_verifier_env
(size: 0x1d30) 以及 bpf_verifier_log
(size: 0x418):
/* single container for all structs
* one verifier_env per bpf_check() call
* 一次只會有一個 bpf_verifier_env 在 bpf_check() 的執行過程
*/
struct bpf_verifier_env {
u32 insn_idx;
u32 prev_insn_idx;
struct bpf_prog *prog; /* eBPF program being verified */
const struct bpf_verifier_ops *ops;
struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
int stack_size; /* number of states to be processed */
bool strict_alignment; /* perform strict pointer alignment checks */
bool test_state_freq;
...
};
#define BPF_VERIFIER_TMP_LOG_SIZE 1024
struct bpf_verifier_log {
u32 level;
char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
char __user *ubuf;
u32 len_used;
u32 len_total;
};
bpf_check()
上半段程式碼如下 (src):
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
int i, len, ret = -EINVAL;
bool is_priv;
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
return -EINVAL;
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
/* allocate 用來存放 env 的 memory */
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
/* env 中也包含 log */
log = &env->log;
len = (*prog)->len;
/* 每個 insn 都有一個 struct bpf_insn_aux_data */
env->insn_aux_data =
vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
/* 每個 prog->type 都有不同的 verifier ops,如 sk_filter_verifier_ops, kprobe_verifier_ops, ... (gdb 顯示的 fixed_percpu_data 其實是 0x0,也就是沒有對應 ops 的 NULL entry) */
/* BPF_PROG_TYPE_SOCKET_FILTER 對到 sk_filter_verifier_ops */
env->ops = bpf_verifier_ops[env->prog->type];
is_priv = bpf_capable(); /* capable(CAP_BPF) || capable(CAP_SYS_ADMIN) */
/* 什麼都沒做 @_@ */
bpf_get_btf_vmlinux();
/* grab the mutex to protect few globals used by verifier */
/* lock ! 保護 verifier 的 globals */
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
if (attr->log_level || attr->log_buf || attr->log_size) {
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
log->level = attr->log_level;
log->ubuf = (char __user *) (unsigned long) attr->log_buf;
log->len_total = attr->log_size;
ret = -EINVAL;
/* log attributes have to be sane */
/**
* (2**32 - 1) >> 2 >= log size >= 128 (UINT_MAX 為 2**32 - 1)
* log->level 不能為 0,且只能包含 BPF_LOG_MASK 內的 bit (也就是 1 ~ 7)
* @BPF_LOG_MASK = 1 (l1) | 2 (l2) | 4 (stats) == 0b111
*/
if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
!log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
goto err_unlock;
}
/* 什麼都沒做, too */
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
verbose(env, "in-kernel BTF is malformed\n");
ret = PTR_ERR(btf_vmlinux);
goto skip_full_check;
}
/* 沒特別設就是 false */
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
/* capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN) */
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
/* capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN) */
env->allow_uninit_stack = bpf_allow_uninit_stack();
/* capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN) */
env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
/* capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN) */
env->bypass_spec_v1 = bpf_bypass_spec_v1();
/* capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN) */
env->bypass_spec_v4 = bpf_bypass_spec_v4();
/* 以上都是需要 perfmon_capable() */
/* capable(CAP_BPF) || capable(CAP_SYS_ADMIN),看有無執行 bpf 的權限 */
env->bpf_capable = bpf_capable();
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
/**
* bpf_verifier_state_list 紀錄每個 verifier 的階段,不過這邊單純 allocate ptr size
* 因為 env->explored_states 是 pointer of pointer
*/
env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
ret = add_subprog_and_kfunc(env);
if (ret < 0)
goto skip_full_check;
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
ret = check_btf_info(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
ret = check_attach_btf_id(env);
if (ret)
goto skip_full_check;
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
/* return aux->offload_requested */
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
}
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);
if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
...
}
- 一系列的檢查:
add_subprog_and_kfunc()
-->add_subprog()
-->find_subprog()
add_subprog()
- 新增 sub prog (淺顯易懂 ?)find_subprog()
- 用bsearch
找對應 off 的 subprog,不過透過add_subprog_and_kfunc()
呼叫的 off 為 0
check_subprogs()
- 只有特定 type 的 prog 會需要做更多的檢查:
check_btf_info()
check_attach_btf_id()
resolve_pseudo_ldimm64()
check_cfg()
do_check_subprogs()
do_check_main()
add_subprog_and_kfunc()
新增 sub prog 以及 kfunc:
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
if (ret)
return ret;
for (i = 0; i < insn_cnt; i++, insn++) {
/**
* call --> insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_CALL
* kfunc call --> insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_KFUNC_CALL
* func --> insn->code == (BPF_LD | BPF_IMM | BPF_DW) && insn->src_reg == BPF_PSEUDO_FUNC
*/
/* 不是 func + 不是 call + 不是 kfunc --> cont */
if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
!bpf_pseudo_kfunc_call(insn))
continue;
if (!env->bpf_capable) {
verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
/* 若為 call / kfunc call / func,就新增一個 sub prog */
if (bpf_pseudo_func(insn)) {
ret = add_subprog(env, i + insn->imm + 1);
if (ret >= 0)
/* remember subprog */
insn[1].imm = ret;
} else if (bpf_pseudo_call(insn)) {
ret = add_subprog(env, i + insn->imm + 1);
} else {
ret = add_kfunc_call(env, insn->imm);
}
if (ret < 0)
return ret;
}
/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
/* 假的 exit subprog @_@ ? */
subprog[env->subprog_cnt].start = insn_cnt;
if (env->log.level & BPF_LOG_LEVEL2)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
return 0;
}
但實際上新增 subprog 是由 add_subprog()
完成:
static int add_subprog(struct bpf_verifier_env *env, int off)
{
int insn_cnt = env->prog->len;
int ret;
if (off >= insn_cnt || off < 0) {
verbose(env, "call to invalid destination\n");
return -EINVAL;
}
ret = find_subprog(env, off); /* 找起始位置為 off 的 subprog 是否已存在 (entry function 的 off 為 0) */
if (ret >= 0)
return ret;
if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
verbose(env, "too many subprograms\n");
return -E2BIG;
}
/* determine subprog starts. The end is one before the next starts */
/* env->subprog_cnt++ */
env->subprog_info[env->subprog_cnt++].start = off;
/* 依 subprog 的 start 重新排序 subprog_info,因此新增的 subprog 不一定會在開頭 */
sort(env->subprog_info, env->subprog_cnt,
sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
return env->subprog_cnt - 1;
}
接著執行 check_subprogs()
,檢查所有的 subprog 是否 JMP 都在同個 subprog:
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
/* now check that all jumps are within the same subprog */
/* 檢查所有 JMP 相關的 insn 都在同個 subprog (?) */
/* 當前的 subprog 的開頭 ~ 下一個 subprog 的開頭為當前 subprog 的 insn 範圍 */
/* 在 add_subprog_and_kfunc() 最後會有 fake subprog,就是要產生 subprog_end */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].imm == BPF_FUNC_tail_call &&
insn[i].src_reg != BPF_PSEUDO_CALL)
subprog[cur_subprog].has_tail_call = true;
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
}
next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
* or unconditional jump back
*/
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
}
/* 換下一個 subprog */
subprog_start = subprog_end;
cur_subprog++;
if (cur_subprog < env->subprog_cnt)
subprog_end = subprog[cur_subprog + 1].start;
}
}
return 0;
}
check_btf_info()
檢查 BTF (BPF Type Format) :
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct btf *btf;
int err;
if (!attr->func_info_cnt && !attr->line_info_cnt) {
/**
* traverse 所有的 env->subprog_info[],檢查是否有 has_ld_abs / has_tail_call 的情況
* @ LD_ABS is not allowed in subprogs without BTF
* @ tail_call is not allowed in subprogs without BTF
*/
if (check_abnormal_return(env))
return -EINVAL;
return 0;
}
/* TODO: 以下還沒 trace 到 */
...
return 0;
}
resolve_pseudo_ldimm64()
找到 ld_imm64
insn 中的 imm,並將 map_fd 轉換成 struct bpf_map *
:
/* find and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
* 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
*
* NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi; /* 取得 insn list */
int insn_cnt = env->prog->len;
int i, j, err;
err = bpf_prog_calc_tag(env->prog); /* 為 prog 計算 tag,過程中會使用到 SHA1 */
if (err)
return err;
/* 遍歷每個 insn */
for (i = 0; i < insn_cnt; i++, insn++) {
/**
* 使用 LDX 但是不是用 MEM,或是 imm 不為 0,就代表使用到其他保留的欄位
* 猜想是因為 LDX 在 userland 只有 BPF_LDX_MEM 能用,而使用時 imm == 0 以及 mode == BPF_MEM
*/
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
/**
* usermode 使用 BPF_LD_IMM64_RAW macro 時為 {.code = BPF_LD | BPF_DW | BPF_IMM,...}
* 並且只有 BPF_LD_MAP_FD 以及 BPF_LD_IMM64 macro 作為 wrapper 使用 BPF_LD_IMM64_RAW
* 兩者差在 LD_MAP_FD 的 src 會是 BPF_PSEUDO_MAP_FD (1),而 LD_IMM64 src 為 0
*/
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map;
struct fd f;
u64 addr;
/**
* ld_imm 為最後一個 insn,或者是下一個 insn 有被使用
* 因為 bpf_insn 只能使用 imm32,因此如果要用 imm64 的話,
* 則需要兩個 insn,並且分別保存前 32 bits 的 imm 以及後 32 bits 的 imm
* 而第二個 insn 除了 imm 外其他欄位皆為 0
*/
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
/* 當 src_reg 不同時有不同的檢查機制,有的甚至直接當作合法的 (?) */
if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
aux = &env->insn_aux_data[i];
err = check_pseudo_btf_id(env, insn, aux);
if (err)
return err;
goto next_insn;
}
if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
aux = &env->insn_aux_data[i];
aux->ptr_type = PTR_TO_FUNC;
goto next_insn;
}
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
(insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
insn[1].imm != 0)) {
verbose(env,
"unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
/* 使用 BPF_LD_MAP_FD 時 imm 會是 fd number,透過此 func 來取得 kernel fd struct */
f = fdget(insn[0].imm);
/* kernel fd struct 是用欄位 file->private_data 來保存檔案資料,e.g. f.file->private_data */
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn[0].imm);
return PTR_ERR(map);
}
/**
* Validate that trace type programs use preallocated hash maps
* trace type 有 BPF_PROG_TYPE_KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT
*
* 除此之外還檢查 spinlock 的使用,如 socket filter, tracing prog 以及 sleepable prog
* 都不能使用 bpf_spin_lock,因為在 tracing prog tracepoint 在 locked region、或是在
* locked region 睡著,就會造成其他的 thread 拿不到 lock
*
* 設計考量 (?) 只允許 sleepable prog 使用 array, hash, ringbuf maps
*/
err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
}
aux = &env->insn_aux_data[i];
/**
* 當 src 是 BPF_PSEUDO_MAP_FD,就會把 imm 從 fd number 改成 map addr
* 因此 BPF_LD_MAP_FD 能夠取得指定 fd 的 map struct
*/
if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
addr = (unsigned long)map;
} else {
u32 off = insn[1].imm;
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
fdput(f);
return -EINVAL;
}
/* map 的檢測 ... */
aux->map_off = off;
addr += off;
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
/* check whether we recorded this map already */
for (j = 0; j < env->used_map_cnt; j++) {
if (env->used_maps[j] == map) { /* 找到 pre-allocate 的 map */
aux->map_index = j; /* aux 記錄下來*/
fdput(f); /* 釋放先前 fdget() 取得的 fd reference */
goto next_insn;
}
}
if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f);
return -E2BIG;
}
/* atomic 增加 map->refcnt */
bpf_map_inc(map);
aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
bpf_cgroup_storage_assign(env->prog->aux, map)) {
verbose(env, "only one cgroup storage of each type is allowed\n");
fdput(f);
return -EBUSY;
}
fdput(f);
next_insn:
insn++;
i++;
continue;
}
/* Basic sanity check before we invest more work here. */
if (!bpf_opcode_in_insntable(insn->code)) {
verbose(env, "unknown opcode %02x\n", insn->code);
return -EINVAL;
}
}
/* now all pseudo BPF_LD_IMM64 instructions load valid
* 'struct bpf_map *' into a register instead of user map_fd.
* These pointers will be used later by verifier to validate map access.
*/
/* 將 map_fd 轉換成 struct bpf_map * */
return 0;
}
-
關於
BPF_XXX
的 macro 於 include/uapi/linux/bpf_common.h 被定義 (uapi 為 user api ?),主要幾個重點 macro 如下:/** * user mode 傳入的是 bpf_insn.code,而通常都會以 | 的方式將 class / size / mode 組合起來,如: * .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM */ /* 0000 0111 */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 ... /* ld/ldx fields */ /* 0001 1000 */ #define BPF_SIZE(code) ((code) & 0x18) #define BPF_W 0x00 /* 32-bit */ ... /* 1110 0000 */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 ... /* alu/jmp fields */ /* 1111 0000 */ #define BPF_OP(code) ((code) & 0xf0) #define BPF_ADD 0x00 #define BPF_SUB 0x10 ... /* 並且在 common.h 中也有定義最多的 insn 數量為 4096 */ #ifndef BPF_MAXINSNS #define BPF_MAXINSNS 4096
LD
與LDX
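的差別在於 LDX 是一般的「從記憶體 (src_reg + off) 載入到 register」,只能使用 BPF_MEM mode;而 LD 則用於 ld_imm64 (BPF_IMM | BPF_DW) 以及 BPF_ABS / BPF_IND 這類特殊的載入。X 應該是源自 classic BPF 的 index register X,而不是 extended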
下個 check_cfg()
程式碼意外的少,主要是用 DFS 看 BPF 內是否有 loop (back edge):
/*
 * Non-recursive depth-first walk over the program's instructions.
 *
 * Allocates a per-instruction state array and an explicit DFS stack, seeds
 * the walk with insn 0, then keeps calling visit_insn() on the stack top
 * until the stack is empty.
 *
 * Returns 0 when every instruction ended up EXPLORED, -ENOMEM when an
 * allocation fails, -EINVAL when some instruction was never fully explored,
 * -EFAULT on internal inconsistencies, or the negative value returned by
 * visit_insn() unchanged.
 */
static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
int ret = 0;
int i;
/* per-insn DFS state, zero-initialised (0 == not yet discovered) */
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
/* explicit DFS stack holding the indices of the insns being walked */
insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
/*
 * NOTE(review): this early return frees insn_state without clearing
 * env->cfg.insn_state, unlike the err_free path below.
 */
kvfree(insn_state);
return -ENOMEM;
}
/* seed the walk with the first instruction */
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1; /* stack top */
while (env->cfg.cur_stack > 0) {
/* peek (do not pop) the insn on top of the stack */
int t = insn_stack[env->cfg.cur_stack - 1];
/* visit it; the return value decides whether it gets popped */
ret = visit_insn(t, insn_cnt, env);
switch (ret) {
case DONE_EXPLORING: /* insn t is finished: mark it and pop it */
insn_state[t] = EXPLORED;
env->cfg.cur_stack--;
break;
case KEEP_EXPLORING: /* nothing to pop here; iterate again */
break;
default:
/* a positive value other than KEEP_EXPLORING is unexpected */
if (ret > 0) {
verbose(env, "visit_insn internal bug\n");
ret = -EFAULT;
}
/* negative values are propagated to the caller as-is */
goto err_free;
}
}
/* the stack index must never drop below 0 */
if (env->cfg.cur_stack < 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
/* reject the program if any insn was not fully explored */
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
}
ret = 0; /* cfg looks good */
err_free:
/* common exit: release both arrays and clear the pointers kept in env */
kvfree(insn_state);
kvfree(insn_stack);
env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}
-
看過
kmalloc()
、vmalloc()
,但是就沒看過kvcalloc()
。為kvmalloc_array(n, size, flags | __GFP_ZERO)
,底層呼叫kvmalloc_node()
,function 的說明為:attempt to allocate physically contiguous memory, but upon failure, fall back to non-contiguous (vmalloc) allocation
就是一種
kmalloc()
以及vmalloc()
混用的 fu -
關鍵的部分在
visit_insn()
的回傳值:/* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ /** * init_explored_state(env, w) * env->insn_aux_data[idx].prune_point = true */ /* visit insn[t] */ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int ret; /* pseudo function call */ if (bpf_pseudo_func(insns + t)) return visit_func_call_insn(t, insn_cnt, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ /* 非 jmp 的 insn 只有單一 edge (fall-through) */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) return push_insn(t, t + 1, FALLTHROUGH, env, false); switch (BPF_OP(insns[t].code)) { case BPF_EXIT: /* EXIT */ return DONE_EXPLORING; case BPF_CALL: /* call pseudo func */ return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: /* jmp always */ if (BPF_SRC(insns[t].code) != BPF_K) return -EINVAL; /* unconditional jump with single edge */ /* 直接跳去 offset 的地方執行 */ ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, true); if (ret) /* ret != 0:KEEP_EXPLORING 或 error (DONE_EXPLORING 為 0) */ return ret; init_explored_state(env, t + insns[t].off + 1); if (t + 1 < insn_cnt) init_explored_state(env, t + 1); return ret; default: /* conditional jump with two edges */ init_explored_state(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); } }
-
其中
visit_func_call_insn()
:/* visit_callee 為 true 時,除了 fall-through 之外還會把 callee 的第一個 insn (t + imm + 1) 以 BRANCH edge push 進去 */ static int visit_func_call_insn(int t, int insn_cnt, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { int ret; /* 先 push call 之後的下一個 insn (fall-through edge) */ ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret) return ret; if (t + 1 < insn_cnt) init_explored_state(env, t + 1); if (visit_callee) { init_explored_state(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, false); } return ret; }
-
push_insn()
(這部份把 verbose 的程式碼拿掉了):/* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction * e - edge */ /** * 當 non-branch insn 的 fall-through 或 function call 執行 push_insn 時,loop_ok 為 false,代表不允許走到 discovered insn; * jmp (BPF_JA / conditional jmp) 的 loop_ok 為 true,但還需要 env->bpf_capable 才允許 back-edge * discovered ---> discovered (X) (loop) * discovered ---> explored (O) (cross-edge) */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, bool loop_ok) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; /* 已經走過了 */ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return DONE_EXPLORING; if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) return DONE_EXPLORING; /* insn 不合法 */ if (w < 0 || w >= env->prog->len) { return -EINVAL; } if (e == BRANCH) /* mark branch target for state pruning */ init_explored_state(env, w); if (insn_state[w] == 0) { /* 還沒 discovered */ /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; /* push to stack */ return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { /* back-edge (loop) */ /* EXPLORED 的狀態在 check_cfg 才會 assign */ if (loop_ok && env->bpf_capable) return DONE_EXPLORING; return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ /* 走到其他已經 explored 完畢的 edge */ insn_state[t] = DISCOVERED | e; } else { return -EFAULT; } return DONE_EXPLORING; }
-
insn state:
enum { /* insn 狀態 */ DISCOVERED = 0x10, /* 發現了,正在走 */ EXPLORED = 0x20, /* 此後的 insn 已經走完 */ /* insn 對應的 bb 情況,branch 會有分岔 */ FALLTHROUGH = 1, BRANCH = 2, }; enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, };
-
pseudo code:
procedure DFS-iterative(G,v): label v as discovered let S be a stack S.push(v) while S is not empty t <- S.pop() if t is what we are looking for: return t for all edges e in G.adjacentEdges(t) do if edge e is already labelled continue with the next edge w <- G.adjacentVertex(t,e) if vertex w is not discovered and not explored label e as tree-edge label w as discovered S.push(w) continue at 5 else if vertex w is discovered label e as back-edge else // vertex w is explored label e as forward- or cross-edge label t as explored S.pop()
-
到此還是不太了解 subprog 是以怎樣的方式存在,也許能透過 bpf check function do_check_subprogs()
來更加了解:
/*
 * Run the verifier over every subprogram whose BTF linkage is
 * BTF_FUNC_GLOBAL, each one starting from its own first instruction.
 *
 * Returns 0 when the program carries no func_info or when every checked
 * subprogram passes; otherwise returns the first non-zero result of
 * do_check_common().
 */
static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
int i, ret;
/* without func_info there is no linkage information to consult */
if (!aux->func_info)
return 0;
/*
 * Start at 1: index 0 is never checked here (presumably the main
 * program — confirm against do_check_main()).
 */
for (i = 1; i < env->subprog_cnt; i++) {
/* only BTF_FUNC_GLOBAL subprograms are verified on their own */
if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
continue;
/* begin verification at this subprogram's first instruction */
env->insn_idx = env->subprog_info[i].start;
/* a subprogram with index >= 1 is not expected to start at insn 0 */
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
verbose(env,
"Func#%d is safe for any args that match its prototype\n",
i);
}
}
return 0;
}
-
linux 在 function 上方有提供一段 useful 的註釋:
/* Verify all global functions in a BPF program one by one based on their BTF. * All global functions must pass verification. Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) * { * return bar(f); * } * int bar(int b) * { * ... * } * foo() will be verified first for R1=any_scalar_value. During verification it * will be assumed that bar() already verified successfully and call to bar() * from foo() will be checked for type match only. Later bar() will be verified * independently to check that it's safe for R1=any_scalar_value. */
代表在檢查
foo()
時,會假設被呼叫到的 function 皆已通過 verification (呼叫處只檢查 type 是否 match),而在檢查bar()
時會再自行檢查一次 -
do_check_subprogs()
會呼叫到do_check_common()
,而在bpf_check()
當中的do_check_main()
也會呼叫到do_check_common()
:/** * main prog 的 int subprog 會是 0,而其他 subprog 會是 1, 2, 3... * 所有 (包含 main prog) 的 subprog 數量為 env->subprog_cnt */ static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; env->prev_linfo = NULL; env->pass_cnt++; /* 紀錄被 check_common 的次數,不過在其他 function 當中似乎用不到 */ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; /* 每個 function frame 會有 10 個 register (struct bpf_reg_state regs[MAX_BPF_REG]) */ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); return -ENOMEM; } env->cur_state = state; /** * init_func_state(env, state, callsite, frameno, subprogno): * state->callsite = callsite; * state->frameno = frameno; * state->subprogno = subprogno; * init_reg_state(env, state); */ init_func_state(env, state->frame[0], /* #define BPF_MAIN_FUNC (-1) */ BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); /* 每個 frame 都會有一組 register 可以使用,what is frame @__@ ? */ regs = state->frame[state->curframe]->regs; if (subprog /* >= 1 為 subprog */ || env->prog->type == BPF_PROG_TYPE_EXT) { ret = btf_prepare_func_args(env, subprog, regs); if (ret) goto out; for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (regs[i].type == PTR_TO_CTX) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); else if (regs[i].type == PTR_TO_MEM_OR_NULL) { const u32 mem_size = regs[i].mem_size; mark_reg_known_zero(env, regs, i); regs[i].mem_size = mem_size; regs[i].id = ++env->id_gen; } } } else { /* == 0 為 main prog */ /* 1st arg to a function */ /* ctx = context, PTR_TO_CTX: reg points to bpf_context */ regs[BPF_REG_1].type = PTR_TO_CTX; /* func(r1, r2, ...) 
*/ mark_reg_known_zero(env, regs, BPF_REG_1); ret = btf_check_subprog_arg_match(env, subprog, regs); if (ret == -EFAULT) goto out; } ret = do_check(env); out: /* check for NULL is necessary, since cur_state can be freed inside * do_check() under memory pressure. */ if (env->cur_state) { free_verifier_state(env->cur_state, true); env->cur_state = NULL; } while (!pop_stack(env, NULL, NULL, false)); /* 清空 stack (?) */ if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); return ret; }
-
初始化 register state
init_reg_state()
:static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { /** * 注意這邊的 state type 為 bpf_func_state (state->frame[0]), * 而 do_check_common 的 state 為 bpf_verifier_state */ struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG /* 10 */; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; /* 將 BPF_REG_FP (frame pointer, reg_10) mark 成 known,並將內容都設成 0 */ mark_reg_known_zero(env, regs, BPF_REG_FP); regs[BPF_REG_FP].frameno = state->frameno; }
-
mark_reg_not_init()
滿多層的,不過大概作了以下事情來初始化 register 的狀態:memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); /* 將位置 ~ var_off 都設為 0 */ reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; // __mark_reg_unbounded(reg); reg->smin_value = S64_MIN; reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; reg->s32_min_value = S32_MIN; reg->s32_max_value = S32_MAX; reg->u32_min_value = 0; reg->u32_max_value = U32_MAX; reg->type = NOT_INIT;
看起來 code 的註釋想表達初始化前還有一個狀態為 unknown,這邊的行為就是初始化 unknown (?)
-
-
btf_check_subprog_arg_match()
檢查 BTF (BPF Type Format) 是否符合 regs 的 expectation:/** * 在 do_check_common() 當中,如果在做 main prog 的 arg match,回傳值只要不是 -EFAULT 就不算 error * 只有 -EFAULT 會被當作 error 發生 */ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; bool is_global; u32 btf_id; int err; /* prog 沒有附帶 func_info */ if (!prog->aux->func_info) return -EINVAL; btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) return -EFAULT; /** * function unreliable 代表 compiler 在 optimize 時把 static func 的參數給移除了, * 或是對 global function 的錯誤參數傳遞 (?) * 此 function 是以 BTF 觀點來檢查是否 match,因此可以先 mark 成 unreliable */ if (prog->aux->func_info_aux[subprog].unreliable) return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. * In such cases mark the function as unreliable from BTF point of view. */ if (err) prog->aux->func_info_aux[subprog].unreliable = true; return err; }
btf_check_func_arg_match()
尚未 trace (太大坨 + 沒被執行到)
-
重頭戲
do_check()
,第一部分說明了開頭的基本檢查以及 alu operation (刪除了 verbose,細節請參閱 src):static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { struct bpf_insn *insn; u8 class; int err; env->prev_insn_idx = prev_insn_idx; /* 超過範圍 */ if (env->insn_idx >= insn_cnt) { return -EFAULT; } insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); /* insn 太複雜,已經被處理過太多次了 */ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS /* 1000000 */) { return -E2BIG; } err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ goto process_bpf_exit; } if (signal_pending(current)) /* 看是否有 signal 正在傳送 */ return -EAGAIN; if (need_resched()) /* 也許需要 reschedule */ cond_resched(); if (env->log.level & BPF_LOG_LEVEL2 || (env->log.level & BPF_LOG_LEVEL && do_print_state)) { do_print_state = false; } if (bpf_prog_is_dev_bound(env->prog->aux) /* aux->offload_requested */) { err = bpf_prog_offload_verify_insn(env, env->insn_idx, env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); /* cur->frame[cur->curframe] */ /** * 如果 insn 沒有要用猜的 (speculative),則執行: * env->insn_aux_data[env->insn_idx].seen = env->pass_cnt * 為了讓後續 verify unreachable path 時,sanitize_dead_code() 還能夠 rewrite/sanitize */ sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) return err; } ...
-
class check list:
ALU
|ALU64
-check_alu_op()
NEG
END
MOV
AND
,SUB
, ...
LDX
-check_reg_arg()
for src and dst reg,check_mem_access()
,reg_type_mismatch()
STX
-check_reg_arg()
for src and dst reg,check_mem_access()
,reg_type_mismatch()
ST
-check_reg_arg()
for src reg,is_ctx_reg()
for dst reg,check_mem_access()
JMP
|JMP32
CALL
JA
EXIT
JNE
,JGE
...
LD
-check_ld_abs()
,check_ld_imm()
-
is_state_visited()
用來檢查 state 是否走訪過(comment 細節在 src):static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; /** * 當初在執行 init_explored_state(env, w) 有更動到 prune_point, * (env->insn_aux_data[idx].prune_point = true), * * 而這裡要 prune_point 為 true 才會繼續往下檢查,否則直接回傳 0 */ if (!env->insn_aux_data[insn_idx].prune_point) return 0; /* 後面還沒看 @_@ */ }
-
當 class 為
ALU
時,會執行check_alu_op()
檢查 ALU 內部的 operation 是否合法:/* check validity of 32-bit and 64-bit arithmetic operations */ /** * ALU 的 operation 有: * BPF_END, BPF_NEG, BPF_MOV, BPF_SUB, ... BPF_XOR */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { /** * #define BPF_SRC(code) ((code) & 0x08) * #define BPF_K 0x00 * #define BPF_X 0x08 * K 代表 operand 為 imm (constant),X 代表 operand 為 src register;NEG 只能是 BPF_K * off, imm 要為 0,src_reg 要用 reg_0 */ if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { /** * src_reg == reg_0, off == 0, imm 要是 16/32/64,class 不能是 ALU64 (只能是 ALU), * 進來此 function 時 class 可以是 alu or alu64 */ if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } /* check src operand */ /* 檢查 dst_reg 在 type SRC_OP 時是否合法 */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; /* 不能 pointer operation */ if (is_pointer_value(env, insn->dst_reg)) { return -EACCES; } /* check dest operand */ /* 檢查 dst_reg 在 type DST_OP 時是否合法 */ err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; } else if (opcode == BPF_MOV) { /* a constant (BPF_K) or the index register (BPF_X) */ if (BPF_SRC(insn->code) == BPF_X) { /* 要 imm == 0 && off == 0 */ if (insn->imm != 0 || insn->off != 0) { return -EINVAL; } /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { /* 要 src_reg == r0 && off == 0 */ if (insn->src_reg != BPF_REG_0 || insn->off != 0) { return -EINVAL; } } /* check dest operand, mark as required later */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; /* 如果是 index register */ if
(BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = regs + insn->src_reg; struct bpf_reg_state *dst_reg = regs + insn->dst_reg; if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ if (src_reg->type == SCALAR_VALUE && !src_reg->id) /* Assign src and dst registers the same ID * that will be used by find_equal_scalars() * to propagate min/max range. */ src_reg->id = ++env->id_gen; /* same id */ *dst_reg = *src_reg; /* assign (r1 = r2) */ dst_reg->live |= REG_LIVE_WRITTEN; /* 被寫 */ dst_reg->subreg_def = DEF_NOT_SUBREG; /* 64bit 沒 subreg (?) */ } else { /* R1 = (u32) R2 */ /* 不能 assign pointer */ if (is_pointer_value(env, insn->src_reg)) { return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; /* Make sure ID is cleared otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() */ dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; /* 32 bit 的 subreg_def 為 env->insn_idx + 1 */ dst_reg->subreg_def = env->insn_idx + 1; } else { /* 不是 pointer 也不是 scalar value */ mark_reg_unknown(env, regs, insn->dst_reg); } /** * BPF architecture zero extends alu32 ops into 64-bit registesr (a typo) */ zext_32_to_64(dst_reg); } } else { /* 如果是 imm */ /* case: R = imm * remember the value we stored into this reg */ /* clear any state __mark_reg_known doesn't set */ mark_reg_unknown(env, regs, insn->dst_reg); /* assign an imm --> scalar value */ regs[insn->dst_reg].type = SCALAR_VALUE; /* Mark the unknown part of a register (variable offset or scalar value) as * known to have the value @imm. */ if (BPF_CLASS(insn->code) == BPF_ALU64) { __mark_reg_known(regs + insn->dst_reg, insn->imm); } else { __mark_reg_known(regs + insn->dst_reg, (u32)insn->imm); } } } else if (opcode > BPF_END) { /* invalid */ return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { /* index register */ /* 要 imm == 0 && off == 0 */ if (insn->imm != 0 || insn->off != 0) { return -EINVAL; } /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { /* 要 src_reg == reg_0 && off == 0 */ /* reg_0 代表沒在用 */ if (insn->src_reg != BPF_REG_0 || insn->off != 0) { return -EINVAL; } } /* user space 中 bpf macro 結尾 _IMM 都是 BPF_K,而結尾 _REG 的都是 BPF_X */ /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { return -EINVAL; } if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; /* invalid shift */ if (insn->imm < 0 || insn->imm >= size) { return -EINVAL; } } /* check dest operand */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; /* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max * and var_off. */ return adjust_reg_min_max_vals(env, insn); /* 下方的某 section 獨立分析 */ } return 0; }
- 大多數都會檢查某個 operation 是否有用到不該用的欄位 (reserved field)
check_reg_arg()
經常被呼叫到,會根據 reg_arg_type (t) 去做一些基本的檢查:static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg, *regs = state->regs; bool rw64; /* 使用 register 超出範圍 */ if (regno >= MAX_BPF_REG) { return -EINVAL; } reg = &regs[regno]; /* is_reg64(): returns TRUE if the source or destination register operates on 64-bit */ rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ /* 還沒初始化 ? */ if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ /* fp (reg10) 唯讀 */ if (regno == BPF_REG_FP) return 0; if (rw64) /* zext == zero extended */ /** * The dst will be zero extended, so won't be sub-register anymore. * 代表有些 register 需要 sub-reg ? * * env->insn_aux_data[reg->subreg_def - 1].zext_dst = true; * reg->subreg_def = DEF_NOT_SUBREG; */ mark_insn_zext(env, reg); /** * 將 reg mark 成 read,並且有區分 64 and 32 * mark_reg_read() 會 traverse reg 的 parent (因此才有 sub reg ?) * 而且 parent == NULL 就不會被 mark (reg->live |= flag) */ return mark_reg_read(env, reg, reg->parent, rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ /* fp 唯讀 */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } /* written 沒有分 32/64 */ reg->live |= REG_LIVE_WRITTEN; /* 64 就不需要 subreg,32 則需要,並且值為 env->insn_idx + 1 (下一個 insn ?) */ /* 還不知道 subreg_def 是用來幹嘛的 */ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) /* 此時 t 可能為 DST_OP_NO_MARK or DST_OP */ mark_reg_unknown(env, regs, regno); } return 0; }
-
結論:
DST_OP
- check whether register used as dest operand can be written toSRC_OP
- check whether register used as source operand can be read
-
reg_arg_type
一共有三種:enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ };
-
-
第二部分說明
BPF_LDX
的行為:... else if (class == BPF_LDX) { enum bpf_reg_type *prev_src_type, src_reg_type; /* check for reserved fields is already done */ /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; src_reg_type = regs[insn->src_reg].type; /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, false); if (err) return err; prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) * save type to validate intersecting paths */ *prev_src_type = src_reg_type; } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: * src_reg == ctx in one branch and * src_reg == stack|map in some other branch. * Reject it. */ verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } ...
-
第三部分為
BPF_STX
:... else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_ATOMIC) { /* atomic operation */ err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; continue; } /* reversed field */ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { return -EINVAL; } /* check src1 operand */ /* readable */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ /* readable */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ /* BPF_READ = 1 * BPF_WRITE = 2 */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, false); if (err) return err; /* 當前的 ptr type */ prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { /* regs[insn->dst_reg].type */ *prev_dst_type = dst_reg_type; } else if (reg_type_mismatch(dst_reg_type /* src */, *prev_dst_type /* dst */)) { /* return src != prev (reg type 不同,代表 different type) * && (!reg_type_mismatch_ok(src) || !reg_type_mismatch_ok(prev)) * * reg_type_mismatch_ok(): * Return true if it's OK to have the same insn return a different type * PTR_TO_CTX, PTR_TO_SOCKET, ... 都會回傳 false */ /* 相同的 insn 不能用不同的 pointer (?) */ verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } ...
-
其中有一個
check_mem_access()
被頻繁使用到,目的是用來檢查(regno + off)
是否可存取,並且根據不同的bpf_access_type
有不同的行為 (src):- 檢查當
t = (read | write)
時, memory at(regno + off)
是否可以存取 t = write
-->value_regno
為其值要被存入 memory 的 registert = read
-->value_regno
為接收從 memory 讀出之值的 registert = write && value_regno == -1
--> unknown value 要被存入 memoryt = read && value_regno == -1
-- > 不管我們讀什麼
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); /* 看是 BPF_W, BPF_DW 等等就會對到 4, 8 ... */ if (size < 0) return size; /* alignment checks will add in reg->off themselves */ /* 加上 offset 之後需要做對齊的 check,必須要 align size */ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; /* for access checks, reg->off is just part of off */ off += reg->off; if (reg->type == PTR_TO_MAP_KEY) { /* map key */ if (t == BPF_WRITE) { /* 不能 change key */ return -EACCES; } /* check read/write into a memory region with possible variable offset */ err = check_mem_region_access(env, regno, off, size, reg->map_ptr->key_size, false); if (err) return err; if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && /* return allow_ptr_leaks ? 
false : reg->type != SCALAR_VALUE; */ is_pointer_value(env, value_regno)) { return -EACCES; } /* type == BPF_WRITE && !(bpf_map_flags_to_cap(map) & BPF_MAP_CAN_WRITE) * type == BPF_READ && !(bpf_map_flags_to_cap(map) & BPF_MAP_CAN_READ) * will return -EACCES */ err = check_map_access_type(env, regno, off, size, t); if (err) return err; /* 沒有 spinlock 的話,check_map_access() 就是 check_mem_region_access() 的 wrapper */ err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* if map is read-only, track its contents as scalars */ if (tnum_is_const(reg->var_off) && bpf_map_is_rdonly(map) && map->ops->map_direct_value_addr) { int map_off = off + reg->var_off.value; u64 val = 0; err = bpf_map_direct_read(map, map_off, size, &val); if (err) return err; regs[value_regno].type = SCALAR_VALUE; __mark_reg_known(&regs[value_regno], val); } else { mark_reg_unknown(env, regs, value_regno); } } } else if (reg->type == PTR_TO_MEM) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { /* 不給 leak */ return -EACCES; } err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); if (!err && t == BPF_READ && value_regno >= 0) /* 寫成 unknown */ mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; struct btf *btf = NULL; u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { /* 不給 leak */ return -EACCES; } err = check_ctx_reg(env, reg, regno); if (err < 0) return err; err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id); /* CTX 的部分還沒看 */ if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); } else { mark_reg_known_zero(env, regs, value_regno); if (reg_type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (reg_type == PTR_TO_BTF_ID || reg_type == PTR_TO_BTF_ID_OR_NULL) { regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; } } regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ /* Check that the stack access at 'regno + off' falls within the maximum stack bounds */ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; /* return env->cur_state->frame[reg->frameno] */ state = func(env, reg); /* 如果原本 stack size 比較小,就執行 * env->subprog_info[func->subprogno].stack_depth = -off */ err = update_stack_depth(env, state, off); if (err) return err; /* 這兩個 check_stack 好大坨 @__@,之後在看 */ if (t == BPF_READ) err = check_stack_read(env, regno, off, size, value_regno); else err = check_stack_write(env, regno, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { /* pass packet */ } else if (reg->type == PTR_TO_FLOW_KEYS) { /* pass flow keys */ } else if (type_is_sk_pointer(reg->type)) { /* pass socket */ } else if (reg->type == PTR_TO_TP_BUFFER) { /* pass tp buffer */ } else if (reg->type == PTR_TO_BTF_ID) { /* pass btf id */ } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == PTR_TO_RDONLY_BUF) { if (t == BPF_WRITE) { /* read only */ return -EACCES; } err = check_buffer_access(env, reg, regno, off, size, false, "rdonly", &env->prog->aux->max_rdonly_access); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == 
PTR_TO_RDWR_BUF) { err = check_buffer_access(env, reg, regno, off, size, false, "rdwr", &env->prog->aux->max_rdwr_access); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { /* invalid mem access */ return -EACCES; } /* 到這邊發現,如果 operation 是 BPF_READ (or read 相關的 ?),register 就會被 mark 成 unknown, * 作用為將幾乎整個 struct 設為 0 */ /* BPF_WRITE 不需要 */ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ /* truncate register to smaller size (in bytes) */ /* https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/verifier.c#L3778 */ coerce_reg_to_size(&regs[value_regno], size); } return err; }
-
各種 check memory/reg 的權限,其中
check_mem_region_access()
的分析如下:static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, int off, int size, u32 mem_size, bool zero_size_allowed) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; ... /* smin 為 neg,並且 * smin 為最小值 (不能再低) || off + reg->smin_value 在 cast 成 s32 後 != 原本的值 || * reg->smin_value + off 為負數 */ if (reg->smin_value < 0 && (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { return -EACCES; } err = __check_mem_access(env, regno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { return err; } /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. * If reg->umax_value + off could overflow, treat that as unbounded too. */ /* unbounded memory access */ if (reg->umax_value >= BPF_MAX_VAR_OFF /* (1 << 29), 0x20000000 */) { return -EACCES; } err = __check_mem_access(env, regno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { return err; } return 0; }
-
__check_mem_access()
檢查 memory region read/write:/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ static int __check_mem_access(struct bpf_verifier_env *env, int regno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; reg = &cur_regs(env)[regno]; switch (reg->type) { /* 到此已經代表 invalid,只是根據不同的 type 有不同的 verbose */ } return -EACCES; }
-
stack 的 bound check function
check_stack_access_within_bounds()
:/* 'off' includes `regno->offset`, but not its dynamic part (if any). */ /* ACCESS_DIRECT = 1, the access is performed by an instruction * ACCESS_HELPER = 2, the access is performed by a helper */ static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, enum stack_access_src src, enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; struct bpf_func_state *state = func(env, reg); int min_off, max_off; int err; char *err_extra; if (src == ACCESS_HELPER) /* We don't know if helpers are reading or writing (or both). */ err_extra = " indirect access to"; else if (type == BPF_READ) err_extra = " read from"; else err_extra = " write to"; /* var_off 存的是 struct tnum * mask 為 1 代表未知,!mask 為 true 代表都知道了 --> const */ if (tnum_is_const(reg->var_off) /* !reg->var_off.mask */) { min_off = reg->var_off.value + off; if (access_size > 0) /* access_size 為 user 傳入的 W, DW 等等 */ max_off = min_off + access_size - 1; else max_off = min_off; } else { /* invalid unbounded variable-offset */ if (reg->smax_value >= BPF_MAX_VAR_OFF /* >= 0x20000000 */ || reg->smin_value <= -BPF_MAX_VAR_OFF /* <= -0x20000000 */) { return -EACCES; } min_off = reg->smin_value + off; if (access_size > 0) max_off = reg->smax_value + off + access_size - 1; else max_off = min_off; } /* Check that the stack access at the given offset is within bounds * maximum valid offset is -1 * minimum valid offset is -MAX_BPF_STACK (-512) for write, -state->allocated_stack for * read * * 也就是 write 時 stack access 的 range 為 -MAX_BPF_STACK <= off <= -1, * read 時下界則是 -state->allocated_stack */ err = check_stack_slot_within_bounds(min_off, state, type); if (!err) err = check_stack_slot_within_bounds(max_off, state, type); if (err) { /* verbose */ } return err; }
- 檢查當
-
第四部份為
BPF_ST
:... else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false); if (err) return err; } ...
-
第五部分為
JMP
系列:... else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); env->jmps_processed++; /* function call */ if (opcode == BPF_CALL) { /* reserved fields */ if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL && insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { return -EINVAL; } /* function call 不能 hold lock */ if (env->cur_state->active_spin_lock && (insn->src_reg == BPF_PSEUDO_CALL || insn->imm != BPF_FUNC_spin_unlock)) { return -EINVAL; } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) err = check_kfunc_call(env, insn); else err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; } else if (opcode == BPF_JA) { /* reserved field */ if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { return -EINVAL; } /* 直接跳過 insn->off 個 insn */ env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { /* reserved field */ if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { return -EINVAL; } /* missing spinlock,應該是要被 release 的 ? 
*/ if (env->cur_state->active_spin_lock) { return -EINVAL; } if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; continue; } err = check_reference_leak(env); if (err) return err; err = check_return_code(env); if (err) return err; process_bpf_exit: update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); if (err < 0) { if (err != -ENOENT) return err; break; } else { do_print_state = true; continue; } } else { /* 其他的 condition jump */ err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } } ...
-
一共有三種呼叫方式:
BPF_PSEUDO_CALL
-check_func_call()
BPF_PSEUDO_KFUNC_CALL
-check_kfunc_call()
- other -
check_helper_call()
-
check_helper_call()
,在__BPF_FUNC_MAPPER
(src) 當中有定義 fn 對應到的 function:static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; bool changes_data; int i, err, func_id; /* find function prototype */ func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID /* 165 */) { return -EINVAL; } /* e.g. sk_filter_func_proto */ if (env->ops->get_func_proto) /* 舉 lookup_elem 為例子, * 一開始執行 sk_filter_func_proto,一層層找對應 func_id 以及 type 的 func proto, * 最後在 bpf_base_func_proto 找到 bpf_map_lookup_elem_proto */ fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { /* unknown */ return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ /* 若 helper 為 gpl_only,prog 的 license 必須與 GPL 相容 */ if (!env->prog->gpl_compatible && fn->gpl_only) { return -EINVAL; } /* 不允許在 probe 階段呼叫 */ if (fn->allowed && !fn->allowed(env->prog)) { return -EINVAL; } /* With LD_ABS/IND some JITs save/restore skb from r1. 
*/ /* 有些 func 會修改到 pkt 的內容 */ changes_data = bpf_helper_changes_pkt_data(fn->func); /* 如果會修改 pkt 的內容,但是 r1 卻不要求指向 ctx,為 misconfig */ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { return -EINVAL; } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; /* check_raw_mode_ok() - arg type 為 ARG_PTR_TO_UNINIT_MEM 只能有一個 * check_arg_pair_ok() - 另一個滿有趣的限制 - arg1 不能是 const type,arg5 不能是 ptr_to_mem type * ,以及 argn 跟 argn+1 必須要是 (ptr, size) or (size, ptr), * 否則就不能用 ptr * check_btf_id_ok() - ARG_PTR_TO_BTF_ID 跟 fn->arg_btf_id[i] 必須同時有 / 同時沒有 * check_refcount_ok() - arg type 為 ARG_PTR_TO_SOCK_COMMON 只能有一個 (unref) */ err = check_func_proto(fn, func_id); if (err) { /* misconfig */ return err; } meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS /* 5 */; i++) { /* 有點大坨,還沒看 */ err = check_func_arg(env, i, &meta, fn); if (err) return err; } /* 更新 env->insn_aux_data[insn_idx]->map_ptr_state */ err = record_func_map(env, &meta, func_id, insn_idx); if (err) return err; /* 更新 env->insn_aux_data[insn_idx]->map_key_state */ err = record_func_key(env, &meta, func_id, insn_idx); if (err) return err; /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { /* 前面整個 meta 都被設成 0 了不是 (?) */ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1, false); if (err) return err; } if (func_id == BPF_FUNC_tail_call) { /* ... */ } else if (is_release_function(func_id)) { /* ... 
*/ } regs = cur_regs(env); if (func_id == BPF_FUNC_get_local_storage && /* get storage 的 reg2 要是 null */ !register_is_null(&regs[BPF_REG_2])) { return -EINVAL; } if (func_id == BPF_FUNC_for_each_map_elem) { err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_map_elem_callback_state); if (err < 0) return -EINVAL; } if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) return err; } /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { /* __mark_reg_unknown(env, reg); reg->type = NOT_INIT; */ mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* helper call returns 64-bit value. */ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* why 32 bit needs a subreg */ /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ /* unknown 意即 scalar value (?) */ mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { /* return void 的情況代表 reg not init (?) */ regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || fn->ret_type == RET_PTR_TO_MAP_VALUE) { /* 還沒有 offset,因此先設為 zero */ mark_reg_known_zero(env, regs, BPF_REG_0); /* meta.map_ptr 在 check_func_arg 設置的,指向 map address */ /* https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/verifier.c#L4927 */ if (meta.map_ptr == NULL) { return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; } } else if (...) 
{ /* 省略一些 else if 的 case,行為大同小異 */ } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || /* 頗亂,先 pass */ } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || fn->ret_type == RET_PTR_TO_BTF_ID) { /* 頗亂,先 pass */ } else { /* unknown */ return -EINVAL; } /* 感覺 return type 結尾為 _OR_NULL 的,reg0 都會先設為 known_zero */ /* 只要 return type 後面有 _OR_NULL 都 return true */ if (reg_type_may_be_null(regs[BPF_REG_0].type)) /* 為 reg assign 一個 id,當 return or_null 或是 spinlock 都會,還有其他少數特例 */ regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { /* func_id 為 BPF_FUNC prefix */ /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map_ptr)) { /* 看不太懂,先 pass */ } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); /* 這是因為 func 有時候只能用某些 map,而 map 也有相對應的 func, * 因此要從 map & func 的觀點分別作檢查 */ err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; /* 有 function 的 type 與 get stack / get task stack 相關 */ if ((func_id == BPF_FUNC_get_stack || func_id == BPF_FUNC_get_task_stack) && !env->prog->has_callchain_buf) { if (err) { return err; } env->prog->has_callchain_buf = true; } if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; /* return value of bpf_helper_changes_pkt_data(fn->func) * 可能會改變 pkt 內容的 func */ if (changes_data) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. * * 大概就是只要 reg 存著指向 pkt 的 pointer,就呼叫 mark_reg_unknown() 將其設為 unknown * 不過 function 中有 spilled register,不確定是什麼 */ clear_all_pkt_pointers(env); return 0; }
-
不同 helper function 使用到的 func prototype 不太一樣,可以參考 src,下方為使用到的 struct
bpf_func_proto
:/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying */ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type; }; enum bpf_arg_type arg_type[5]; }; union { struct { u32 *arg1_btf_id; u32 *arg2_btf_id; u32 *arg3_btf_id; u32 *arg4_btf_id; u32 *arg5_btf_id; }; u32 *arg_btf_id[5]; }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; /* example */ const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, /* r1 要放指向 map 的 ptr,userland 可以用 BPF_LD_MAP_FD 取得 */ .arg2_type = ARG_PTR_TO_MAP_KEY, /* key */ };
-
conditional jmp (像是 JEQ、JNE 等等) 的檢查 function
check_cond_jmp_op
(src):static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *this_branch = env->cur_state; struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; int err; /* Only conditional jumps are expected to reach here. */ /* 不過為什麼會到這邊 @__@ ? */ if (opcode == BPF_JA || opcode > BPF_JSLE) { return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { /* BPF_X 用 register */ return -EINVAL; } /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* 不能比 ptr */ if (is_pointer_value(env, insn->src_reg)) { return -EACCES; } src_reg = ®s[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) { /* BPF_K */ return -EINVAL; } } /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; /* if false, jmp64 */ if (BPF_SRC(insn->code) == BPF_K) { /* 用 imm */ /* prediction */ pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { /* 用 src_reg 的 value * 並且 jmp32, * 由於 jmp32 的關係也使用 subreg (tnum_cast(a, 4)) */ pred = is_branch_taken(dst_reg, tnum_subreg(src_reg->var_off).value, opcode, is_jmp32); } else if (src_reg->type == SCALAR_VALUE && !is_jmp32 && tnum_is_const(src_reg->var_off)) { /* 跟上面只差在 jmp64 */ pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32); } else if (/* 與 pkt (packet) 相關 */) { ... } /* 1 - taken, 0 - not taken, -1 - unknown */ if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. */ /* mark_chain == Markov chain (?) 
*/ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err && !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; } if (pred == 1) { /* Only follow the goto, ignore fall-through. If needed, push * the fall-through branch for simulation under speculative * execution. */ if (!env->bypass_spec_v1 && !sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx)) return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { /* Only follow the fall-through branch, since that's where the * program will go. If needed, push the goto branch for * simulation under speculative execution. */ if (!env->bypass_spec_v1 && !sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1, *insn_idx)) return -EFAULT; return 0; } /* 得到新的 struct bpf_verifier_state */ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* 檢查是否正在與 const value 比較,讓我們可以調整 dst_reg 的 min/max, * 只有在 src/dst 都是 scalar (或指向同個 obj 的 ptr in future) 時才合法, * 否則不同的 base ptr 代表著 offset 不可比較 */ /* 設置 register value range 的 function - reg_set_min_max() */ if (BPF_SRC(insn->code) == BPF_X) { /* register operation */ struct bpf_reg_state *src_reg = ®s[insn->src_reg]; /* 都是 scalar */ if (dst_reg->type == SCALAR_VALUE && src_reg->type == SCALAR_VALUE) { if (tnum_is_const(src_reg->var_off) || (is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off)))) reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, src_reg->var_off.value, tnum_subreg(src_reg->var_off).value, opcode, is_jmp32); else if (tnum_is_const(dst_reg->var_off) || (is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off)))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], src_reg, dst_reg->var_off.value, tnum_subreg(dst_reg->var_off).value, opcode, is_jmp32); else if (!is_jmp32 && 
(opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); if (src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { find_equal_scalars(this_branch, src_reg); find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); } } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, (u32)insn->imm, opcode, is_jmp32); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && reg_type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ /* 如果 ret_type 是 _OR_NULL,則會被 mark 成 unknown_zero or not_null */ /* 突然想到 reg->id 是否為 ticket spinlock 取得號碼牌的機制 */ mark_ptr_or_null_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_ptr_or_null_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); } else if (...) { return -EACCES; } return 0; }
-
check branch taken 相關的 function
is_branch_taken()
:/* compute branch direction of the expression "if (reg opcode val) goto target;" * and return: * 1 - branch will be taken and "goto target" will be executed * 0 - branch will not be taken and fall-through to next insn * -1 - unknown. Example: "if (reg < 5)" is unknown when register value * range [0,10] */ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { if (__is_pointer_value(false, reg) /* reg->type != SCALAR_VALUE */) { /* return * PTR_TO_SOCKET || * PTR_TO_TCP_SOCK || * PTR_TO_MAP_VALUE || * PTR_TO_MAP_KEY || * PTR_TO_SOCK_COMMON */ if (!reg_type_not_null(reg->type)) return -1; /* If pointer is valid tests against zero will fail so we can * use this to direct branch taken. */ if (val != 0) return -1; switch (opcode) { case BPF_JEQ: return 0; case BPF_JNE: return 1; default: /* make sense,大於小於相關的都不確定 */ return -1; } } if (is_jmp32) return is_branch32_taken(reg, val, opcode); return is_branch64_taken(reg, val, opcode); }
-
sanitize_speculative_path()
:static struct bpf_verifier_state * sanitize_speculative_path(struct bpf_verifier_env *env, const struct bpf_insn *insn, u32 next_idx, u32 curr_idx) { struct bpf_verifier_state *branch; struct bpf_reg_state *regs; branch = push_stack(env, next_idx, curr_idx, true); if (branch && insn) { regs = branch->frame[branch->curframe]->regs; if (BPF_SRC(insn->code) == BPF_K) { mark_reg_unknown(env, regs, insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { mark_reg_unknown(env, regs, insn->dst_reg); mark_reg_unknown(env, regs, insn->src_reg); } } return branch; }
-
push_stack()
新增一個bpf_verifier_stack_elem
element:static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; int err; elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; /* copy value to elem */ elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; elem->log_pos = env->log.len_used; env->head = elem; env->stack_size++; /* copy 當前的整個 struct bpf_verifier_state */ err = copy_verifier_state(&elem->st, cur); if (err) goto err; /* elem->st 為 element state */ elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { /* jmp size 過於複雜 */ goto err; } if (elem->st.parent) { ++elem->st.parent->branches; } return &elem->st; err: free_verifier_state(env->cur_state, true); env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL, false)); return NULL; }
-
-
-
-
最後一個部分為
BPF_LD
:... else if (class == BPF_LD) { /* BPF_IMM 0x00 * BPF_ABS 0x20 * BPF_IND 0x40 * BPF_MEM 0x60 * BPF_LEN 0x80 * BPF_MSH 0xa0 */ u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { err = check_ld_abs(env, insn); if (err) return err; } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) return err; env->insn_idx++; /* BPF_LD_IMM64_RAW,所以下個 insn 不看 (存 imm 後 32 bit) */ sanitize_mark_insn_seen(env); /* mark 成看過的 */ } else { /* ABS IND IMM 都不是 */ return -EINVAL; } } else /* 未知的 insn class (invalid) */ return -EINVAL; } env->insn_idx++; } return 0; }
-
其中兩個 check_ld function
check_ld_abs()
以及check_ld_imm()
-
check_ld_abs()
確保以下幾件事:-
當 ctx == skb (socket buffer) 才會出現
LD_ABS|LD_IND
insn -
執行後 R1-R5 regs 會被 scratch (caller-saved regs 會被重設為 not init,之後不能直接讀)
-
R6-R9 不會 touch,並且將 return value 放在 R0
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = cur_regs(env); static const int ctx_reg = BPF_REG_6; u8 mode = BPF_MODE(insn->code); int i, err; /* 此 prog type 不能使用 ABS|IND */ if (!may_access_skb(resolve_prog_type(env->prog))) { return -EINVAL; } if (!env->ops->gen_ld_abs) { /* misconfigured */ return -EINVAL; } /* uses reserved fields */ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { return -EINVAL; } /* check whether implicit source operand (register R6) is readable */ /* r6 存放 skb data,確保可以讀 (SRC_OP) */ err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err; err = check_reference_leak(env); /* ABS|IND 不能與 socket ref 同時用 (?) */ if (err) { return err; } /* 不能在 bpf_spin_lock active 時用 */ if (env->cur_state->active_spin_lock) { return -EINVAL; } if (regs[ctx_reg].type != PTR_TO_CTX) { /* r6 不指向 ctx */ return -EINVAL; } if (mode == BPF_IND) { /* src_reg 是否可讀 */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } /* reg->off 是 ctx ptr 的 offset * * 確保只能讀取 unmodified form: * - reg->off == 0 * - tnum_is_const(reg->var_off) * - !reg->var_off.value */ err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* mark destination R0 register as readable, since it contains * the value fetched from the packet. * Already marked as written above. */ /* unknown 就是 readable 嗎 ? */ mark_reg_unknown(env, regs, BPF_REG_0); /* ld_abs load up to 32-bit skb data. */ regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; }
-
-
check_ld_imm()
:
/* verify BPF_LD_IMM64 instruction
 *
 * The instruction must have size BPF_DW and off == 0, and its destination
 * register must be writable.  The resulting state of the destination
 * register depends on insn->src_reg:
 *   - 0:                    known scalar built from the imm fields of this
 *                           instruction (low 32 bits) and the next one
 *                           (high 32 bits);
 *   - BPF_PSEUDO_BTF_ID:    register marked known-zero;
 *   - BPF_PSEUDO_FUNC:      PTR_TO_FUNC with subprogno taken from insn[1].imm;
 *   - BPF_PSEUDO_MAP_VALUE: PTR_TO_MAP_VALUE at offset aux->map_off;
 *   - BPF_PSEUDO_MAP_FD:    CONST_PTR_TO_MAP;
 *   - anything else:        -EINVAL.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	/* &env->insn_aux_data[env->insn_idx] */
	struct bpf_insn_aux_data *aux = cur_aux(env);
	/* cur_func(env)->regs
	 * cur_func: env->cur_state->frame[env->cur_state->curframe]
	 */
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *dst_reg;
	struct bpf_map *map;
	int err;

	if (BPF_SIZE(insn->code) != BPF_DW) {
		/* must be an imm64 load */
		return -EINVAL;
	}

	/* reserved field */
	if (insn->off != 0) {
		return -EINVAL;
	}

	/* make sure the destination register can be written */
	err = check_reg_arg(env, insn->dst_reg, DST_OP);
	if (err)
		return err;

	dst_reg = &regs[insn->dst_reg];
	if (insn->src_reg == 0) {
		/* the 64-bit immediate is split across two instructions */
		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;

		dst_reg->type = SCALAR_VALUE;
		__mark_reg_known(&regs[insn->dst_reg], imm);
		return 0;
	}

	if (insn->src_reg == BPF_PSEUDO_BTF_ID /* 3 */) {
		mark_reg_known_zero(env, regs, insn->dst_reg);
		/* NOTE(review): the role of BTF_ID is not covered by these
		 * notes -- confirm against the kernel source.
		 */
		return 0;
	}

	if (insn->src_reg == BPF_PSEUDO_FUNC) {
		/* shadows the outer `aux` (insn aux data) inside this block */
		struct bpf_prog_aux *aux = env->prog->aux;
		u32 subprogno = insn[1].imm;

		if (!aux->func_info) {
			/* missing func_info */
			return -EINVAL;
		}

		/* the callback function must have static linkage */
		/* NOTE(review): presumably aux->func_info_aux[subprogno]
		 * describes the callback function -- confirm.
		 */
		if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
			return -EINVAL;
		}

		dst_reg->type = PTR_TO_FUNC;
		dst_reg->subprogno = subprogno;
		return 0;
	}

	map = env->used_maps[aux->map_index];
	/* NOTE(review): presumably resets most of the register state to
	 * zero -- confirm against mark_reg_known_zero().
	 */
	mark_reg_known_zero(env, regs, insn->dst_reg);
	/* record which map this register refers to */
	dst_reg->map_ptr = map;

	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
		dst_reg->type = PTR_TO_MAP_VALUE;
		dst_reg->off = aux->map_off;
		if (map_value_has_spin_lock(map))
			dst_reg->id = ++env->id_gen;
	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
		/* BPF_PSEUDO_MAP_FD is used to load the map's address */
		dst_reg->type = CONST_PTR_TO_MAP;
	} else {
		return -EINVAL;
	}

	return 0;
}
env->insn_aux_data[env->insn_idx]
為 insn 的 metadata
-
-
adjust min max 系列一共有三個 function:
- adjust_reg_min_max_vals() - register,不過只有在 check_alu_op() 被執行
- adjust_ptr_min_max_vals() - pointer,在 adjust_reg_min_max_vals() 被執行
- adjust_scalar_min_max_vals() - scalar,在 adjust_reg_min_max_vals() 被執行
adjust_reg_min_max_vals()
:
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
int err;
dst_reg = ®s[insn->dst_reg];
src_reg = NULL;
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
else
/* Make sure ID is cleared otherwise dst_reg min/max could be
* incorrectly propagated into other registers by find_equal_scalars()
*/
dst_reg->id = 0;
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = ®s[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
* an arbitrary scalar. Disallow all math except
* pointer subtraction
*/
if (opcode == BPF_SUB && env->allow_ptr_leaks) {
mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
}
verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]);
return -EACCES;
} else {
/* scalar += pointer
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
err = mark_chain_precision(env, insn->dst_reg);
if (err)
return err;
return adjust_ptr_min_max_vals(env, insn,
src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
err = mark_chain_precision(env, insn->src_reg);
if (err)
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
}
} else {
/* Pretend the src is a reg with a known value, since we only
* need to be able to read from this state.
*/
off_reg.type = SCALAR_VALUE;
__mark_reg_known(&off_reg, insn->imm);
src_reg = &off_reg;
if (ptr_reg) /* pointer += K */
return adjust_ptr_min_max_vals(env, insn,
ptr_reg, src_reg);
}
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
print_verifier_state(env, state);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
print_verifier_state(env, state);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}
insn 的 struct bpf_insn
:
/*
 * One eBPF instruction.
 *
 * dst_reg and src_reg are 4-bit fields sharing a single byte.  A 64-bit
 * immediate load occupies two consecutive instructions: the first imm
 * supplies the low 32 bits and the following instruction's imm the high
 * 32 bits (see check_ld_imm()).
 */
struct bpf_insn {
	__u8 code; /* opcode */
	__u8 dst_reg:4; /* dest register */
	__u8 src_reg:4; /* source register */
	__s16 off; /* signed offset */
	__s32 imm; /* signed immediate constant */
};
insn 的輔助 struct bpf_insn_aux_data
:
/*
 * Per-instruction auxiliary data kept by the verifier.
 *
 * The entry for the instruction being verified is
 * env->insn_aux_data[env->insn_idx].  The leading union holds data whose
 * meaning depends on the kind of instruction; for example check_ld_imm()
 * reads map_index / map_off for map-related BPF_LD_IMM64 instructions.
 */
struct bpf_insn_aux_data {
	union {
		enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
		unsigned long map_ptr_state; /* pointer/poison value for maps */
		s32 call_imm; /* saved imm field of call insn */
		u32 alu_limit; /* limit for add/sub register with pointer */
		struct {
			u32 map_index; /* index into used_maps[] */
			u32 map_off; /* offset from value base address */
		};
		struct {
			enum bpf_reg_type reg_type; /* type of pseudo_btf_id */
			union {
				struct {
					struct btf *btf;
					u32 btf_id; /* btf_id for struct typed var */
				};
				u32 mem_size; /* mem_size for non-struct typed var */
			};
		} btf_var;
	};
	u64 map_key_state; /* constant (32 bit) key tracking for maps */
	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
	bool zext_dst; /* this insn zero extends dst reg */
	u8 alu_state; /* used in combination with alu_limit */
	/* below fields are initialized once */
	unsigned int orig_idx; /* original instruction index */
	bool prune_point;
};
state list 的 struct bpf_verifier_state_list
:
/* linked list of verifier states used to prune search
 *
 * Each node embeds a full verifier state by value and points to the next
 * node through @next; @miss_cnt and @hit_cnt are per-node counters.
 */
struct bpf_verifier_state_list {
	struct bpf_verifier_state state;
	struct bpf_verifier_state_list *next;
	int miss_cnt, hit_cnt;
};
而 bpf_verifier_state
則是 (src 有提供一些 comment 幫助理解):
/* Maximum number of register states that can exist at once */
#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
/* Number of slots in bpf_verifier_state.frame[] */
#define MAX_CALL_FRAMES 8
/*
 * Complete verifier state at one point of the analysis.
 *
 * NOTE(review): fields left without a comment below are not exercised by
 * the code shown in these notes; consult the upstream header for their
 * exact meaning.
 */
struct bpf_verifier_state {
	/* per-call-frame function state; frame[curframe] is the active one */
	struct bpf_func_state *frame[MAX_CALL_FRAMES];
	/* push_stack() increments parent->branches for each state it pushes */
	struct bpf_verifier_state *parent;
	u32 branches;
	u32 insn_idx;
	/* index into frame[] of the frame currently being verified */
	u32 curframe;
	/* non-zero while a bpf_spin_lock is active; check_ld_abs() rejects then */
	u32 active_spin_lock;
	/* OR-ed with push_stack()'s `speculative` argument on the pushed copy */
	bool speculative;
	u32 first_insn_idx;
	u32 last_insn_idx;
	struct bpf_idx_pair *jmp_history;
	u32 jmp_history_cnt;
};
bpf_check()
下半段:
...
skip_full_check:
kvfree(env->explored_states);
if (ret == 0)
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
if (ret == 0)
ret = opt_remove_dead_code(env);
if (ret == 0)
ret = opt_remove_nops(env);
} else {
if (ret == 0)
sanitize_dead_code(env);
}
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = do_misc_fixups(env);
/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
}
if (ret == 0)
ret = fixup_call_args(env);
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
if (log->level && !log->ubuf) {
ret = -EFAULT;
goto err_release_maps;
}
if (ret)
goto err_release_maps;
if (env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
GFP_KERNEL);
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;
}
if (env->used_btf_cnt) {
/* if program passed verifier, update used_btfs in bpf_prog_aux */
env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
sizeof(env->used_btfs[0]),
GFP_KERNEL);
if (!env->prog->aux->used_btfs) {
ret = -ENOMEM;
goto err_release_maps;
}
memcpy(env->prog->aux->used_btfs, env->used_btfs,
sizeof(env->used_btfs[0]) * env->used_btf_cnt);
env->prog->aux->used_btf_cnt = env->used_btf_cnt;
}
if (env->used_map_cnt || env->used_btf_cnt) {
/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}
adjust_btf_func(env);
err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
if (!env->prog->aux->used_btfs)
release_btfs(env);
/* extension progs temporarily inherit the attach_type of their targets
for verification purposes, so set it back to zero before returning
*/
if (env->prog->type == BPF_PROG_TYPE_EXT)
env->prog->expected_attach_type = 0;
*prog = env->prog;
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
kfree(env);
return ret;
}