Skip to content

Latest commit

 

History

History
3929 lines (3378 loc) · 126 KB

eBPF.md

File metadata and controls

3929 lines (3378 loc) · 126 KB

eBPF

kernel.md有稍微 trace eBPF 的運作機制,而這邊會做更詳細的追蹤,使用的 kernel 版本為 5.13.11。

進入點 syscall

首先關於 bpf 的操作是透過 nr 321 的 syscall 來執行,下列為傳入的參數:

%rax System call %rdi %rsi %rdx
321 sys_bpf int cmd union bpf_attr *attr unsigned int size

之後會到 kernel 的 bpf syscall handler 來執行 (src):

/*
 * bpf(2) system call handler (excerpt: the remaining `case` arms of the
 * switch are elided with `...`).
 *
 * cmd   - BPF command selecting the operation (e.g. BPF_MAP_CREATE).
 * uattr - user-space pointer to the union bpf_attr describing the request.
 * size  - number of bytes user space provided at uattr.
 *
 * Returns whatever the per-command handler returns, or a negative errno
 * from one of the checks below.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr;
	int err;

	/* With unprivileged bpf disabled, only bpf_capable() callers get in. */
	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
		return -EPERM;

	/*
	 * The size passed in may differ from sizeof(attr). Going by its name
	 * and arguments, bpf_check_uarg_tail_zero() verifies that any bytes
	 * beyond sizeof(attr) are zero -- confirm against the helper.
	 */
	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
	if (err)
		return err;
	/* Clamp size to the smaller of the caller's size and sizeof(attr). */
	size = min_t(u32, size, sizeof(attr));

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	/* attr is zeroed first, so fields the caller did not supply read as 0. */
	memset(&attr, 0, sizeof(attr));
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	/*
	 * https://elixir.bootlin.com/linux/v5.13.11/source/security/security.c#L2566
	 * Security hook for bpf commands; a negative result aborts the call.
	 */
	err = security_bpf(cmd, &attr, size);
	if (err < 0)
		return err;

	/* Dispatch on the user-supplied cmd: create map, update map, etc. */
	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	...
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
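  • 這邊的 call flow 是從 __x64_sys_bpf --> __do_sys_bpf:SYSCALL_DEFINE3 macro 展開後,上面的函式本體會成為 __do_sys_bpf,而 __x64_sys_bpf 只是 syscall table 指向的 wrapper,負責從 pt_regs 取出參數後 (經由 __se_sys_bpf) 轉呼叫 __do_sys_bpf,本身沒有其他邏輯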

  • copy_from_user() 是從 user 提供的 ptr copy 資料回 kernel mode:

    /*
     * Copy n bytes from the user-space pointer `from` into the kernel
     * buffer `to`.
     *
     * Returns n unchanged when check_copy_size() rejects the request;
     * otherwise returns whatever _copy_from_user() returns (presumably the
     * number of bytes left uncopied, 0 on success -- confirm).
     */
    static __always_inline unsigned long __must_check
    copy_from_user(void *to, const void __user *from, unsigned long n)
    {
    	if (likely(check_copy_size(to, n, false)))
    		n = _copy_from_user(to, from, n);
    	return n;
    }

    會先執行 check_copy_size() 檢查資料的合法性,合法的定義如下:

    • not bogus address
    • fully contained by stack (or stack frame, when available)
    • fully within SLAB object (or object whitelist area, when available)
    • not in kernel text
  • 雖說 security_bpf() 看起來是檢測 bpf 的安全性,過程中會從 security_hook_heads+1800 嘗試拿出 function pointer 去執行 (?),但如果沒有特別設置,則什麼都不會做:

    /*
     * Security entry point for bpf(2): forwards cmd, attr and size to the
     * `bpf` hook through call_int_hook().
     * NOTE(review): the literal 0 is presumably the value returned when no
     * hook is registered -- confirm against call_int_hook().
     */
    int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
    {
    	return call_int_hook(bpf, 0, cmd, attr, size);
    }

BPF_MAP_CREATE --> map_create

map_create() 為 bpf 在建立 kernel 跟 userland 共享的記憶體空間,之後透過 map_lookup_elem() 來從 userland 取得 kernel space 最新資料,或是透過 map_update_elem() 將 userland 的資料更新上去。

首先建立時會參照 user 傳入的 union bpf_attr (size: 0x78):

/* Excerpt of union bpf_attr: only the BPF_MAP_CREATE member is shown. */
union bpf_attr {
	struct { /* anonymous struct used by BPF_MAP_CREATE command */
        /*
         * Annotator's observation: unless set up otherwise, user space
         * typically fills in only the first five members below.
         */
		__u32	map_type;	/* one of enum bpf_map_type */
		__u32	key_size;	/* size of key in bytes */
		__u32	value_size;	/* size of value in bytes */
		__u32	max_entries;	/* max number of entries in a map */
		__u32	map_flags;	/* BPF_MAP_CREATE related
					 * flags defined above.
					 */
        /* ------------------------------------------ */
		__u32	inner_map_fd;	/* fd pointing to the inner map */
		__u32	numa_node;	/* numa node (effective only if
					 * BPF_F_NUMA_NODE is set).
					 */
		char	map_name[BPF_OBJ_NAME_LEN];
		__u32	map_ifindex;	/* ifindex of netdev to create on */
		__u32	btf_fd;		/* fd pointing to a BTF type data */
		__u32	btf_key_type_id;	/* BTF type_id of the key */
		__u32	btf_value_type_id;	/* BTF type_id of the value */
		__u32	btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
						   * struct stored as the
						   * map value
						   */
	};
	...

map_create() 的程式碼如下:

/*
 * Handler for the BPF_MAP_CREATE command.
 *
 * attr - kernel copy of the user-supplied attributes.
 *
 * Validates attr, allocates and initializes a map of the requested type,
 * attaches optional BTF information, allocates a map ID and finally an fd.
 * Returns the value of bpf_map_new_fd() on success (presumably the new fd
 * number), or a negative errno.
 */
static int map_create(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map *map;
	int f_flags;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/*
	 * BTF id consistency checks:
	 *  - btf_vmlinux_value_type_id is only accepted for
	 *    BPF_MAP_TYPE_STRUCT_OPS maps and excludes the key/value BTF ids;
	 *  - a key BTF id without a value BTF id is rejected.
	 * When only the first 4-5 members of attr are filled in, all three ids
	 * are 0 and neither branch rejects the request.
	 */
	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
		    attr->btf_key_type_id || attr->btf_value_type_id)
			return -EINVAL;
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		return -EINVAL;
	}

	/*
	 * Derive the file flags from map_flags; a negative result is an error.
	 * (Annotator: bpf fds also come in RD, WR and RDWR flavours.)
	 */
	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	/*
	 * NUMA = Non-Uniform Memory Access.
	 * Unless numa_node is NUMA_NO_NODE, it must be below nr_node_ids and
	 * refer to an online node. (Annotator: user space can apparently pick
	 * the node through the BPF_F_NUMA_NODE flag.)
	 */
	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr); /* allocate a new map described by attr */
	if (IS_ERR(map))
		return PTR_ERR(map);

	/* Copy attr->map_name into map->name. */
	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name));
	if (err < 0)
		goto free_map;

	/* Initial reference counts (one ref, one user ref) and the freeze mutex. */
	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);

	map->spin_lock_off = -EINVAL;
	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even the map's value is a kernel's struct,
	     * the bpf_prog.o must have BTF to begin with
	     * to figure out the corresponding kernel's
	     * counter part.  Thus, attr->btf_fd has
	     * to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			err = PTR_ERR(btf);
			goto free_map;
		}
		/* Kernel BTF objects are refused here; drop the reference first. */
		if (btf_is_kernel(btf)) {
			btf_put(btf);
			err = -EACCES;
			goto free_map;
		}
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	/*
	 * Annotator: the security_* functions are defined in
	 * security/security.c; they call a pre-defined hook if one exists and
	 * otherwise do nothing.
	 */
	err = security_bpf_map_alloc(map);
	if (err)
		goto free_map_sec;

    /* published the map to the userspace */
	/*
	 * Annotator: implemented on top of an IDR, i.e. an int -> pointer
	 * mapping built on a radix tree (also used for things such as device
	 * names).
	 */
	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	bpf_map_save_memcg(map);

	/* Assign a new fd to the map; on success err holds the returned value. */
	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}
	/* Normal exit: hand back the non-negative result of bpf_map_new_fd(). */
	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map:
	btf_put(map->btf);
	map->ops->map_free(map);
	return err;
}
  • find_and_alloc_map() 會根據 attr->map_type 來建立 map (src):

    /*
     * Look up the ops table for attr->map_type and let it allocate the map.
     *
     * Returns the new map with ->ops and ->map_type filled in, or an
     * ERR_PTR() on failure (unknown type, failed type-specific check, or a
     * failed allocation).
     */
    static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
    {
    	const struct bpf_map_ops *ops;
    	u32 type = attr->map_type;
    	struct bpf_map *map;
    	int err;
    
        /*
         * Annotator: bpf_map_types has 29 slots in this build, e.g.
         * htab_map_ops, array_map_ops, ... A slot that a debugger prints as
         * `0x0 <fixed_percpu_data>` is simply a NULL entry (that is how
         * address 0 gets symbolized), not a real ops table; NULL entries
         * are rejected by the !ops check below.
         */
    	if (type >= ARRAY_SIZE(bpf_map_types))
    		return ERR_PTR(-EINVAL);
    
    	type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); /* sanitized table index (e.g. array = 2) */
    	ops = bpf_map_types[type]; /* ops function table for this map type */
    	if (!ops)
    		return ERR_PTR(-EINVAL);
        /**
         * Ops functions of one map type live in a single file; for arrays that is
         * https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/arraymap.c
         */
    
    	if (ops->map_alloc_check) {
            /* Type-specific validation of attr (for an array map: the array rules). */
    		err = ops->map_alloc_check(attr);
    		if (err)
    			return ERR_PTR(err);
    	}
    	/* A non-zero map_ifindex switches to the offload ops table. */
    	if (attr->map_ifindex)
    		ops = &bpf_map_offload_ops;
    	map = ops->map_alloc(attr); /* for an array map this calls array_map_alloc() */
    	if (IS_ERR(map))
    		return map;
        /* The map is allocated and validated; only ops and type remain to be recorded. */
    	map->ops = ops;
    	map->map_type = type;
    	return map;
    }
    • array_map_alloc() 做了以下事情:
      • 透過 round_up() bitwise 的操作,將 attr->max_entries 擴展到 >= attr->max_entries 的二冪次值
      • array_size += (u64) max_entries * elem_size 為 struct 需要的大小,array_size 為 sizeof(struct bpf_array),而 max_entries 為 extend 後的 entry 數量,elem_size 為 user 傳入的 attr->value_size
      • bpf_map_area_alloc() 以 array_size 為參數,間接呼叫 area = kmalloc_node(size, ...) 來配置記憶體
        • kmalloc_node() 與 kmalloc() 的差別在於 bpf 可能會需要選擇特定的 NUMA node,不過如果沒特別指定的話 (node 傳入 -1) 其實沒什麼差別
      • bpf_map_init_from_attr() 會根據傳入的 attr 來初始化 map,只是做一些簡單的 assign 而已

BPF_PROG_LOAD --> bpf_prog_load

bpf_prog_load 會接收使用者傳入的 attr,並且模擬 (emulate) 執行 insn 看是否合法 (是否有 infinite loop 等等),可說是 bpf 最關鍵的一個 function。

透過 switch (cmd) 會進入 bpf_prog_load() (src),而上半部分的程式碼在做 bpf_prog 的初始化,以及建構整個執行環境:

/*
 * Handler for the BPF_PROG_LOAD command -- first half (excerpt, the rest is
 * elided with `...`).
 *
 * attr  - kernel copy of the user-supplied attributes.
 * uattr - the original user-space pointer to those attributes.
 *
 * This half validates flags, license, instruction count and capabilities,
 * resolves the optional attach target, allocates the struct bpf_prog and
 * copies the instructions and program name in from user space.
 */
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog, *dst_prog = NULL;
	struct btf *attach_btf = NULL;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* Only the flags listed here are accepted. */
	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
				 BPF_F_ANY_ALIGNMENT |
				 BPF_F_TEST_STATE_FREQ |
				 BPF_F_SLEEPABLE |
				 BPF_F_TEST_RND_HI32))
		return -EINVAL;

	/*
	 * Without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, requesting
	 * BPF_F_ANY_ALIGNMENT requires bpf_capable().
	 */
	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
	    !bpf_capable())
		return -EPERM;

	/* copy eBPF program license from user space */
	/* At most sizeof(license) - 1 bytes are read; the last byte is forced to 0. */
	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	/* Annotator: license_is_gpl_compatible() checks against a list of compatible licenses. */
	is_gpl = license_is_gpl_compatible(license);

	/* The instruction-count ceiling depends on whether the caller is bpf_capable(). */
	if (attr->insn_cnt == 0 ||
	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
		return -E2BIG;
	/*
	 * A caller that is not bpf_capable() may only load
	 * BPF_PROG_TYPE_SOCKET_FILTER and BPF_PROG_TYPE_CGROUP_SKB programs;
	 * every other program type requires the capability.
	 */
	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
	    type != BPF_PROG_TYPE_CGROUP_SKB &&
	    !bpf_capable())
		return -EPERM;

	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (is_perfmon_prog_type(type) && !perfmon_capable())
		return -EPERM;

	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
	 * or btf, we need to check which one it is
	 */
	/*
	 * The fd is first tried as a bpf_prog; if that fails it is tried as a
	 * BTF object, which must then be kernel BTF.
	 */
	if (attr->attach_prog_fd) {
		dst_prog = bpf_prog_get(attr->attach_prog_fd);
		if (IS_ERR(dst_prog)) {
			dst_prog = NULL;
			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
			if (IS_ERR(attach_btf))
				return -EINVAL;
			if (!btf_is_kernel(attach_btf)) {
				/* attaching through specifying bpf_prog's BTF
				 * objects directly might be supported eventually
				 */
				btf_put(attach_btf);
				return -ENOTSUPP;
			}
		}
	} else if (attr->attach_btf_id) {
		/* fall back to vmlinux BTF, if BTF type ID is specified */
		attach_btf = bpf_get_btf_vmlinux();
		if (IS_ERR(attach_btf))
			return PTR_ERR(attach_btf);
		if (!attach_btf)
			return -EINVAL;
		btf_get(attach_btf);
	}

	/**
	 * bpf_prog_load_fixup_attach_type():
	 * Sets expected_attach_type in @attr if prog type requires it but has
	 * some attach types that have to be backward compatible
	 * (Annotator: only BPF_PROG_TYPE_CGROUP_SOCK gets its
	 * expected_attach_type set, to BPF_CGROUP_INET_SOCK_CREATE -- confirm
	 * against the helper.)
	 */
	bpf_prog_load_fixup_attach_type(attr);
	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
				       attach_btf, attr->attach_btf_id,
				       dst_prog)) {
		/*
		 * Taken when bpf_prog_load_check_attach() returns non-zero.
		 * (Annotator: that helper checks prog_type against
		 * expected_attach_type case by case.) The references taken
		 * above are dropped before failing with -EINVAL.
		 */
		if (dst_prog)
			bpf_prog_put(dst_prog);
		if (attach_btf)
			btf_put(attach_btf);
		return -EINVAL;
	}

	/* plain bpf_prog allocation */
	/*
	 * bpf_prog_size(insn_cnt) -- the size of struct bpf_prog plus the
	 * instructions, per the annotator -- is the allocation size handed to
	 * bpf_prog_alloc().
	 */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog) {
		if (dst_prog)
			bpf_prog_put(dst_prog);
		if (attach_btf)
			btf_put(attach_btf);
		return -ENOMEM;
	}

	prog->expected_attach_type = attr->expected_attach_type;
	prog->aux->attach_btf = attach_btf;
	prog->aux->attach_btf_id = attr->attach_btf_id;
	prog->aux->dst_prog = dst_prog;
	prog->aux->offload_requested = !!attr->prog_ifindex;
	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;

	err = security_bpf_prog_alloc(prog->aux);
	if (err)
		goto free_prog;

	prog->aux->user = get_current_user();
	prog->len = attr->insn_cnt;

	/* err is preset so the goto below reports -EFAULT if the copy fails. */
	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
			   bpf_prog_insn_size(prog)) != 0)
		goto free_prog_sec;

	prog->orig_prog = NULL;
	prog->jited = 0;

	/* The program starts out with one reference. */
	atomic64_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl ? 1 : 0;

	if (bpf_prog_is_dev_bound(prog->aux)) {
		err = bpf_prog_offload_init(prog, attr);
		if (err)
			goto free_prog_sec;
	}

	/* find program type: socket_filter vs tracing_filter */
	/*
	 * Annotator: looks up the ops for this prog type and assigns
	 * prog->aux->ops and prog->type. For BPF_PROG_TYPE_SOCKET_FILTER the
	 * ops are sk_filter_prog_ops and the type is
	 * BPF_PROG_TYPE_SOCKET_FILTER.
	 */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog_sec;

	prog->aux->load_time = ktime_get_boottime_ns();
    /* copy prog name from user mode */
	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
			       sizeof(attr->prog_name));
	if (err < 0)
		goto free_prog_sec;
	...
  • bpf_prog_alloc 相關 function 有使用到 GFP prefix 的 flag,而 GFP 本身為 Get Free Pages = __get_free_pages

    • 指定 allocate memory 時的行為,e.g. GFP_ATOMIC 為在 allocate page 時不會有 context-switch

    • 背後會透過 __vmalloc 來建立存放 bpf_prog

    • 程式碼 (src):

      /*
       * Allocate a struct bpf_prog of `size` bytes together with its per-CPU
       * statistics.
       *
       * gfp_extra_flags is OR-ed into the base allocation flags.
       * Returns the new program, or NULL if any allocation fails.
       */
      struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
      {
          /* Base GFP flags plus whatever the caller asked for. */
      	gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
      	struct bpf_prog *prog;
      	int cpu;
      
          /* First build a prog struct that has no stats attached yet. */
      	prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
      	if (!prog)
      		return NULL;
      
          /*
           * The stats are attached here. The value returned is not an ordinary
           * kernel address (e.g. 0x607ff0c030a0) -- presumably because it is a
           * per-CPU pointer that has to go through per_cpu_ptr(), as the loop
           * below does.
           */
      	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
      	if (!prog->stats) {
      		/* Undo the allocations made by bpf_prog_alloc_no_stats(). */
      		free_percpu(prog->active);
      		kfree(prog->aux);
      		vfree(prog);
      		return NULL;
      	}
      
          /* Initialize each possible CPU's stats sync object. */
      	for_each_possible_cpu(cpu) {
      		struct bpf_prog_stats *pstats;
      
      		pstats = per_cpu_ptr(prog->stats, cpu);
      		u64_stats_init(&pstats->syncp);
      	}
      	return prog;
      }

      過程中呼叫的 bpf_prog_alloc_no_stats() 為主要 allocate 的部分:

      /*
       * Allocate the struct bpf_prog itself, its bpf_prog_aux and the per-CPU
       * `active` counter, without the statistics.
       *
       * Returns the new program, or NULL if any of the three allocations
       * fails (earlier allocations are released first).
       */
      struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
      {
      	gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
      	struct bpf_prog_aux *aux;
      	struct bpf_prog *fp;
      
          /*
           * Round size up to a whole number of pages (a multiple of PAGE_SIZE,
           * not a power of two); fp->pages below relies on this.
           */
      	size = round_up(size, PAGE_SIZE); /* a small prog simply gets one page (0x1000) */
      	fp = __vmalloc(size, gfp_flags);
      	if (fp == NULL)
      		return NULL;
      
      	aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
      	if (aux == NULL) {
      		vfree(fp);
      		return NULL;
      	}
      	fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
      	if (!fp->active) {
      		vfree(fp);
      		kfree(aux);
      		return NULL;
      	}
      
      	fp->pages = size / PAGE_SIZE; /* number of pages the prog occupies */
      	fp->aux = aux;
      	fp->aux->prog = fp;
      	fp->jit_requested = ebpf_jit_enabled();
      
      	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
          /* Initialize the two mutexes kept in aux. */
      	mutex_init(&fp->aux->used_maps_mutex);
      	mutex_init(&fp->aux->dst_mutex);
      
      	return fp;
      }
      • 一共分配三塊 memory: bpf_prog 本體 (__vmalloc)、aux (kzalloc),以及 per-CPU 的 active (alloc_percpu_gfp)

      • ebpf_jit_enabled() 會影響到是否可以用 JIT 優化,還是只能都用 interpreter

        /* True only when the bpf_jit_enable knob is set and the build has an eBPF JIT. */
        static inline bool ebpf_jit_enabled(void)
        {
        	return bpf_jit_enable && bpf_jit_is_ebpf();
        }
        
        /* Reports whether the kernel was built with CONFIG_HAVE_EBPF_JIT. */
        static inline bool bpf_jit_is_ebpf(void)
        {
        /* Decided at kernel build time. */
        # ifdef CONFIG_HAVE_EBPF_JIT
        	return true;
        # else
        	return false;
        # endif
        }
        
        /* bpf_jit_enable is defined in kernel/bpf/core.c */
        #ifdef CONFIG_BPF_JIT
        /* All BPF JIT sysctl knobs here. */
        int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
    • 最後還會有一個 struct bpf_prog_stats,還蠻小的 (src):

      /*
       * Per-program statistics record, aligned to 2 * sizeof(u64).
       * NOTE(review): the field meanings are not shown in this excerpt;
       * presumably cnt is a run count, nsecs the accumulated run time and
       * misses a count of skipped runs, with syncp guarding updates --
       * confirm against the kernel source.
       */
      struct bpf_prog_stats {
      	u64 cnt;
      	u64 nsecs;
      	u64 misses;
      	struct u64_stats_sync syncp;
      } __aligned(2 * sizeof(u64));

到這邊為止,可以看一下 bpf_prog 目前長的樣子:

{
  pages = 1,
  jited = 0,
  jit_requested = 1,
  gpl_compatible = 1,
  cb_access = 0,
  dst_needed = 0,
  blinded = 0,
  is_func = 0,
  kprobe_override = 0,
  has_callchain_buf = 0,
  enforce_expected_attach_type = 0,
  call_get_stack = 0,
  type = BPF_PROG_TYPE_SOCKET_FILTER,
  expected_attach_type = BPF_CGROUP_INET_INGRESS,
  len = 37,
  jited_len = 0,
  tag = "\000\000\000\000\000\000\000",
  stats = 0x607ff0c030a0,
  active = 0x607ff0c03094,
  bpf_func = 0x0 <fixed_percpu_data>,
  aux = 0xffff888006783400,
  orig_prog = 0x0 <fixed_percpu_data>,
  insns = 0xffffc9000006d048,
  insnsi = 0xffffc9000006d048
}

bpf_prog_load() 的下半部分執行 verifier,檢測 insn 本身是否合法:

	/* Second half of bpf_prog_load(); the code before this point is elided. */
	...
	/* run eBPF verifier */
	err = bpf_check(&prog, attr, uattr);
	if (err < 0)
		goto free_used_maps;

	/* bpf_prog_select_runtime() may hand back a different prog pointer. */
	prog = bpf_prog_select_runtime(prog, &err);
	if (err < 0)
		goto free_used_maps;

	err = bpf_prog_alloc_id(prog);
	if (err)
		goto free_used_maps;

	/* Upon success of bpf_prog_alloc_id(), the BPF prog is
	 * effectively publicly exposed. However, retrieving via
	 * bpf_prog_get_fd_by_id() will take another reference,
	 * therefore it cannot be gone underneath us.
	 *
	 * Only for the time /after/ successful bpf_prog_new_fd()
	 * and before returning to userspace, we might just hold
	 * one reference and any parallel close on that fd could
	 * rip everything out. Hence, below notifications must
	 * happen before bpf_prog_new_fd().
	 *
	 * Also, any failure handling from this point onwards must
	 * be using bpf_prog_put() given the program is exposed.
	 */
	bpf_prog_kallsyms_add(prog);
	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
	bpf_audit_prog(prog, BPF_AUDIT_LOAD);

	/* On failure the reference is dropped; either way err is returned as is. */
	err = bpf_prog_new_fd(prog);
	if (err < 0)
		bpf_prog_put(prog);
	return err;

free_used_maps:
	/* In case we have subprogs, we need to wait for a grace
	 * period before we can tear down JIT memory since symbols
	 * are already exposed under kallsyms.
	 */
	__bpf_prog_put_noref(prog, prog->aux->func_cnt);
	return err;
	/* The labels below fall through: free_prog_sec also runs free_prog. */
free_prog_sec:
	free_uid(prog->aux->user);
	security_bpf_prog_free(prog->aux);
free_prog:
	if (prog->aux->attach_btf)
		btf_put(prog->aux->attach_btf);
	bpf_prog_free(prog);
	return err;
}

其中 bpf_check() 是關鍵。

bpf_check() - the verifier

verifier 使用到的 struct 可以在 include/linux/bpf_verifier.h 找到,主要有兩個:bpf_verifier_env (size: 0x1d30) 以及 bpf_verifier_log (size: 0x418):

/* single container for all structs
 * one verifier_env per bpf_check() call
 * (i.e. each bpf_check() invocation works with exactly one
 * bpf_verifier_env)
 */
struct bpf_verifier_env {
	u32 insn_idx;
	u32 prev_insn_idx;
	struct bpf_prog *prog;		/* eBPF program being verified */
	const struct bpf_verifier_ops *ops;
	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
	int stack_size;			/* number of states to be processed */
	bool strict_alignment;		/* perform strict pointer alignment checks */
	bool test_state_freq;
    ...
};
/* Size of the kernel-side scratch buffer kbuf in struct bpf_verifier_log. */
#define BPF_VERIFIER_TMP_LOG_SIZE	1024
/*
 * Verifier log state. bpf_check() fills level, ubuf and len_total from the
 * user-supplied log_level, log_buf and log_size attributes.
 */
struct bpf_verifier_log {
	u32 level;
	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
	char __user *ubuf;
	u32 len_used;
	u32 len_total;
};

bpf_check() 上半段程式碼如下 (src):

/*
 * Entry point of the eBPF verifier -- first half (excerpt, the rest is
 * elided with `...`).
 *
 * prog  - in/out pointer to the program being verified.
 * attr  - kernel copy of the BPF_PROG_LOAD attributes.
 * uattr - the original user-space pointer to those attributes.
 *
 * Sets up a bpf_verifier_env, validates the log attributes, derives the
 * privilege-dependent settings and then runs the individual checks in
 * order; the first failing check jumps to skip_full_check.
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
	      union bpf_attr __user *uattr)
{
	u64 start_time = ktime_get_ns();
	struct bpf_verifier_env *env;
	struct bpf_verifier_log *log;
	int i, len, ret = -EINVAL;
	bool is_priv;

	/* no program is valid */
	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
		return -EINVAL;

	/* 'struct bpf_verifier_env' can be global, but since it's not small,
	 * allocate/free it every time bpf_check() is called
	 */
	/* Zeroed memory that holds the env. */
	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
	if (!env)
		return -ENOMEM;

	/* The log is embedded in env. */
	log = &env->log;

	len = (*prog)->len;
	/* One struct bpf_insn_aux_data per instruction. */
	env->insn_aux_data =
		vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
	ret = -ENOMEM;
	if (!env->insn_aux_data)
		goto err_free_env;
	for (i = 0; i < len; i++)
		env->insn_aux_data[i].orig_idx = i;
	env->prog = *prog;
	/*
	 * Each prog->type has its own verifier ops, e.g.
	 * sk_filter_verifier_ops, kprobe_verifier_ops, ... (a table slot that
	 * a debugger prints as `0x0 <fixed_percpu_data>` is just a NULL
	 * entry). BPF_PROG_TYPE_SOCKET_FILTER maps to sk_filter_verifier_ops.
	 */
	env->ops = bpf_verifier_ops[env->prog->type];
	is_priv = bpf_capable(); /* capable(CAP_BPF) || capable(CAP_SYS_ADMIN) */

	/*
	 * The return value is ignored here.
	 * NOTE(review): presumably called for its side effect of populating the
	 * global btf_vmlinux that is checked further down -- confirm.
	 */
	bpf_get_btf_vmlinux();

	/* grab the mutex to protect few globals used by verifier */
	/* Only callers that are not bpf_capable() take the lock. */
	if (!is_priv)
		mutex_lock(&bpf_verifier_lock);

	if (attr->log_level || attr->log_buf || attr->log_size) {
		/* user requested verbose verifier output
		 * and supplied buffer to store the verification trace
		 */
		log->level = attr->log_level;
		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
		log->len_total = attr->log_size;

		ret = -EINVAL;
		/* log attributes have to be sane */
		/*
		 * Accepted values:
		 *   128 <= log size <= UINT_MAX >> 2  (i.e. 2**30 - 1)
		 *   log level is non-zero and has no bits outside BPF_LOG_MASK,
		 *   so 1 <= level <= 7
		 *   log buffer pointer is non-NULL
		 * Annotator: BPF_LOG_MASK = 1 (l1) | 2 (l2) | 4 (stats) == 0b111
		 */
		if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
		    !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
			goto err_unlock;
	}

	/* Annotator: not taken in the traced run (btf_vmlinux was not an error pointer). */
	if (IS_ERR(btf_vmlinux)) {
		/* Either gcc or pahole or kernel are broken. */
		verbose(env, "in-kernel BTF is malformed\n");
		ret = PTR_ERR(btf_vmlinux);
		goto skip_full_check;
	}

	/*
	 * strict_alignment follows BPF_F_STRICT_ALIGNMENT (false unless set),
	 * is forced on without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, and is
	 * finally forced off by BPF_F_ANY_ALIGNMENT.
	 */
	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
		env->strict_alignment = true;
	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
		env->strict_alignment = false;

	/*
	 * Annotator: each of the five helpers below boils down to
	 * capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN), i.e.
	 * perfmon_capable().
	 */
	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
	env->allow_uninit_stack = bpf_allow_uninit_stack();
	env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
  	env->bypass_spec_v1 = bpf_bypass_spec_v1();
	env->bypass_spec_v4 = bpf_bypass_spec_v4();

	/* capable(CAP_BPF) || capable(CAP_SYS_ADMIN): may the caller use bpf at all */
	env->bpf_capable = bpf_capable();

	if (is_priv)
		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;

	/*
	 * explored_states is an array of pointers to bpf_verifier_state_list
	 * (which record the verifier's explored states); only the pointer
	 * slots are allocated here, state_htab_size(env) of them.
	 */
	env->explored_states = kvcalloc(state_htab_size(env),
				       sizeof(struct bpf_verifier_state_list *),
				       GFP_USER);
	ret = -ENOMEM;
	if (!env->explored_states)
		goto skip_full_check;

	ret = add_subprog_and_kfunc(env);
	if (ret < 0)
		goto skip_full_check;

	ret = check_subprogs(env);
	if (ret < 0)
		goto skip_full_check;

	ret = check_btf_info(env, attr, uattr);
	if (ret < 0)
		goto skip_full_check;

	ret = check_attach_btf_id(env);
	if (ret)
		goto skip_full_check;

	ret = resolve_pseudo_ldimm64(env);
	if (ret < 0)
		goto skip_full_check;

	/* Annotator: bpf_prog_is_dev_bound() returns aux->offload_requested. */
	if (bpf_prog_is_dev_bound(env->prog->aux)) {
		ret = bpf_prog_offload_verifier_prep(env->prog);
		if (ret)
			goto skip_full_check;
	}

	ret = check_cfg(env);
	if (ret < 0)
		goto skip_full_check;

	/* do_check_main() only runs if do_check_subprogs() returned 0. */
	ret = do_check_subprogs(env);
	ret = ret ?: do_check_main(env);

	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
		ret = bpf_prog_offload_finalize(env);
	...
}
  • 一系列的檢查:
    • add_subprog_and_kfunc() --> add_subprog() --> find_subprog()
      • add_subprog() - 新增 sub prog (淺顯易懂 ?)
      • find_subprog() - 用 bsearch 找對應 off 的 subprog;add_subprog_and_kfunc() 一開始為 entry function 呼叫 add_subprog() 時 off 為 0,之後遇到 pseudo call / pseudo func 時 off 則為 i + insn->imm + 1
    • check_subprogs()
    • 只有特定 type 的 prog 會需要做更多的檢查:
      • check_btf_info()
      • check_attach_btf_id()
    • resolve_pseudo_ldimm64()
    • check_cfg()
    • do_check_subprogs()
    • do_check_main()

add_subprog_and_kfunc() 新增 sub prog 以及 kfunc:

/*
 * Scan the program once and register every subprogram and kernel function
 * it refers to.
 *
 * The entry function is registered at offset 0 first. Each pseudo func or
 * pseudo call instruction then registers a subprog at its target offset,
 * and each pseudo kfunc call is passed to add_kfunc_call().
 *
 * Returns 0 on success or the negative error of the first failing step.
 */
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
	struct bpf_subprog_info *subprog = env->subprog_info;
	struct bpf_insn *insn = env->prog->insnsi;
	int i, ret, insn_cnt = env->prog->len;

	/* Add entry function. */
	ret = add_subprog(env, 0);
	if (ret)
		return ret;

	for (i = 0; i < insn_cnt; i++, insn++) {
		/*
		 * Annotator's summary of the three predicates:
		 * call --> insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_CALL
		 * kfunc call --> insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == BPF_PSEUDO_KFUNC_CALL
		 * func --> insn->code == (BPF_LD | BPF_IMM | BPF_DW) && insn->src_reg == BPF_PSEUDO_FUNC
		 */
		/* Skip anything that is neither a func, a call nor a kfunc call. */
		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
		    !bpf_pseudo_kfunc_call(insn))
			continue;

		if (!env->bpf_capable) {
			verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
			return -EPERM;
		}

		/*
		 * func and call register a subprog at the target offset
		 * i + insn->imm + 1; a kfunc call registers the kernel
		 * function instead, not a subprog.
		 */
		if (bpf_pseudo_func(insn)) {
			ret = add_subprog(env, i + insn->imm + 1);
			if (ret >= 0)
				/* remember subprog */
				insn[1].imm = ret;
		} else if (bpf_pseudo_call(insn)) {
			ret = add_subprog(env, i + insn->imm + 1);
		} else {
			ret = add_kfunc_call(env, insn->imm);
		}

		if (ret < 0)
			return ret;
	}

	/* Add a fake 'exit' subprog which could simplify subprog iteration
	 * logic. 'subprog_cnt' should not be increased.
	 */
	/* The sentinel's start is insn_cnt, one past the last instruction. */
	subprog[env->subprog_cnt].start = insn_cnt;

	if (env->log.level & BPF_LOG_LEVEL2)
		for (i = 0; i < env->subprog_cnt; i++)
			verbose(env, "func#%d @%d\n", i, subprog[i].start);

	return 0;
}

但實際上新增 subprog 是由 add_subprog() 完成:

/*
 * Register a subprogram starting at instruction offset `off`.
 *
 * Returns the value of find_subprog() if a subprog at `off` already exists,
 * env->subprog_cnt - 1 after adding a new one, -EINVAL if `off` is outside
 * the program, or -E2BIG if BPF_MAX_SUBPROGS is already reached.
 */
static int add_subprog(struct bpf_verifier_env *env, int off)
{
	int insn_cnt = env->prog->len;
	int ret;

	if (off >= insn_cnt || off < 0) {
		verbose(env, "call to invalid destination\n");
		return -EINVAL;
	}
	ret = find_subprog(env, off); /* is a subprog starting at off already registered? */
	if (ret >= 0)
		return ret;
	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
		verbose(env, "too many subprograms\n");
		return -E2BIG;
	}
	/* determine subprog starts. The end is one before the next starts */
	/* Append the new entry; subprog_cnt is incremented as part of this statement. */
	env->subprog_info[env->subprog_cnt++].start = off;
	/*
	 * Re-sort the table using cmp_subprogs (not shown here; presumably
	 * orders by .start), so the new entry lands wherever its offset
	 * belongs -- not necessarily first.
	 */
	sort(env->subprog_info, env->subprog_cnt,
	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
	return env->subprog_cnt - 1;
}

接著執行 check_subprogs(),檢查每個 subprog 內 JMP 的目標是否都落在同一個 subprog 之內,以及每個 subprog 的最後一個 insn 是否為 exit 或無條件 jump:

/*
 * Walk every instruction and verify the subprogram layout:
 *  - record per subprog whether it contains a tail call or an LD_ABS/LD_IND;
 *  - every jump (other than exit/call) must target an instruction inside
 *    the subprog it belongs to;
 *  - the last instruction of each subprog must be an exit or an
 *    unconditional jump.
 *
 * Returns 0 if all checks pass, -EINVAL otherwise.
 */
static int check_subprogs(struct bpf_verifier_env *env)
{
	int i, subprog_start, subprog_end, off, cur_subprog = 0;
	struct bpf_subprog_info *subprog = env->subprog_info;
	struct bpf_insn *insn = env->prog->insnsi;
	int insn_cnt = env->prog->len;

	/* now check that all jumps are within the same subprog */

	/*
	 * A subprog spans from its own start up to, but not including, the
	 * start of the next subprog. The fake trailing subprog written at the
	 * end of add_subprog_and_kfunc() supplies subprog_end for the last
	 * real subprog.
	 */
	subprog_start = subprog[cur_subprog].start;
	subprog_end = subprog[cur_subprog + 1].start;
	for (i = 0; i < insn_cnt; i++) {
		u8 code = insn[i].code;

		if (code == (BPF_JMP | BPF_CALL) &&
		    insn[i].imm == BPF_FUNC_tail_call &&
		    insn[i].src_reg != BPF_PSEUDO_CALL)
			subprog[cur_subprog].has_tail_call = true;
		if (BPF_CLASS(code) == BPF_LD &&
		    (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
			subprog[cur_subprog].has_ld_abs = true;
		/* Only jump-class instructions need the range check below. */
		if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
			goto next;
		/* exit and call are exempt from the range check. */
		if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
			goto next;

		/* Jump target: relative to the instruction after this one. */
		off = i + insn[i].off + 1;
		if (off < subprog_start || off >= subprog_end) {
			verbose(env, "jump out of range from insn %d to %d\n", i, off);
			return -EINVAL;
		}
next:
		if (i == subprog_end - 1) {
			/* to avoid fall-through from one subprog into another
			 * the last insn of the subprog should be either exit
			 * or unconditional jump back
			 */
			if (code != (BPF_JMP | BPF_EXIT) &&
			    code != (BPF_JMP | BPF_JA)) {
				verbose(env, "last insn is not an exit or jmp\n");
				return -EINVAL;
			}
			/* Move on to the next subprog. */
			subprog_start = subprog_end;
			cur_subprog++;
			if (cur_subprog < env->subprog_cnt)
				subprog_end = subprog[cur_subprog + 1].start;
		}
	}
	return 0;
}

check_btf_info() 檢查 BTF (BPF Type Format) :

/*
 * Check the BTF (BPF Type Format) information supplied with the program
 * (excerpt: everything after the early-return path is elided with `...`).
 *
 * When neither func_info nor line_info is supplied, the only check is
 * check_abnormal_return(); a non-zero result from it yields -EINVAL.
 */
static int check_btf_info(struct bpf_verifier_env *env,
			  const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct btf *btf;
	int err;

	if (!attr->func_info_cnt && !attr->line_info_cnt) {
		/*
		 * Annotator: check_abnormal_return() traverses
		 * env->subprog_info[] looking for has_ld_abs / has_tail_call:
		 * @ LD_ABS is not allowed in subprogs without BTF
		 * @ tail_call is not allowed in subprogs without BTF
		 */
		if (check_abnormal_return(env))
			return -EINVAL;
		return 0;
	}

	/* TODO (annotator): the code below has not been traced yet. */
	...
	return 0;
}

resolve_pseudo_ldimm64() 找到 ld_imm64 insn 中的 imm,並將 map_fd 轉換成 struct bpf_map *:

/* find and rewrite pseudo imm in ld_imm64 instructions:
 *
 * 1. if it accesses map FD, replace it with actual map pointer.
 * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
 *
 * NOTE: btf_vmlinux is required for converting pseudo btf_id.
 */
static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
	struct bpf_insn *insn = env->prog->insnsi; /* instruction array of the program */
	int insn_cnt = env->prog->len;
	int i, j, err;

	err = bpf_prog_calc_tag(env->prog); /* compute the program tag (annotator's note: SHA1 is used internally) */
	if (err)
        return err;
    /* walk every insn */
    for (i = 0; i < insn_cnt; i++, insn++) {
        /**
         * An LDX whose mode is not BPF_MEM, or whose imm is non-zero, is using
         * reserved fields. Annotator's guess: user space can only emit LDX via
         * BPF_LDX_MEM, which always has imm == 0 and mode == BPF_MEM.
         */
		if (BPF_CLASS(insn->code) == BPF_LDX &&
		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
			verbose(env, "BPF_LDX uses reserved fields\n");
			return -EINVAL;
		}

        /**
         * In user space the BPF_LD_IMM64_RAW macro produces {.code = BPF_LD | BPF_DW | BPF_IMM,...}
         * and (per the annotator) only BPF_LD_MAP_FD and BPF_LD_IMM64 wrap it.
         * They differ in src_reg: BPF_PSEUDO_MAP_FD (1) for LD_MAP_FD, 0 for LD_IMM64.
         */
		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
			struct bpf_insn_aux_data *aux;
			struct bpf_map *map;
			struct fd f;
			u64 addr;

            /**
             * Reject when ld_imm64 is the last insn (no second slot) or when
             * the second slot uses any field other than imm.
             * A bpf_insn only carries a 32-bit imm, so a 64-bit immediate
             * occupies two insns: insn[0].imm holds the low 32 bits and
             * insn[1].imm the high 32 bits (see the rewrite further down);
             * every other field of insn[1] must be 0.
             */
			if (i == insn_cnt - 1 || insn[1].code != 0 ||
			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
			    insn[1].off != 0) {
				verbose(env, "invalid bpf_ld_imm64 insn\n");
				return -EINVAL;
			}

            /* src_reg selects the kind of pseudo load; src_reg == 0 is a plain
             * 64-bit immediate and needs no rewriting, so it is accepted as is.
             */
			if (insn[0].src_reg == 0)
				/* valid generic load 64-bit imm */
				goto next_insn;

			if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
				aux = &env->insn_aux_data[i];
				err = check_pseudo_btf_id(env, insn, aux);
				if (err)
					return err;
				goto next_insn;
			}

			if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
				aux = &env->insn_aux_data[i];
				aux->ptr_type = PTR_TO_FUNC;
				goto next_insn;
			}

			/* In final convert_pseudo_ld_imm64() step, this is
			 * converted into regular 64-bit imm load insn.
			 */
			if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
			     insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
			    (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
			     insn[1].imm != 0)) {
				verbose(env,
					"unrecognized bpf_ld_imm64 insn\n");
				return -EINVAL;
			}
			/* with BPF_LD_MAP_FD, imm is the fd number; fdget() looks up the kernel struct fd for it */
			f = fdget(insn[0].imm);
            /* annotator's note: the map object hangs off the file, i.e. f.file->private_data */
			map = __bpf_map_get(f);
			if (IS_ERR(map)) {
				verbose(env, "fd %d is not pointing to valid bpf_map\n",
					insn[0].imm);
				return PTR_ERR(map);
			}
			
            /**
             * Annotator's summary of check_map_prog_compatibility():
             * - validates that trace type programs use preallocated hash maps
             *   (trace types: BPF_PROG_TYPE_KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT)
             * - restricts bpf_spin_lock: socket filter, tracing and sleepable
             *   programs may not use it, because a tracepoint firing inside a
             *   locked region, or sleeping while holding the lock, would keep
             *   other threads from acquiring it
             * - sleepable programs may only use array, hash and ringbuf maps
             *   (presumably a design decision)
             */
			err = check_map_prog_compatibility(env, map, env->prog);
			if (err) {
				fdput(f);
				return err;
			}

			aux = &env->insn_aux_data[i];
            /**
             * When src_reg is BPF_PSEUDO_MAP_FD the immediate is rewritten from
             * the fd number to the address of the map, which is how
             * BPF_LD_MAP_FD yields the map struct of the given fd.
             */
			if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
				addr = (unsigned long)map;
			} else {
				u32 off = insn[1].imm;

				if (off >= BPF_MAX_VAR_OFF) {
					verbose(env, "direct value offset of %u is not allowed\n", off);
					fdput(f);
					return -EINVAL;
				}

				/* map checks elided in this excerpt ...
				 * NOTE(review): addr is never assigned on this path in the
				 * excerpt, so the elided code presumably sets it before the
				 * "addr += off" below -- confirm against upstream.
				 */

				aux->map_off = off;
				addr += off;
			}

			/* split the 64-bit address across the two insn slots: low half, then high half */
			insn[0].imm = (u32)addr;
			insn[1].imm = addr >> 32;

			/* check whether we recorded this map already */
			for (j = 0; j < env->used_map_cnt; j++) {
				if (env->used_maps[j] == map) { /* map is already recorded in used_maps[] */
					aux->map_index = j; /* remember its index in aux */
					fdput(f); /* release the fd obtained by fdget() above */
					goto next_insn;
				}
			}

			if (env->used_map_cnt >= MAX_USED_MAPS) {
				fdput(f);
				return -E2BIG;
			}

            /* take a reference on the map (annotator's note: atomically increments map->refcnt) */
			bpf_map_inc(map);

			aux->map_index = env->used_map_cnt;
			env->used_maps[env->used_map_cnt++] = map;

			if (bpf_map_is_cgroup_storage(map) &&
			    bpf_cgroup_storage_assign(env->prog->aux, map)) {
				verbose(env, "only one cgroup storage of each type is allowed\n");
				fdput(f);
				return -EBUSY;
			}

			fdput(f);
next_insn:
			/* ld_imm64 occupies two slots: skip the second one here, the loop header skips the first */
			insn++;
			i++;
			continue;
		}

		/* Basic sanity check before we invest more work here. */
		if (!bpf_opcode_in_insntable(insn->code)) {
			verbose(env, "unknown opcode %02x\n", insn->code);
			return -EINVAL;
		}
	}

	/* now all pseudo BPF_LD_IMM64 instructions load valid
	 * 'struct bpf_map *' into a register instead of user map_fd.
	 * These pointers will be used later by verifier to validate map access.
	 */
    /* i.e. every map_fd has been converted into a struct bpf_map * */
	return 0;
}
  • 關於 BPF_XXX 的 macro 於 include/uapi/linux/bpf_common.h 被定義 (uapi 為 user api ?),主要幾個重點 macro 如下:

    /**
     * User space passes bpf_insn.code, normally built by OR-ing class / size / mode together, e.g.:
     * .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM
     */
    /* 0000 0111 */
    #define BPF_CLASS(code) ((code) & 0x07)
    #define		BPF_LD		0x00
    #define		BPF_LDX		0x01
    ...
    /* ld/ldx fields */
    /* 0001 1000 */
    #define BPF_SIZE(code)  ((code) & 0x18)
    #define		BPF_W		0x00 /* 32-bit */
    ...
    /* 1110 0000 */
    #define BPF_MODE(code)  ((code) & 0xe0)
    #define		BPF_IMM		0x00
    #define		BPF_ABS		0x20
    ...
    /* alu/jmp fields */
    /* 1111 0000 */
    #define BPF_OP(code)    ((code) & 0xf0)
    #define		BPF_ADD		0x00
    #define		BPF_SUB		0x10
    ...
        
    /* bpf_common.h also defines the maximum instruction count, 4096
     * (the matching #endif is not part of this excerpt)
     */
    #ifndef BPF_MAXINSNS
    #define BPF_MAXINSNS 4096
    
    • LD 與 LDX 的差別在於是否可以使用 offset,因此推測 X 的意思為 extended (僅為推測;在 classic BPF 中 X 指的是 index register,BPF_LDX 即「載入到 X」)

下個 check_cfg() 程式碼意外的少,主要是用 DFS 看 BPF 內是否有 loop (back edge):

/* Non-recursive depth-first walk over the program's instructions, starting
 * at insn 0 and driven by visit_insn().
 *
 * Returns 0 when every insn ended up EXPLORED, -ENOMEM when a work array
 * cannot be allocated, -EINVAL when some insn was never fully explored,
 * -EFAULT on an internal inconsistency, or the negative error from visit_insn().
 * Both work arrays are freed and cleared in env->cfg before returning.
 */
static int check_cfg(struct bpf_verifier_env *env)
{
	int insn_cnt = env->prog->len;
	int *insn_stack, *insn_state;
	int ret = 0;
	int i;

    /* per-insn DFS state; zero-filled, i.e. "not discovered yet" */
	insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
	if (!insn_state)
		return -ENOMEM;

    /* DFS stack holding the indices of the insns currently being walked */
	insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
	if (!insn_stack) {
		kvfree(insn_state);
		return -ENOMEM;
	}

    /* seed the walk with the first insn */
	insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
	insn_stack[0] = 0; /* 0 is the first instruction */
	env->cfg.cur_stack = 1; /* stack top */

	while (env->cfg.cur_stack > 0) {
		/* peek (do not pop) the insn on top of the stack */
		int t = insn_stack[env->cfg.cur_stack - 1];

        /* visit it */
		ret = visit_insn(t, insn_cnt, env);
		switch (ret) {
		case DONE_EXPLORING: /* insn t is finished: mark it EXPLORED and pop it */
			insn_state[t] = EXPLORED;
			env->cfg.cur_stack--;
			break;
		case KEEP_EXPLORING: /* a successor was pushed; t stays on the stack */
			break;
		default:
			if (ret > 0) { /* positive value that is not KEEP_EXPLORING: unexpected */
				verbose(env, "visit_insn internal bug\n");
				ret = -EFAULT;
			}
			goto err_free;
		}
	}

    /* the stack top must never drop below 0 */
	if (env->cfg.cur_stack < 0) {
		verbose(env, "pop stack internal bug\n");
		ret = -EFAULT;
		goto err_free;
	}

    /* any insn not EXPLORED at this point was never reached by the walk */
	for (i = 0; i < insn_cnt; i++) {
		if (insn_state[i] != EXPLORED) {
			verbose(env, "unreachable insn %d\n", i);
			ret = -EINVAL;
			goto err_free;
		}
	}
	ret = 0; /* cfg looks good */

err_free:
	kvfree(insn_state);
	kvfree(insn_stack);
	env->cfg.insn_state = env->cfg.insn_stack = NULL;
	return ret;
}
  • 看過 kmalloc() 與 vmalloc(),但是沒看過 kvcalloc()。它等同於 kvmalloc_array(n, size, flags | __GFP_ZERO),底層呼叫 kvmalloc_node(),function 的說明為:

    attempt to allocate physically contiguous memory, but upon failure, fall back to non-contiguous (vmalloc) allocation
    

    就是一種 kmalloc() 以及 vmalloc() 混用的 function:先嘗試配置 physically contiguous 的記憶體,失敗時再退回 vmalloc()

  • 關鍵的部分在 visit_insn() 的回傳值:

    /* Visits the instruction at index t and returns one of the following:
     *  < 0 - an error occurred
     *  DONE_EXPLORING - the instruction was fully explored
     *  KEEP_EXPLORING - there is still work to be done before it is fully explored
     */
    /**
     * Annotator's note on the helper used below:
     * init_explored_state(env, idx) sets
     * env->insn_aux_data[idx].prune_point = true
     */
    /* visit insn[t] */
    static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
    {
    	struct bpf_insn *insns = env->prog->insnsi;
    	int ret;
    
        /* pseudo function reference (bpf_pseudo_func): always visit the callee as well */
    	if (bpf_pseudo_func(insns + t))
    		return visit_func_call_insn(t, insn_cnt, insns, env, true);
    
    	/* All non-branch instructions have a single fall-through edge. */
    	/* i.e. anything that is NOT class JMP/JMP32 only continues to t + 1 */
    	if (BPF_CLASS(insns[t].code) != BPF_JMP &&
    	    BPF_CLASS(insns[t].code) != BPF_JMP32)
    		return push_insn(t, t + 1, FALLTHROUGH, env, false);
    
    	switch (BPF_OP(insns[t].code)) {
    	case BPF_EXIT: /* EXIT has no outgoing edge */
    		return DONE_EXPLORING;
    
    	case BPF_CALL: /* any call; the callee is only walked when src_reg == BPF_PSEUDO_CALL */
    		return visit_func_call_insn(t, insn_cnt, insns, env,
    					    insns[t].src_reg == BPF_PSEUDO_CALL);
    
    	case BPF_JA: /* jmp always */
    		if (BPF_SRC(insns[t].code) != BPF_K)
    			return -EINVAL;
    
    		/* unconditional jump with single edge */
            /* the only successor is the jump target t + off + 1 */
    		ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
    				true);
    		if (ret) /* non-zero: KEEP_EXPLORING or an error (DONE_EXPLORING is 0) */
    			return ret;
                
    		init_explored_state(env, t + insns[t].off + 1);
    		if (t + 1 < insn_cnt)
    			init_explored_state(env, t + 1);
    
    		return ret;
    
    	default:
    		/* conditional jump with two edges */
    		init_explored_state(env, t);
    		ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
    		if (ret)
    			return ret;
    
    		return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
    	}
    }
    • 其中 visit_func_call_insn():

      /* Walks the edges of a call insn at index t.
       * visit_callee: when true, the call target (t + imm + 1) is pushed as a
       * BRANCH edge in addition to the fall-through edge. visit_insn() passes
       * true for bpf_pseudo_func() insns and for BPF_CALL insns whose src_reg
       * is BPF_PSEUDO_CALL; for other calls only t + 1 is followed.
       */
      static int visit_func_call_insn(int t, int insn_cnt,
      				struct bpf_insn *insns,
      				struct bpf_verifier_env *env,
      				bool visit_callee)
      {
      	int ret;
      	/* the insn after the call is always a successor (fall-through edge) */
      	ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
      	if (ret)
      		return ret;
      
      	if (t + 1 < insn_cnt)
      		init_explored_state(env, t + 1);
      	if (visit_callee) {
      		init_explored_state(env, t);
      		ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
      				env, false);
      	}
      	return ret;
      }
    • push_insn() (這部份把 verbose 的程式碼拿掉了):

      /* t, w, e - match pseudo-code above:
       * t - index of current instruction
       * w - next instruction
       * e - edge
       */
      /**
       * loop_ok decides whether an edge into an insn that is still DISCOVERED
       * (a back-edge, i.e. a loop) may be accepted. The visit_insn() /
       * visit_func_call_insn() callers pass false for non-branch and call
       * insns and true for BPF_JA and conditional jumps; even then the
       * back-edge is only accepted when env->bpf_capable is set.
       * discovered ---> discovered : back-edge (loop), -EINVAL unless loop_ok && bpf_capable
       * discovered ---> explored   : forward- or cross-edge, always fine
       */
      static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
      		     bool loop_ok)
      {
      	int *insn_stack = env->cfg.insn_stack;
      	int *insn_state = env->cfg.insn_state;
      
          /* this edge of t has already been handled */
      	if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
      		return DONE_EXPLORING;
      	if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
      		return DONE_EXPLORING;
      
          /* target index lies outside the program */
      	if (w < 0 || w >= env->prog->len) {
      		return -EINVAL;
      	}
      
      	if (e == BRANCH)
      		/* mark branch target for state pruning */
      		init_explored_state(env, w);
      
      	if (insn_state[w] == 0) { /* w has not been discovered yet */
      		/* tree-edge */
      		insn_state[t] = DISCOVERED | e;
      		insn_state[w] = DISCOVERED;
      		if (env->cfg.cur_stack >= env->prog->len)
      			return -E2BIG;
      		insn_stack[env->cfg.cur_stack++] = w; /* push to stack */
      		return KEEP_EXPLORING;
      	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
              /* back-edge: w is still being walked */
              /* returning DONE_EXPLORING only reports this edge as handled; the
               * EXPLORED state itself is assigned in check_cfg()
               */
      		if (loop_ok && env->bpf_capable)
      			return DONE_EXPLORING;
      		return -EINVAL;
      	} else if (insn_state[w] == EXPLORED) {
      		/* forward- or cross-edge */
              /* w was already fully explored through another path */
      		insn_state[t] = DISCOVERED | e;
      	} else {
      		return -EFAULT;
      	}
      	return DONE_EXPLORING;
      }
    • insn state:

      /* DFS bookkeeping values stored in insn_state[]: a state in the high
       * nibble, OR-ed with the edge kind(s) already handled in the low bits.
       */
      enum {
          /* insn state */
      	DISCOVERED = 0x10, /* found and currently being walked */
      	EXPLORED = 0x20, /* everything reachable from this insn has been walked */
          /* edge kind leaving the insn; a branch insn has both */
      	FALLTHROUGH = 1,
      	BRANCH = 2,
      };
      
      /* return values of visit_insn() / push_insn() (negative values are errors) */
      enum {
      	DONE_EXPLORING = 0,
      	KEEP_EXPLORING = 1,
      };
    • pseudo code:

      procedure DFS-iterative(G,v):
          label v as discovered
          let S be a stack
          S.push(v)
          while S is not empty
                t <- S.pop()
                if t is what we are looking for:
                    return t
                for all edges e in G.adjacentEdges(t) do
                    if edge e is already labelled
                        continue with the next edge
                    w <- G.adjacentVertex(t,e)
                    if vertex w is not discovered and not explored
                        label e as tree-edge
                        label w as discovered
                        S.push(w)
                        continue at 5
                    else if vertex w is discovered
                        label e as back-edge
                    else
                        // vertex w is explored
                        label e as forward- or cross-edge
                label t as explored
                S.pop()

到此還是不太了解 subprog 是以怎樣的方式存在,也許能透過 bpf check function do_check_subprogs() 來更加了解:

/* Runs do_check_common() on every subprog, other than subprog 0, whose BTF
 * linkage is BTF_FUNC_GLOBAL.
 * Returns 0 when the program has no func_info or when all such subprogs
 * pass; otherwise the first non-zero result of do_check_common().
 */
static int do_check_subprogs(struct bpf_verifier_env *env)
{
	struct bpf_prog_aux *aux = env->prog->aux;
	int i, ret;

	/* without func_info there is no linkage information to act on */
	if (!aux->func_info)
		return 0;

	/* index 0 is skipped here; the loop starts at subprog 1 */
	for (i = 1; i < env->subprog_cnt; i++) {
		if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
			continue;
		/* start verification at the first insn of this subprog */
		env->insn_idx = env->subprog_info[i].start;
		WARN_ON_ONCE(env->insn_idx == 0);
		ret = do_check_common(env, i);
		if (ret) {
			return ret;
		} else if (env->log.level & BPF_LOG_LEVEL) {
			verbose(env,
				"Func#%d is safe for any args that match its prototype\n",
				i);
		}
	}
	return 0;
}
  • linux 在 function 上方有提供一段 useful 的註釋:

    /* Verify all global functions in a BPF program one by one based on their BTF.
     * All global functions must pass verification. Otherwise the whole program is rejected.
     * Consider:
     * int bar(int);
     * int foo(int f)
     * {
     *    return bar(f);
     * }
     * int bar(int b)
     * {
     *    ...
     * }
     * foo() will be verified first for R1=any_scalar_value. During verification it
     * will be assumed that bar() already verified successfully and call to bar()
     * from foo() will be checked for type match only. Later bar() will be verified
     * independently to check that it's safe for R1=any_scalar_value.
     */
    

    代表在檢查 foo() 時,會假設它所呼叫的 global function (如 bar()) 已經通過 verification,呼叫處只檢查參數型別是否相符;之後 bar() 會再被獨立驗證一次

  • do_check_subprogs() 會呼叫到 do_check_common(),而在 bpf_check() 當中的 do_check_main() 也會呼叫到 do_check_common():

    /**
     * Sets up a fresh verifier state for one (sub)program, runs do_check()
     * on it and tears the state down again.
     * The subprog index is 0 for the main prog and 1, 2, 3... for the others;
     * env->subprog_cnt counts all of them, the main prog included.
     * Returns -ENOMEM on allocation failure, otherwise the result of the
     * argument preparation or of do_check().
     */
    static int do_check_common(struct bpf_verifier_env *env, int subprog)
    {
    	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
    	struct bpf_verifier_state *state;
    	struct bpf_reg_state *regs;
    	int ret, i;
    
    	env->prev_linfo = NULL;
    	env->pass_cnt++; /* one more do_check_common() pass; the do_check() notes in this document show sanitize_mark_insn_seen() storing it in insn_aux_data[].seen */
    
    	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
    	if (!state)
    		return -ENOMEM;
    	state->curframe = 0;
    	state->speculative = false;
    	state->branches = 1;
        /* each function frame carries MAX_BPF_REG registers (struct bpf_reg_state regs[MAX_BPF_REG]) */
    	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
    	if (!state->frame[0]) {
    		kfree(state);
    		return -ENOMEM;
    	}
    	env->cur_state = state;
        /**
         * Annotator's summary of init_func_state(env, state, callsite, frameno, subprogno):
         * state->callsite = callsite;
         * state->frameno = frameno;
         * state->subprogno = subprogno;
         * init_reg_state(env, state);
         */
    	init_func_state(env, state->frame[0],
    			/* #define BPF_MAIN_FUNC (-1) */
    			BPF_MAIN_FUNC /* callsite */,
    			0 /* frameno */,
    			subprog);
    	/* every frame has its own register set; curframe is 0 here, so this is
    	 * frame[0]'s (presumably one frame exists per active function call -- confirm)
    	 */
    	regs = state->frame[state->curframe]->regs;
    	if (subprog /* non-zero: not the main prog */ || env->prog->type == BPF_PROG_TYPE_EXT) {
    		ret = btf_prepare_func_args(env, subprog, regs);
    		if (ret)
    			goto out;
    		/* finish initialising the argument registers R1..R5 by type */
    		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
    			if (regs[i].type == PTR_TO_CTX)
    				mark_reg_known_zero(env, regs, i);
    			else if (regs[i].type == SCALAR_VALUE)
    				mark_reg_unknown(env, regs, i);
    			else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
    				/* mem_size is saved and restored around mark_reg_known_zero() */
    				const u32 mem_size = regs[i].mem_size;
    
    				mark_reg_known_zero(env, regs, i);
    				regs[i].mem_size = mem_size;
    				regs[i].id = ++env->id_gen;
    			}
    		}
    	} else { /* subprog == 0: the main prog */
    		/* 1st arg to a function */
            /* ctx = context, PTR_TO_CTX: reg points to bpf_context */
    		regs[BPF_REG_1].type = PTR_TO_CTX; /* func(r1, r2, ...) */
    		mark_reg_known_zero(env, regs, BPF_REG_1);
    		ret = btf_check_subprog_arg_match(env, subprog, regs);
    		/* only -EFAULT aborts here; any other result is overwritten by do_check() below */
    		if (ret == -EFAULT)
    			goto out;
    	}
    
    	ret = do_check(env);
    out:
    	/* check for NULL is necessary, since cur_state can be freed inside
    	 * do_check() under memory pressure.
    	 */
    	if (env->cur_state) {
    		free_verifier_state(env->cur_state, true);
    		env->cur_state = NULL;
    	}
    	while (!pop_stack(env, NULL, NULL, false)); /* keep popping until pop_stack() returns non-zero (presumably drains the pending states -- confirm) */
    	/* on success, reset the log unless BPF_LOG_LEVEL2 was requested */
    	if (!ret && pop_log)
    		bpf_vlog_reset(&env->log, 0);
    	free_states(env);
    	return ret;
    }
    • 初始化 register state init_reg_state():

      /* Resets every register of one function frame to "not initialised" and
       * then sets up the frame pointer register.
       */
      static void init_reg_state(struct bpf_verifier_env *env,
      			   struct bpf_func_state *state)
      {
          /**
           * Note that state here is a struct bpf_func_state (do_check_common()
           * passes state->frame[0]), whereas the variable called state in
           * do_check_common() is a struct bpf_verifier_state.
           */
      	struct bpf_reg_state *regs = state->regs;
      	int i;
      
      	for (i = 0; i < MAX_BPF_REG /* all registers, BPF_REG_FP included */; i++) {
      		mark_reg_not_init(env, regs, i);
      		regs[i].live = REG_LIVE_NONE;
      		regs[i].parent = NULL;
      		regs[i].subreg_def = DEF_NOT_SUBREG;
      	}
      
      	/* frame pointer */
      	regs[BPF_REG_FP].type = PTR_TO_STACK;
          /* mark BPF_REG_FP (frame pointer, reg_10) as known with all-zero contents */
      	mark_reg_known_zero(env, regs, BPF_REG_FP);
      	regs[BPF_REG_FP].frameno = state->frameno;
      }
      • mark_reg_not_init() 滿多層的,不過大概作了以下事情來初始化 register 的狀態:

        /* Flattened summary of what mark_reg_not_init() ends up doing (not a verbatim quote). */
        memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); /* zero every field located before var_off */
        reg->type = SCALAR_VALUE;
        reg->var_off = tnum_unknown;
        reg->frameno = 0;
        reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
        
        // __mark_reg_unbounded(reg);
        reg->smin_value = S64_MIN;
        reg->smax_value = S64_MAX;
        reg->umin_value = 0;
        reg->umax_value = U64_MAX;
        
        reg->s32_min_value = S32_MIN;
        reg->s32_max_value = S32_MAX;
        reg->u32_min_value = 0;
        reg->u32_max_value = U32_MAX;
        
        /* the type set to SCALAR_VALUE above is finally overwritten */
        reg->type = NOT_INIT;

        看起來 code 的註釋想表達初始化前還有一個狀態為 unknown,這邊的行為就是初始化 unknown (?)

    • btf_check_subprog_arg_match() 檢查 regs 是否符合 BTF (BPF Type Format) 所描述的參數型別 (expectation):

      /**
       * Compares the current register state against the BTF prototype of
       * the given subprog.
       * When do_check_common() calls this for the main prog, only a return
       * value of -EFAULT is treated as an error; every other return value is
       * ignored there.
       */
      int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
      				struct bpf_reg_state *regs)
      {
      	struct bpf_prog *prog = env->prog;
      	struct btf *btf = prog->aux->btf;
      	bool is_global;
      	u32 btf_id;
      	int err;
      
          /* the program carries no func_info, so there is nothing to match against */
      	if (!prog->aux->func_info)
      		return -EINVAL;
      
      	btf_id = prog->aux->func_info[subprog].type_id;
      	if (!btf_id)
      		return -EFAULT;
      	
          /**
           * "unreliable" is set at the bottom of this function after a failed
           * match: the compiler may have optimised arguments out of a static
           * function, or a mismatched type was passed to a global function.
           * Once marked, the BTF prototype is not consulted again.
           */
      	if (prog->aux->func_info_aux[subprog].unreliable)
      		return -EINVAL;
      
      	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
      	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
      
      	/* Compiler optimizations can remove arguments from static functions
      	 * or mismatched type can be passed into a global function.
      	 * In such cases mark the function as unreliable from BTF point of view.
      	 */
      	if (err)
      		prog->aux->func_info_aux[subprog].unreliable = true;
      	return err;
      }
      • btf_check_func_arg_match() 尚未 trace (太大坨 + 沒被執行到)
    • 重頭戲 do_check(),第一部分說明了開頭的基本檢查以及 alu operation (刪除了 verbose,細節請參閱 src):

      static int do_check(struct bpf_verifier_env *env)
      {
      	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
      	struct bpf_verifier_state *state = env->cur_state;
      	struct bpf_insn *insns = env->prog->insnsi;
      	struct bpf_reg_state *regs;
      	int insn_cnt = env->prog->len;
      	bool do_print_state = false;
      	int prev_insn_idx = -1;
      
      	for (;;) {
      		struct bpf_insn *insn;
      		u8 class;
      		int err;
      
      		env->prev_insn_idx = prev_insn_idx;
              /* 超過範圍 */
      		if (env->insn_idx >= insn_cnt) {
      			return -EFAULT;
      		}
      
      		insn = &insns[env->insn_idx];
      		class = BPF_CLASS(insn->code);
      
              /* insn 太複雜,已經被處理過太多次了 */
      		if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS /* 1000000 */) {
      			return -E2BIG;
      		}
      
      		err = is_state_visited(env, env->insn_idx);
      		if (err < 0)
      			return err;
      		if (err == 1) {
      			/* found equivalent state, can prune the search */
      			goto process_bpf_exit;
      		}
      
      		if (signal_pending(current)) /* 看是否有 signal 正在傳送 */
      			return -EAGAIN;
      
      		if (need_resched()) /* 也許需要 reschedule */
      			cond_resched();
      
      		if (env->log.level & BPF_LOG_LEVEL2 ||
      		    (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
      			do_print_state = false;
      		}
      
      		if (bpf_prog_is_dev_bound(env->prog->aux) /* aux->offload_requested */) {
      			err = bpf_prog_offload_verify_insn(env, env->insn_idx,
      							   env->prev_insn_idx);
      			if (err)
      				return err;
      		}
      
      		regs = cur_regs(env); /* cur->frame[cur->curframe] */
              /**
               * 如果 insn 沒有要用猜的 (speculative),則執行:
               * env->insn_aux_data[env->insn_idx].seen = env->pass_cnt
               * 為了讓後續 verify unreachable path 時,sanitize_dead_code() 還能夠 rewrite/sanitize
               */
      		sanitize_mark_insn_seen(env);
      		prev_insn_idx = env->insn_idx;
      
      		if (class == BPF_ALU || class == BPF_ALU64) {
      			err = check_alu_op(env, insn);
      			if (err)
      				return err;
      
      		}
              ...
      • class check list:

        • ALU | ALU64 - check_alu_op()
          • NEG
          • END
          • MOV
          • AND, SUB, ...
        • LDX - check_reg_arg() for src and dst reg, check_mem_access(), reg_type_mismatch()
        • STX - check_reg_arg() for src and dst reg, check_mem_access(), reg_type_mismatch()
        • ST - check_reg_arg() for src reg, is_ctx_reg() for dst reg, check_mem_access()
        • JMP | JMP32
          • CALL
          • JA
          • EXIT
          • JNE, JGE ...
        • LD - check_ld_abs(), check_ld_imm()
      • is_state_visited() 用來檢查 state 是否走訪過(comment 細節在 src):

        /* Excerpt: only the early-out of is_state_visited() is shown; the
         * remainder of the function has not been traced in these notes.
         */
        static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
        {
        	struct bpf_verifier_state_list *new_sl;
        	struct bpf_verifier_state_list *sl, **pprev;
        	struct bpf_verifier_state *cur = env->cur_state, *new;
        	int i, j, err, states_cnt = 0;
        	bool add_new_state = env->test_state_freq ? true : false;
        
        	cur->last_insn_idx = env->prev_insn_idx;
            /**
             * init_explored_state(env, w) is what set prune_point earlier
        	 * (env->insn_aux_data[idx].prune_point = true).
        	 *
        	 * An insn that is NOT a prune point returns 0 right here; only
        	 * insns with prune_point set continue into the rest of the function.
        	 */
        	if (!env->insn_aux_data[insn_idx].prune_point)
        		return 0;
        	/* the rest has not been read yet */
        }
      • 當 class 為 ALU 時,會執行 check_alu_op() 檢查 ALU 內部的 operation 是否合法:

        /* check validity of 32-bit and 64-bit arithmetic operations */
        /**
         * ALU operations include:
         * BPF_END, BPF_NEG, BPF_MOV, BPF_SUB, ... BPF_XOR
         * Returns 0 when the insn is acceptable, otherwise a negative errno.
         */
        static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
        {
        	struct bpf_reg_state *regs = cur_regs(env);
        	u8 opcode = BPF_OP(insn->code);
        	int err;
        
        	if (opcode == BPF_END || opcode == BPF_NEG) {
        		if (opcode == BPF_NEG) {
                    /**
                     * #define BPF_SRC(code)   ((code) & 0x08)
        			 * #define BPF_K 0x00
        			 * #define BPF_X 0x08
        			 * BPF_K: the operand is the immediate; BPF_X: the operand is src_reg.
        			 * NEG must be encoded as BPF_K (BPF_SRC == 0), with
        			 * off == 0, imm == 0 and src_reg == BPF_REG_0.
                     */
        			if (BPF_SRC(insn->code) != 0 ||
        			    insn->src_reg != BPF_REG_0 ||
        			    insn->off != 0 || insn->imm != 0) {
        				verbose(env, "BPF_NEG uses reserved fields\n");
        				return -EINVAL;
        			}
        		} else {
                    /**
                     * BPF_END: src_reg == BPF_REG_0, off == 0, imm must be
                     * 16, 32 or 64, and the class must NOT be BPF_ALU64
                     * (this function is entered for both ALU and ALU64, but
                     * an ALU64 BPF_END is rejected here).
                     */
        			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
        			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
        			    BPF_CLASS(insn->code) == BPF_ALU64) {
        				verbose(env, "BPF_END uses reserved fields\n");
        				return -EINVAL;
        			}
        		}
        
        		/* check src operand */
                /* NEG/END operate in place, so dst_reg is first validated as a source (SRC_OP) */
        		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
        		if (err)
        			return err;
        
                /* the operation is refused on a pointer value */
        		if (is_pointer_value(env, insn->dst_reg)) {
        			return -EACCES;
        		}
        
        		/* check dest operand */
                /* ...and then dst_reg is validated as a destination (DST_OP) */
        		err = check_reg_arg(env, insn->dst_reg, DST_OP);
        		if (err)
        			return err;
        
        	} else if (opcode == BPF_MOV) {
        		/* a constant (BPF_K) or the index register (BPF_X) */
        		if (BPF_SRC(insn->code) == BPF_X) {
                    /* register source: imm and off are reserved and must be 0 */
        			if (insn->imm != 0 || insn->off != 0) {
        				return -EINVAL;
        			}
        
        			/* check src operand */
        			err = check_reg_arg(env, insn->src_reg, SRC_OP);
        			if (err)
        				return err;
        		} else {
                    /* immediate source: src_reg must be BPF_REG_0 and off must be 0 */
        			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
        				return -EINVAL;
        			}
        		}
        
        		/* check dest operand, mark as required later */
        		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
        		if (err)
        			return err;
        
                /* source is a register (BPF_X) */
        		if (BPF_SRC(insn->code) == BPF_X) {
        			struct bpf_reg_state *src_reg = regs + insn->src_reg;
        			struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
        
        			if (BPF_CLASS(insn->code) == BPF_ALU64) {
        				/* case: R1 = R2
        				 * copy register state to dest reg
        				 */
        				if (src_reg->type == SCALAR_VALUE && !src_reg->id)
        					/* Assign src and dst registers the same ID
        					 * that will be used by find_equal_scalars()
        					 * to propagate min/max range.
        					 */
        					src_reg->id = ++env->id_gen; /* dst inherits this id via the copy below */
        				*dst_reg = *src_reg; /* assign (r1 = r2) */
        				dst_reg->live |= REG_LIVE_WRITTEN; /* dst has been written */
        				dst_reg->subreg_def = DEF_NOT_SUBREG; /* full 64-bit write, so no sub-register definition */
        			} else {
        				/* R1 = (u32) R2 */
                        /* a pointer may not be copied through a 32-bit move */
        				if (is_pointer_value(env, insn->src_reg)) {
        					return -EACCES;
        				} else if (src_reg->type == SCALAR_VALUE) {
        					*dst_reg = *src_reg;
        					/* Make sure ID is cleared otherwise
        					 * dst_reg min/max could be incorrectly
        					 * propagated into src_reg by find_equal_scalars()
        					 */
        					dst_reg->id = 0;
        					dst_reg->live |= REG_LIVE_WRITTEN;
                            /* for a 32-bit write, subreg_def records env->insn_idx + 1 */
        					dst_reg->subreg_def = env->insn_idx + 1;
        				} else {
                            /* neither a pointer nor a scalar value */
        					mark_reg_unknown(env, regs,
        							 insn->dst_reg);
        				}
                        /**
                         * BPF architecture zero extends alu32 ops into 64-bit registers
                         */
        				zext_32_to_64(dst_reg);
        			}
        		} else {
        	        /* source is an immediate (BPF_K) */
        			/* case: R = imm
        			 * remember the value we stored into this reg
        			 */
        			/* clear any state __mark_reg_known doesn't set */
        			mark_reg_unknown(env, regs, insn->dst_reg);
                    /* an immediate always yields a scalar value */
        			regs[insn->dst_reg].type = SCALAR_VALUE;
                    /* Mark the unknown part of a register (variable offset or scalar value) as
                     * known to have the value @imm.
                     */
        			if (BPF_CLASS(insn->code) == BPF_ALU64) {
        				__mark_reg_known(regs + insn->dst_reg,
        						 insn->imm);
        			} else {
        				__mark_reg_known(regs + insn->dst_reg,
        						 (u32)insn->imm);
        			}
        		}
        
        	} else if (opcode > BPF_END) { /* opcode value above BPF_END: rejected as invalid */
        		return -EINVAL;
        	} else {	/* all other ALU ops: and, sub, xor, add, ... */
        		if (BPF_SRC(insn->code) == BPF_X) { /* register source */
                    /* imm and off are reserved and must be 0 */
        			if (insn->imm != 0 || insn->off != 0) {
        				return -EINVAL;
        			}
        			/* check src1 operand */
        			err = check_reg_arg(env, insn->src_reg, SRC_OP);
        			if (err)
        				return err;
        		} else {
                    /* immediate source: src_reg must be BPF_REG_0 and off must be 0 */
                    /* (src_reg is not used with an immediate operand, hence it has to be 0) */
        			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
        				return -EINVAL;
        			}
        		}
                /* in user space, BPF macros ending in _IMM use BPF_K and those ending in _REG use BPF_X */
        
        		/* check src2 operand */
        		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
        		if (err)
        			return err;
        
        		/* division / modulo by a constant zero is rejected up front */
        		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
        		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
        			return -EINVAL;
        		}
        
        		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
        		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
        			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
        
                    /* constant shift amount must lie in [0, size) */
        			if (insn->imm < 0 || insn->imm >= size) {
        				return -EINVAL;
        			}
        		}
        
        		/* check dest operand */
        		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
        		if (err)
        			return err;
        
                /* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
                 * and var_off.
                 */
        		return adjust_reg_min_max_vals(env, insn); /* per the annotator, analysed in a separate section of these notes */
        	}
        
        	return 0;
        }
        • 大多數都會檢查某個 operation 是否有用到不該用的欄位 (reserved field)

        check_reg_arg() 經常被呼叫到,會根據 reg 的 type 去做一些基本的檢查:

        /*
         * check_reg_arg() - basic validity check for one register operand of the
         * instruction currently being verified (env->insn_idx).
         *
         * @env:   verifier environment; provides the current frame's registers
         * @regno: register number used by the instruction
         * @t:     SRC_OP (register is read), DST_OP (register is written and its
         *         state is reset to unknown) or DST_OP_NO_MARK (register is
         *         written, but its state is left for the caller to update)
         *
         * Returns -EINVAL when @regno is out of range, -EACCES when an
         * uninitialised register is read or the frame pointer is written, the
         * result of mark_reg_read() for other source operands, and 0 otherwise.
         */
        static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
        			 enum reg_arg_type t)
        {
        	struct bpf_verifier_state *vstate = env->cur_state;
        	struct bpf_func_state *state = vstate->frame[vstate->curframe];
        	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
        	struct bpf_reg_state *reg, *regs = state->regs;
        	bool rw64;
        
            /* register number is out of range */
        	if (regno >= MAX_BPF_REG) {
        		return -EINVAL;
        	}
        
        	reg = &regs[regno];
            /* is_reg64(): returns TRUE if the source or destination register operates on 64-bit */
        	rw64 = is_reg64(env, insn, regno, reg, t);
        	if (t == SRC_OP) {
        		/* check whether register used as source operand can be read */
                
                /* reading a register that was never initialised is rejected */
        		if (reg->type == NOT_INIT) {
        			verbose(env, "R%d !read_ok\n", regno);
        			return -EACCES;
        		}
        		/* We don't need to worry about FP liveness because it's read-only */
                /* fp (R10) is read-only, so no liveness bookkeeping is done for it */
        		if (regno == BPF_REG_FP)
        			return 0;
        
        		if (rw64)
                    /* zext == zero extended */
                    /**
                     * The dst will be zero extended, so won't be sub-register anymore.
                     *
                     * Per the author's reading, mark_insn_zext() does:
                     *
                     * env->insn_aux_data[reg->subreg_def - 1].zext_dst = true;
                     * reg->subreg_def = DEF_NOT_SUBREG;
                     *
                     * i.e. a 64-bit read of a register last defined by a 32-bit
                     * write flags that defining instruction (index subreg_def - 1)
                     * as needing its destination zero-extended.
                     */
        			mark_insn_zext(env, reg);
        		
                /**
                 * Mark the register as read, distinguishing 64-bit from 32-bit reads.
                 * NOTE(review): per the author's reading, mark_reg_read() walks the
                 * chain of reg->parent states and sets the flag on them
                 * (reg->live |= flag); with parent == NULL nothing is marked.
                 * Confirm against mark_reg_read() itself.
                 */
        		return mark_reg_read(env, reg, reg->parent,
        				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
        	} else {
        		/* check whether register used as dest operand can be written to */
                
                /* fp is read-only */
        		if (regno == BPF_REG_FP) {
        			verbose(env, "frame pointer is read only\n");
        			return -EACCES;
        		}
                
                /* the written flag is not split into 32/64-bit variants */
        		reg->live |= REG_LIVE_WRITTEN;
                /* A 64-bit write clears the sub-register marker. A 32-bit write
                 * records env->insn_idx + 1, i.e. the index of THIS instruction
                 * plus one (mark_insn_zext() above undoes the +1 when indexing
                 * insn_aux_data). Presumably the +1 keeps the DEF_NOT_SUBREG
                 * value distinct from instruction 0 -- confirm.
                 */
        		reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
        		if (t == DST_OP) /* t is either DST_OP or DST_OP_NO_MARK here */
        			mark_reg_unknown(env, regs, regno);
        	}
        	return 0;
        }
        • 結論:

          1. DST_OP - check whether register used as dest operand can be written to
          2. SRC_OP - check whether register used as source operand can be read
        • reg_arg_type 一共有三種:

          /* How an instruction uses a register; passed as the third argument of
           * check_reg_arg().
           */
          enum reg_arg_type {
          	SRC_OP,		/* register is used as source operand */
          	DST_OP,		/* register is used as destination operand */
          	DST_OP_NO_MARK	/* same as above, check only, don't mark */
          };
    • 第二部分說明 BPF_LDX 的行為:

      	...
      	else if (class == BPF_LDX) {
      			enum bpf_reg_type *prev_src_type, src_reg_type;
      
      			/* check for reserved fields is already done */
      
      			/* check src operand */
      			err = check_reg_arg(env, insn->src_reg, SRC_OP);
      			if (err)
      				return err;
      
      			err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
      			if (err)
      				return err;
      
      			src_reg_type = regs[insn->src_reg].type;
      
      			/* check that memory (src_reg + off) is readable,
      			 * the state of dst_reg will be updated by this func
      			 */
      			err = check_mem_access(env, env->insn_idx, insn->src_reg,
      					       insn->off, BPF_SIZE(insn->code),
      					       BPF_READ, insn->dst_reg, false);
      			if (err)
      				return err;
      
      			prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
      
      			if (*prev_src_type == NOT_INIT) {
      				/* saw a valid insn
      				 * dst_reg = *(u32 *)(src_reg + off)
      				 * save type to validate intersecting paths
      				 */
      				*prev_src_type = src_reg_type;
      
      			} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
      				/* ABuser program is trying to use the same insn
      				 * dst_reg = *(u32*) (src_reg + off)
      				 * with different pointer types:
      				 * src_reg == ctx in one branch and
      				 * src_reg == stack|map in some other branch.
      				 * Reject it.
      				 */
      				verbose(env, "same insn cannot be used with different pointers\n");
      				return -EINVAL;
      			}
      
      		} 
      	...
    • 第三部分為 BPF_STX:

      		...
      		else if (class == BPF_STX) {
      			enum bpf_reg_type *prev_dst_type, dst_reg_type;
      
      			if (BPF_MODE(insn->code) == BPF_ATOMIC) { /* atomic operation */
      				err = check_atomic(env, env->insn_idx, insn);
      				if (err)
      					return err;
      				env->insn_idx++;
      				continue;
      			}
      
                  /* reserved field */
      			if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
      				return -EINVAL;
      			}
      
      			/* check src1 operand */
                  /* readable */
      			err = check_reg_arg(env, insn->src_reg, SRC_OP);
      			if (err)
      				return err;
      			
                  /* check src2 operand */
                  /* readable */
      			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
      			if (err)
      				return err;
      
      			dst_reg_type = regs[insn->dst_reg].type;
      
      			/* check that memory (dst_reg + off) is writeable */
                  /* BPF_READ = 1
                   * BPF_WRITE = 2 
                   */
      			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
      					       insn->off, BPF_SIZE(insn->code),
      					       BPF_WRITE, insn->src_reg, false);
      			if (err)
      				return err;
      
                  /* 當前的 ptr type */
      			prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
      
      			if (*prev_dst_type == NOT_INIT) {
                      /* regs[insn->dst_reg].type */
      				*prev_dst_type = dst_reg_type;
      			} else if (reg_type_mismatch(dst_reg_type /* src */, *prev_dst_type /* dst */)) {
                      /* return src != prev (reg type 不同,代表 different type)
                       * && (!reg_type_mismatch_ok(src) || !reg_type_mismatch_ok(prev))
      			     *
      			     * reg_type_mismatch_ok():
      			     * Return true if it's OK to have the same insn return a different type
      			     * PTR_TO_CTX, PTR_TO_SOCKET, ... 都會回傳 false
      			     */
                      
                      /* 相同的 insn 不能用不同的 pointer (?) */
      				verbose(env, "same insn cannot be used with different pointers\n");
      				return -EINVAL;
      			}
      		} 
      		...
    • 其中有一個 check_mem_access() 被頻繁使用到,目的是用來檢查 (regno + off) 是否可存取,並且根據不同的 bpf_access_type 有不同的行為 (src):

      • 檢查當 t = (read | write) 時, memory at (regno + off) 是否可以存取
      • t = write --> value_regno 為其值要被存入 memory 的 register
      • t = read --> value_regno 為要接收從 memory 讀出之值的 register
      • t = write && value_regno == -1 --> unknown value 要被存入 memory
      • t = read && value_regno == -1 --> 不在意從 memory 讀到什麼
      /*
       * check_mem_access() - check that memory at (regno + off) may be accessed.
       *
       * @env:         verifier environment
       * @insn_idx:    index of the instruction performing the access
       * @regno:       register holding the base pointer
       * @off:         constant offset from the instruction; reg->off is added here
       * @bpf_size:    BPF size code (BPF_B, BPF_W, BPF_DW, ...), converted to bytes
       * @t:           BPF_READ or BPF_WRITE
       * @value_regno: for BPF_READ, the register that receives the loaded value
       *               (its state is updated here); for BPF_WRITE, the register
       *               whose value is stored (checked for pointer leaks); -1 when
       *               there is no such register (e.g. BPF_ST, helper accesses)
       * @strict_alignment_once: forwarded to check_ptr_alignment()
       *
       * Dispatches on the type of the base pointer. Returns 0 if the access is
       * allowed, otherwise a negative errno (-EACCES for most rejections).
       */
      static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
      			    int off, int bpf_size, enum bpf_access_type t,
      			    int value_regno, bool strict_alignment_once)
      {
      	struct bpf_reg_state *regs = cur_regs(env);
      	struct bpf_reg_state *reg = regs + regno;
      	struct bpf_func_state *state;
      	int size, err = 0;
      
      	size = bpf_size_to_bytes(bpf_size); /* maps BPF_W, BPF_DW, ... to 4, 8, ... */
      	if (size < 0)
      		return size;
      
      	/* alignment checks will add in reg->off themselves */
          /* the effective offset must be aligned to the access size */
      	err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
      	if (err)
      		return err;
      
      	/* for access checks, reg->off is just part of off */
      	off += reg->off;
      
      	if (reg->type == PTR_TO_MAP_KEY) { /* map key */
      		if (t == BPF_WRITE) { /* a map key must not be modified */
      			return -EACCES;
      		}
      		
              /* check read/write into a memory region with possible variable offset */
      		err = check_mem_region_access(env, regno, off, size,
      					      reg->map_ptr->key_size, false);
      		if (err)
      			return err;
      		if (value_regno >= 0)
      			mark_reg_unknown(env, regs, value_regno);
      	} else if (reg->type == PTR_TO_MAP_VALUE) {
      		if (t == BPF_WRITE && value_regno >= 0 &&
                  /* is_pointer_value() is essentially:
                   *	return allow_ptr_leaks ? false : reg->type != SCALAR_VALUE;
                   * so storing a pointer into a map value is refused unless
                   * pointer leaks are allowed.
                   */
      		    is_pointer_value(env, value_regno)) {
      			return -EACCES;
      		}
              /* type == BPF_WRITE && !(bpf_map_flags_to_cap(map) & BPF_MAP_CAN_WRITE)
               * type == BPF_READ && !(bpf_map_flags_to_cap(map) & BPF_MAP_CAN_READ)
               * will return -EACCES
               */
      		err = check_map_access_type(env, regno, off, size, t);
      		if (err)
      			return err;
              /* without a spin lock in the value, check_map_access() is just a
               * wrapper around check_mem_region_access()
               */
      		err = check_map_access(env, regno, off, size, false);
      		if (!err && t == BPF_READ && value_regno >= 0) {
      			struct bpf_map *map = reg->map_ptr;
      
      			/* if map is read-only, track its contents as scalars */
      			if (tnum_is_const(reg->var_off) &&
      			    bpf_map_is_rdonly(map) &&
      			    map->ops->map_direct_value_addr) {
      				int map_off = off + reg->var_off.value;
      				u64 val = 0;
      
      				err = bpf_map_direct_read(map, map_off, size,
      							  &val);
      				if (err)
      					return err;
      
      				regs[value_regno].type = SCALAR_VALUE;
      				__mark_reg_known(&regs[value_regno], val);
      			} else {
      				mark_reg_unknown(env, regs, value_regno);
      			}
      		}
      	} else if (reg->type == PTR_TO_MEM) {
      		if (t == BPF_WRITE && value_regno >= 0 &&
      		    is_pointer_value(env, value_regno)) { /* no pointer leaks */
      			return -EACCES;
      		}
      		err = check_mem_region_access(env, regno, off, size,
      					      reg->mem_size, false);
      		if (!err && t == BPF_READ && value_regno >= 0) /* loaded value is unknown */
      			mark_reg_unknown(env, regs, value_regno);
      	} else if (reg->type == PTR_TO_CTX) {
      		enum bpf_reg_type reg_type = SCALAR_VALUE;
      		struct btf *btf = NULL;
      		u32 btf_id = 0;
      
      		if (t == BPF_WRITE && value_regno >= 0 &&
      		    is_pointer_value(env, value_regno)) { /* no pointer leaks */
      			return -EACCES;
      		}
      
      		err = check_ctx_reg(env, reg, regno);
      		if (err < 0)
      			return err;
      
      		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
              /* TODO: the ctx handling has not been analysed in these notes yet */
      		if (!err && t == BPF_READ && value_regno >= 0) {
      			/* ctx access returns either a scalar, or a
      			 * PTR_TO_PACKET[_META,_END]. In the latter
      			 * case, we know the offset is zero.
      			 */
      			if (reg_type == SCALAR_VALUE) {
      				mark_reg_unknown(env, regs, value_regno);
      			} else {
      				mark_reg_known_zero(env, regs,
      						    value_regno);
      				if (reg_type_may_be_null(reg_type))
      					regs[value_regno].id = ++env->id_gen;
      				/* A load of ctx field could have different
      				 * actual load size with the one encoded in the
      				 * insn. When the dst is PTR, it is for sure not
      				 * a sub-register.
      				 */
      				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
      				if (reg_type == PTR_TO_BTF_ID ||
      				    reg_type == PTR_TO_BTF_ID_OR_NULL) {
      					regs[value_regno].btf = btf;
      					regs[value_regno].btf_id = btf_id;
      				}
      			}
      			regs[value_regno].type = reg_type;
      		}
      
      	} else if (reg->type == PTR_TO_STACK) {
      		/* Basic bounds checks. */
              /* Check that the stack access at 'regno + off' falls within the maximum stack bounds */
      		err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
      		if (err)
      			return err;
      
              /* func() returns env->cur_state->frame[reg->frameno] */
              state = func(env, reg);
              /* if the recorded stack depth is smaller than this access needs,
               * update_stack_depth() performs
               * env->subprog_info[func->subprogno].stack_depth = -off
               */
      		err = update_stack_depth(env, state, off);
      		if (err)
      			return err;
      
              /* TODO: check_stack_read()/check_stack_write() are large and are
               * not analysed in these notes yet
               */
      		if (t == BPF_READ)
      			err = check_stack_read(env, regno, off, size,
      					       value_regno);
      		else
      			err = check_stack_write(env, regno, off, size,
      						value_regno, insn_idx);
      	} else if (reg_is_pkt_pointer(reg)) {
      		/* packet pointer case: body omitted from this excerpt */
      	} else if (reg->type == PTR_TO_FLOW_KEYS) {
      		/* flow keys case: body omitted from this excerpt */
      	} else if (type_is_sk_pointer(reg->type)) {
      		/* socket pointer case: body omitted from this excerpt */
      	} else if (reg->type == PTR_TO_TP_BUFFER) {
      		/* tracepoint buffer case: body omitted from this excerpt */
      	} else if (reg->type == PTR_TO_BTF_ID) {
      		/* BTF id pointer case: body omitted from this excerpt */
      	} else if (reg->type == CONST_PTR_TO_MAP) {
      		err = check_ptr_to_map_access(env, regs, regno, off, size, t,
      					      value_regno);
      	} else if (reg->type == PTR_TO_RDONLY_BUF) {
      		if (t == BPF_WRITE) { /* read only */
      			return -EACCES;
      		}
      		err = check_buffer_access(env, reg, regno, off, size, false,
      					  "rdonly",
      					  &env->prog->aux->max_rdonly_access);
      		if (!err && value_regno >= 0)
      			mark_reg_unknown(env, regs, value_regno);
      	} else if (reg->type == PTR_TO_RDWR_BUF) {
      		err = check_buffer_access(env, reg, regno, off, size, false,
      					  "rdwr",
      					  &env->prog->aux->max_rdwr_access);
      		if (!err && t == BPF_READ && value_regno >= 0)
      			mark_reg_unknown(env, regs, value_regno);
      	} else {
      		/* invalid mem access */
      		return -EACCES;
      	}
          /* Observation: on the BPF_READ paths above the receiving register is
           * generally marked unknown, unless something more precise is known
           * (read-only map contents, pointer-typed ctx fields). Per the author's
           * reading, mark_reg_unknown() resets almost the whole register state.
           */
      
          /* only loads need the adjustment below; BPF_WRITE does not */
      	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
      	    regs[value_regno].type == SCALAR_VALUE) {
      		/* b/h/w load zero-extends, mark upper bits as known 0 */
              /* truncate register to smaller size (in bytes) */
              /* https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/verifier.c#L3778 */
      		coerce_reg_to_size(&regs[value_regno], size);
      	}
      	return err;
      }
      • 各種 check memory/reg 的權限,其中 check_mem_region_access() 的分析如下:

        /*
         * check_mem_region_access() - bounds-check an access through a pointer
         * that may carry a variable offset.
         *
         * @env:               verifier environment
         * @regno:             register holding the pointer
         * @off:               constant part of the offset
         * @size:              access size in bytes
         * @mem_size:          size of the memory region being accessed
         * @zero_size_allowed: forwarded to __check_mem_access()
         *
         * The access is validated twice with __check_mem_access(): once at the
         * smallest possible offset (reg->smin_value + off) and once at the
         * largest (reg->umax_value + off). Returns 0 or a negative errno.
         * (Part of the function is elided with "..." in this excerpt.)
         */
        static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
        				   int off, int size, u32 mem_size,
        				   bool zero_size_allowed)
        {
        	struct bpf_verifier_state *vstate = env->cur_state;
        	struct bpf_func_state *state = vstate->frame[vstate->curframe];
        	struct bpf_reg_state *reg = &state->regs[regno];
        	int err;
        
        	...
        
            /* Reject when smin is negative and any of the following holds:
             *  - smin is S64_MIN (it cannot go any lower),
             *  - off + reg->smin_value does not survive a round trip through s32,
             *  - reg->smin_value + off is negative.
             */
        	if (reg->smin_value < 0 &&
        	    (reg->smin_value == S64_MIN ||
        	     (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
        	      reg->smin_value + off < 0)) {
        		return -EACCES;
        	}
        	err = __check_mem_access(env, regno, reg->smin_value + off, size,
        				 mem_size, zero_size_allowed);
        	if (err) {
        		return err;
        	}
        
        	/* If we haven't set a max value then we need to bail since we can't be
        	 * sure we won't do bad things.
        	 * If reg->umax_value + off could overflow, treat that as unbounded too.
        	 */
            /* unbounded memory access */
        	if (reg->umax_value >= BPF_MAX_VAR_OFF /* (1 << 29), 0x20000000 */) {
        		return -EACCES;
        	}
        	err = __check_mem_access(env, regno, reg->umax_value + off, size,
        				 mem_size, zero_size_allowed);
        	if (err) {
        		return err;
        	}
        
        	return 0;
        }
      • __check_mem_access() 檢查 memory region read/write:

        /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
        /*
         * @off is the fully resolved offset, @size the access size in bytes and
         * @mem_size the size of the region. The access is valid when @off is
         * non-negative, @size is positive (or zero when @zero_size_allowed) and
         * off + size does not exceed @mem_size. Returns 0 if valid, -EACCES
         * otherwise.
         */
        static int __check_mem_access(struct bpf_verifier_env *env, int regno,
        			      int off, int size, u32 mem_size,
        			      bool zero_size_allowed)
        {
        	bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
        	struct bpf_reg_state *reg;
        
        	/* off is widened to u64 before the addition, so off + size cannot wrap */
        	if (off >= 0 && size_ok && (u64)off + size <= mem_size)
        		return 0;
        
        	reg = &cur_regs(env)[regno];
        	switch (reg->type) {
                /* the access is already known to be invalid here; the cases
                 * (omitted from this excerpt) only pick a verbose message
                 * according to the register type
                 */
        	}
        
        	return -EACCES;
        }
      • stack 的 bound check function check_stack_access_within_bounds():

        /* 'off' includes `regno->offset`, but not its dynamic part (if any). */
        /* ACCESS_DIRECT = 1,  the access is performed by an instruction
         * ACCESS_HELPER = 2,  the access is performed by a helper
         */
        /*
         * check_stack_access_within_bounds() - check that a stack access through
         * register @regno at offset @off of @access_size bytes stays inside the
         * stack. Computes the lowest and highest byte offsets that may be
         * touched and validates both with check_stack_slot_within_bounds().
         * Returns 0 on success, -EACCES for an unbounded variable offset, or the
         * error from check_stack_slot_within_bounds().
         */
        static int check_stack_access_within_bounds(
        		struct bpf_verifier_env *env,
        		int regno, int off, int access_size,
        		enum stack_access_src src, enum bpf_access_type type)
        {
        	struct bpf_reg_state *regs = cur_regs(env);
        	struct bpf_reg_state *reg = regs + regno;
        	struct bpf_func_state *state = func(env, reg);
        	int min_off, max_off;
        	int err;
        	char *err_extra;
        
        	/* err_extra is only used by the verbose message omitted below */
        	if (src == ACCESS_HELPER)
        		/* We don't know if helpers are reading or writing (or both). */
        		err_extra = " indirect access to";
        	else if (type == BPF_READ)
        		err_extra = " read from";
        	else
        		err_extra = " write to";
        
            /* var_off is a struct tnum: a mask bit of 1 means the bit is unknown,
             * so mask == 0 means every bit is known, i.e. the offset is constant
             */
        	if (tnum_is_const(reg->var_off) /* !reg->var_off.mask */) {
        		min_off = reg->var_off.value + off;
        		if (access_size > 0)
                    /* access_size is the access width in bytes; check_mem_access()
                     * passes the result of bpf_size_to_bytes()
                     */
        			max_off = min_off + access_size - 1;
        		else
        			max_off = min_off;
        	} else {
                /* invalid unbounded variable-offset */
        		if (reg->smax_value >= BPF_MAX_VAR_OFF /* >= 0x20000000 */ ||
        		    reg->smin_value <= -BPF_MAX_VAR_OFF /* <= -0x20000000 */) {
        			return -EACCES;
        		}
        		min_off = reg->smin_value + off;
        		if (access_size > 0)
        			max_off = reg->smax_value + off + access_size - 1;
        		else
        			max_off = min_off;
        	}
        	/* Check that the stack access at the given offset is within bounds
        	 * maximum valid offset is -1
        	 * minimum valid offset is -MAX_BPF_STACK (-512) for write, -state->allocated_stack for
        	 * read
        	 *
        	 * i.e. a valid stack offset satisfies -MAX_BPF_STACK <= off <= -1
        	 * (for writes; reads use -state->allocated_stack as the lower bound)
        	 */
        	err = check_stack_slot_within_bounds(min_off, state, type);
        	if (!err)
        		err = check_stack_slot_within_bounds(max_off, state, type);
        
        	if (err) {
        		/* verbose */
        	}
        	return err;
        }
    • 第四部份為 BPF_ST:

      		...
      		else if (class == BPF_ST) {
      			if (BPF_MODE(insn->code) != BPF_MEM ||
      			    insn->src_reg != BPF_REG_0) {
      				verbose(env, "BPF_ST uses reserved fields\n");
      				return -EINVAL;
      			}
      			/* check src operand */
      			err = check_reg_arg(env, insn->dst_reg, SRC_OP);
      			if (err)
      				return err;
      
      			if (is_ctx_reg(env, insn->dst_reg)) {
      				verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
      					insn->dst_reg,
      					reg_type_str[reg_state(env, insn->dst_reg)->type]);
      				return -EACCES;
      			}
      
      			/* check that memory (dst_reg + off) is writeable */
      			err = check_mem_access(env, env->insn_idx, insn->dst_reg,
      					       insn->off, BPF_SIZE(insn->code),
      					       BPF_WRITE, -1, false);
      			if (err)
      				return err;
      
      		}
      		...
    • 第五部分為 JMP 系列:

      		...
      		else if (class == BPF_JMP || class == BPF_JMP32) {
      			u8 opcode = BPF_OP(insn->code);
      
      			env->jmps_processed++;
                  /* function call */
      			if (opcode == BPF_CALL) {
                      /* reserved fields */
      				if (BPF_SRC(insn->code) != BPF_K ||
      				    insn->off != 0 ||
      				    (insn->src_reg != BPF_REG_0 &&
      				     insn->src_reg != BPF_PSEUDO_CALL &&
      				     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
      				    insn->dst_reg != BPF_REG_0 ||
      				    class == BPF_JMP32) {
      					return -EINVAL;
      				}
      
                      /* function call 不能 hold lock */
      				if (env->cur_state->active_spin_lock &&
      				    (insn->src_reg == BPF_PSEUDO_CALL ||
      				     insn->imm != BPF_FUNC_spin_unlock)) {
      					return -EINVAL;
      				}
      				if (insn->src_reg == BPF_PSEUDO_CALL)
      					err = check_func_call(env, insn, &env->insn_idx);
      				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
      					err = check_kfunc_call(env, insn);
      				else
      					err = check_helper_call(env, insn, &env->insn_idx);
                      
      				if (err)
      					return err;
      			} else if (opcode == BPF_JA) {
                      /* reserved field */
      				if (BPF_SRC(insn->code) != BPF_K ||
      				    insn->imm != 0 ||
      				    insn->src_reg != BPF_REG_0 ||
      				    insn->dst_reg != BPF_REG_0 ||
      				    class == BPF_JMP32) {
      					return -EINVAL;
      				}
      				
                      /* 直接跳過 insn->off 個 insn */
      				env->insn_idx += insn->off + 1;
      				continue;
      
      			} else if (opcode == BPF_EXIT) {
                      /* reserved field */
      				if (BPF_SRC(insn->code) != BPF_K ||
      				    insn->imm != 0 ||
      				    insn->src_reg != BPF_REG_0 ||
      				    insn->dst_reg != BPF_REG_0 ||
      				    class == BPF_JMP32) {
      					return -EINVAL;
      				}
      
                      /* missing spinlock,應該是要被 release 的 ? */
      				if (env->cur_state->active_spin_lock) {
      					return -EINVAL;
      				}
      
      				if (state->curframe) {
      					/* exit from nested function */
      					err = prepare_func_exit(env, &env->insn_idx);
      					if (err)
      						return err;
      					do_print_state = true;
      					continue;
      				}
      
      				err = check_reference_leak(env);
      				if (err)
      					return err;
      
      				err = check_return_code(env);
      				if (err)
      					return err;
      process_bpf_exit:
      				update_branch_counts(env, env->cur_state);
      				err = pop_stack(env, &prev_insn_idx,
      						&env->insn_idx, pop_log);
      				if (err < 0) {
      					if (err != -ENOENT)
      						return err;
      					break;
      				} else {
      					do_print_state = true;
      					continue;
      				}
      			} else { /* 其他的 condition jump */
      				err = check_cond_jmp_op(env, insn, &env->insn_idx);
      				if (err)
      					return err;
      			}
      		}
      	...
      • 一共有三種呼叫方式:

        • BPF_PSEUDO_CALL - check_func_call()
        • BPF_PSEUDO_KFUNC_CALL - check_kfunc_call()
        • other - check_helper_call()
      • check_helper_call(),在 __BPF_FUNC_MAPPER (src) 當中有定義 fn 對應到的 function:

        /*
         * check_helper_call() - verify a BPF_CALL to an in-kernel helper.
         *
         * @env:        verifier environment
         * @insn:       the call instruction; insn->imm is the helper id
         * @insn_idx_p: pointer to the current instruction index
         *
         * Looks up the helper prototype for the program type, checks that the
         * program may call it, validates the five argument registers, then
         * resets the caller-saved registers and sets up R0 according to the
         * helper's return type. Returns 0 on success or a negative errno.
         * (This is an abridged excerpt: several branches are replaced by
         * placeholder comments and do not compile as shown.)
         */
        static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
        			     int *insn_idx_p)
        {
        	const struct bpf_func_proto *fn = NULL;
        	struct bpf_reg_state *regs;
        	struct bpf_call_arg_meta meta;
        	int insn_idx = *insn_idx_p;
        	bool changes_data;
        	int i, err, func_id;
        
        	/* find function prototype */
        	func_id = insn->imm;
        	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID /* 165 */) {
        		return -EINVAL;
        	}
        
            /* e.g. sk_filter_func_proto */
        	if (env->ops->get_func_proto)
                /* Taking lookup_elem as an example: the search starts in
                 * sk_filter_func_proto and goes down layer by layer looking for a
                 * prototype matching func_id and the program type, and finally
                 * finds bpf_map_lookup_elem_proto in bpf_base_func_proto.
                 */
        		fn = env->ops->get_func_proto(func_id, env->prog);
        	if (!fn) { /* unknown */
        		return -EINVAL;
        	}
        
        	/* eBPF programs must be GPL compatible to use GPL-ed functions */
            /* a gpl_only helper requires a GPL-compatible program license */
        	if (!env->prog->gpl_compatible && fn->gpl_only) {
        		return -EINVAL;
        	}
        
            /* the helper's own ->allowed() callback may refuse this program
             * (the author notes: "not allowed to be called in probe")
             */
        	if (fn->allowed && !fn->allowed(env->prog)) {
        		return -EINVAL;
        	}
        
        	/* With LD_ABS/IND some JITs save/restore skb from r1. */
            /* some helpers modify packet contents */
        	changes_data = bpf_helper_changes_pkt_data(fn->func);
            /* a helper that changes packet data but does not take the ctx in r1
             * is a misconfigured prototype
             */
        	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
        		return -EINVAL;
        	}
        
        	memset(&meta, 0, sizeof(meta));
        	meta.pkt_access = fn->pkt_access;
        
            /* check_func_proto() sanity-checks the prototype itself. Author's
             * notes on its sub-checks (confirm against the kernel source):
             * check_raw_mode_ok() - at most one argument may be
             *                       ARG_PTR_TO_UNINIT_MEM
             * check_arg_pair_ok() - arg1 must not be a const(-size) type, arg5
             *                       must not be a ptr_to_mem type, and argN /
             *                       argN+1 must form a (ptr, size) or
             *                       (size, ptr) pair, otherwise a ptr argument
             *                       cannot be used
             * check_btf_id_ok()   - ARG_PTR_TO_BTF_ID and fn->arg_btf_id[i] must
             *                       be both present or both absent
             * check_refcount_ok() - at most one ARG_PTR_TO_SOCK_COMMON argument
             *                       (unref)
             */
        	err = check_func_proto(fn, func_id);
        	if (err) { /* misconfig */
        		return err;
        	}
        
        	meta.func_id = func_id;
        	/* check args */
        	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS /* 5 */; i++) {
                /* TODO: check_func_arg() is large and not analysed here yet */
        		err = check_func_arg(env, i, &meta, fn);
        		if (err)
        			return err;
        	}
        
            /* updates env->insn_aux_data[insn_idx]->map_ptr_state */
        	err = record_func_map(env, &meta, func_id, insn_idx);
        	if (err)
        		return err;
            
            /* updates env->insn_aux_data[insn_idx]->map_key_state */
        	err = record_func_key(env, &meta, func_id, insn_idx);
        	if (err)
        		return err;
        
        	/* Mark slots with STACK_MISC in case of raw mode, stack offset
        	 * is inferred from register state.
        	 */
        	/* meta was zeroed by the memset above, but &meta is handed to
        	 * check_func_arg() in the argument loop; presumably that is where
        	 * access_size and regno get filled in -- confirm.
        	 */
        	for (i = 0; i < meta.access_size; i++) {
        		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
        				       BPF_WRITE, -1, false);
        		if (err)
        			return err;
        	}
        
        	if (func_id == BPF_FUNC_tail_call) {
        		/* ... */
        	} else if (is_release_function(func_id)) {
        		/* ... */
        	}
        
        	regs = cur_regs(env);
        
        	if (func_id == BPF_FUNC_get_local_storage &&
                /* R2 must be null for get_local_storage */
        	    !register_is_null(&regs[BPF_REG_2])) {
        		return -EINVAL;
        	}
        
        	if (func_id == BPF_FUNC_for_each_map_elem) {
        		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
        					set_map_elem_callback_state);
        		if (err < 0)
        			return -EINVAL;
        	}
        
        	if (func_id == BPF_FUNC_snprintf) {
        		err = check_bpf_snprintf_call(env, regs);
        		if (err < 0)
        			return err;
        	}
        
        	/* reset caller saved regs */
        	for (i = 0; i < CALLER_SAVED_REGS; i++) {
                /* mark_reg_not_init() is essentially:
                 * __mark_reg_unknown(env, reg); reg->type = NOT_INIT;
                 */
        		mark_reg_not_init(env, regs, caller_saved[i]);
        		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        	}
        
        	/* helper call returns 64-bit value. */
        	/* open question from the author: why does a 32-bit write need a
        	 * sub-register definition at all?
        	 */
        	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
        
        	/* update return register (already marked as written above) */
        	if (fn->ret_type == RET_INTEGER) {
        		/* sets type to SCALAR_VALUE */
                /* i.e. an unknown register is a scalar of unknown value */
        		mark_reg_unknown(env, regs, BPF_REG_0);
        	} else if (fn->ret_type == RET_VOID) {
                /* a void helper leaves R0 as NOT_INIT, so reading it afterwards
                 * fails the SRC_OP check in check_reg_arg()
                 */
        		regs[BPF_REG_0].type = NOT_INIT;
        	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
        		   fn->ret_type == RET_PTR_TO_MAP_VALUE) {
                /* no offset has been applied yet, so start from known zero */
        		mark_reg_known_zero(env, regs, BPF_REG_0);
        		/* meta.map_ptr is set in check_func_arg and points to the map */
                /* https://elixir.bootlin.com/linux/v5.13.11/source/kernel/bpf/verifier.c#L4927 */
        		if (meta.map_ptr == NULL) {
        			return -EINVAL;
        		}
        		regs[BPF_REG_0].map_ptr = meta.map_ptr;
        		if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
        			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
        			if (map_value_has_spin_lock(meta.map_ptr))
        				regs[BPF_REG_0].id = ++env->id_gen;
        		} else {
        			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
        		}
            } else if (...) {
        	    /* several similar else-if cases are omitted from this excerpt */
        	} else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
        		/* messy; body omitted from this excerpt */
        	} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
        		   fn->ret_type == RET_PTR_TO_BTF_ID) {
        		/* messy; body omitted from this excerpt */	
        	} else { /* unknown */
        		return -EINVAL;
        	}
        	/* observation: for return types ending in _OR_NULL, R0 seems to be
        	 * set to known zero first
        	 */
        
            /* true for every register type whose name ends in _OR_NULL */
        	if (reg_type_may_be_null(regs[BPF_REG_0].type))
                /* give the register a fresh id; this happens for _OR_NULL
                 * returns, for map values with a spin lock (above), and in a few
                 * other special cases
                 */
        		regs[BPF_REG_0].id = ++env->id_gen;
        
        	if (is_ptr_cast_function(func_id)) { /* func_id is one of the BPF_FUNC_* ids */
        		/* For release_reference() */
        		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
        	} else if (is_acquire_function(func_id, meta.map_ptr)) {
        		/* not understood yet; body omitted from this excerpt */
        	}
        
        	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
        
        	/* A helper may only work with certain map types, and a map type may
        	 * only be used with certain helpers, so compatibility is checked
        	 * from both the map's and the helper's point of view.
        	 */
        	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
        	if (err)
        		return err;
        
        	/* helpers related to get_stack / get_task_stack */
        	/* NOTE(review): the excerpt drops the statement that assigns err
        	 * inside this branch, so "if (err)" below is dead as shown;
        	 * presumably upstream allocates the callchain buffers here
        	 * (get_callchain_buffers()) -- verify against the kernel source.
        	 */
        	if ((func_id == BPF_FUNC_get_stack ||
        	     func_id == BPF_FUNC_get_task_stack) &&
        	    !env->prog->has_callchain_buf) {
        		if (err) {
        			return err;
        		}
        		env->prog->has_callchain_buf = true;
        	}
        
        	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
        		env->prog->call_get_stack = true;
        
        	/* changes_data is the return value of
        	 * bpf_helper_changes_pkt_data(fn->func): the helper may modify
        	 * packet contents
        	 */
        	if (changes_data)
        		/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
                 * are now invalid, so turn them into unknown SCALAR_VALUE.
                 *
                 * Roughly: every register holding a packet pointer is passed to
                 * mark_reg_unknown(). The function also handles "spilled"
                 * registers; the author has not looked into what those are.
                 */
        		clear_all_pkt_pointers(env);
        	return 0;
        }
        • 不同 helper function 使用到的 func prototype 不太一樣,可以參考 src,下方為使用到的 struct bpf_func_proto:

          /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
           * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
           * instructions after verifying
           *
           * The two anonymous unions below expose the same five argument slots both
           * by name (arg1_type ... arg5_type) and by index (arg_type[0..4]).
           */
          struct bpf_func_proto {
          	/* the helper itself: up to five u64 arguments, u64 result */
          	u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
          	bool gpl_only;	/* presumably: helper restricted to GPL-compatible programs - confirm */
          	bool pkt_access;	/* presumably: helper may be given packet pointers - confirm */
          	enum bpf_return_type ret_type;	/* drives how the verifier types R0 after the call */
          	union {
          		struct {
          			enum bpf_arg_type arg1_type;
          			enum bpf_arg_type arg2_type;
          			enum bpf_arg_type arg3_type;
          			enum bpf_arg_type arg4_type;
          			enum bpf_arg_type arg5_type;
          		};
          		enum bpf_arg_type arg_type[5];
          	};
          	union {
          		struct {
          			u32 *arg1_btf_id;
          			u32 *arg2_btf_id;
          			u32 *arg3_btf_id;
          			u32 *arg4_btf_id;
          			u32 *arg5_btf_id;
          		};
          		u32 *arg_btf_id[5];
          	};
          	int *ret_btf_id; /* return value btf_id */
          	/* optional per-program predicate; semantics not shown in these notes */
          	bool (*allowed)(const struct bpf_prog *prog);
          };
          
          /* Example: the prototype describing the bpf_map_lookup_elem() helper.
           * Only two of the five argument slots are used; the rest stay
           * zero-initialized.
           */
          const struct bpf_func_proto bpf_map_lookup_elem_proto = {
          	.func		= bpf_map_lookup_elem,
          	.gpl_only	= false,
          	.pkt_access	= true,
          	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
          	.arg1_type	= ARG_CONST_MAP_PTR, /* r1 must hold a pointer to the map; userland can obtain it with BPF_LD_MAP_FD */
          	.arg2_type	= ARG_PTR_TO_MAP_KEY, /* r2: pointer to the key */
          };
        • conditional jmp (像是 JEQ、JNE 等等) 會交由 check_cond_jmp_op() 處理 (src):

          /* Verify one conditional jump instruction (abridged excerpt; "..." marks
           * parts omitted from these notes).
           *
           * Steps visible below:
           *  1. validate the reserved fields and operand registers;
           *  2. try to decide the branch statically with is_branch_taken();
           *  3. if decided, follow only that direction (adjusting *insn_idx for a
           *     taken branch) and optionally push the other one as a speculative path;
           *  4. otherwise push the jump target with push_stack() and refine the
           *     register ranges of both branches.
           *
           * Returns 0 on success, or -EINVAL / -EACCES / -EFAULT (or the error of a
           * callee) on failure.
           */
          static int check_cond_jmp_op(struct bpf_verifier_env *env,
          			     struct bpf_insn *insn, int *insn_idx)
          {
          	struct bpf_verifier_state *this_branch = env->cur_state;
          	struct bpf_verifier_state *other_branch;
          	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
          	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
          	u8 opcode = BPF_OP(insn->code);
          	bool is_jmp32;
          	int pred = -1;
          	int err;
          
          	/* Only conditional jumps are expected to reach here. */
              /* NOTE(review): unclear how another opcode could get here; presumably a
               * defensive check - confirm against the caller.
               */
          	if (opcode == BPF_JA || opcode > BPF_JSLE) {
          		return -EINVAL;
          	}
          
          	if (BPF_SRC(insn->code) == BPF_X) {
          		if (insn->imm != 0) { /* BPF_X: register operand, so imm is reserved and must be 0 */
          			return -EINVAL;
          		}
          
          		/* check src1 operand */
          		err = check_reg_arg(env, insn->src_reg, SRC_OP);
          		if (err)
          			return err;
          
                  /* comparing against a pointer value is rejected (as decided by is_pointer_value()) */
          		if (is_pointer_value(env, insn->src_reg)) {
          			return -EACCES;
          		}
          		src_reg = &regs[insn->src_reg];
          	} else {
          		if (insn->src_reg != BPF_REG_0) { /* BPF_K: immediate operand, so src_reg is reserved */
          			return -EINVAL;
          		}
          	}
          
          	/* check src2 operand */
          	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
          	if (err)
          		return err;
          
          	dst_reg = &regs[insn->dst_reg];
          	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; /* if false, jmp64 */
          
          	if (BPF_SRC(insn->code) == BPF_K) { /* immediate operand */
                  /* try to predict the branch outcome statically */
          		pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
          	} else if (src_reg->type == SCALAR_VALUE &&
          		   is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
                  /* Register operand whose low 32 bits are a known constant, for a
                   * jmp32: compare against the subreg value
                   * (tnum_subreg is presumably tnum_cast(a, 4) - confirm).
                   */
          		pred = is_branch_taken(dst_reg,
          				       tnum_subreg(src_reg->var_off).value,
          				       opcode,
          				       is_jmp32);
          	} else if (src_reg->type == SCALAR_VALUE &&
          		   !is_jmp32 && tnum_is_const(src_reg->var_off)) {
                  /* same as above, but for jmp64 with the full 64-bit constant */
          		pred = is_branch_taken(dst_reg,
          				       src_reg->var_off.value,
          				       opcode,
          				       is_jmp32);
          	} else if (/* packet-pointer related cases, omitted */) { ... }
          
              /* 1 - taken, 0 - not taken, -1 - unknown */
          	if (pred >= 0) {
          		/* If we get here with a dst_reg pointer type it is because
          		 * above is_branch_taken() special cased the 0 comparison.
          		 */
                  /* NOTE(review): despite the name, mark_chain_precision() is not about
                   * Markov chains; presumably it marks the register (and the chain of
                   * states that produced it) as requiring precise tracking - confirm.
                   */
          		if (!__is_pointer_value(false, dst_reg))
          			err = mark_chain_precision(env, insn->dst_reg);
          		if (BPF_SRC(insn->code) == BPF_X && !err &&
          		    !__is_pointer_value(false, src_reg))
          			err = mark_chain_precision(env, insn->src_reg);
          		if (err)
          			return err;
          	}
          
          	if (pred == 1) {
          		/* Only follow the goto, ignore fall-through. If needed, push
          		 * the fall-through branch for simulation under speculative
          		 * execution.
          		 */
          		if (!env->bypass_spec_v1 &&
          		    !sanitize_speculative_path(env, insn, *insn_idx + 1,
          					       *insn_idx))
          			return -EFAULT;
          		*insn_idx += insn->off;
          		return 0;
          	} else if (pred == 0) {
          		/* Only follow the fall-through branch, since that's where the
          		 * program will go. If needed, push the goto branch for
          		 * simulation under speculative execution.
          		 */
          		if (!env->bypass_spec_v1 &&
          		    !sanitize_speculative_path(env, insn,
          					       *insn_idx + insn->off + 1,
          					       *insn_idx))
          			return -EFAULT;
          		return 0;
          	}
          
              /* Outcome unknown: push the jump target; push_stack() returns the new
               * struct bpf_verifier_state that will be explored later.
               */
          	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
          				  false);
          	if (!other_branch)
          		return -EFAULT;
          	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
          
              /* Detect whether we are comparing against a constant so that the
               * min/max bounds of dst_reg can be adjusted. This is only legitimate
               * when both operands are scalars (or, in the future, pointers to the
               * same object); otherwise different base pointers make the offsets
               * incomparable.
               */
              /* reg_set_min_max() is the function that narrows the register value range */
          	if (BPF_SRC(insn->code) == BPF_X) { /* register operation */
          		struct bpf_reg_state *src_reg = &regs[insn->src_reg];
          
                  /* both operands are scalars */
          		if (dst_reg->type == SCALAR_VALUE &&
          		    src_reg->type == SCALAR_VALUE) {
          			if (tnum_is_const(src_reg->var_off) ||
          			    (is_jmp32 &&
          			     tnum_is_const(tnum_subreg(src_reg->var_off))))
          				reg_set_min_max(&other_branch_regs[insn->dst_reg],
          						dst_reg,
          						src_reg->var_off.value,
          						tnum_subreg(src_reg->var_off).value,
          						opcode, is_jmp32);
          			else if (tnum_is_const(dst_reg->var_off) ||
          				 (is_jmp32 &&
          				  tnum_is_const(tnum_subreg(dst_reg->var_off))))
          				reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
          						    src_reg,
          						    dst_reg->var_off.value,
          						    tnum_subreg(dst_reg->var_off).value,
          						    opcode, is_jmp32);
          			else if (!is_jmp32 &&
          				 (opcode == BPF_JEQ || opcode == BPF_JNE))
          				/* Comparing for equality, we can combine knowledge */
          				reg_combine_min_max(&other_branch_regs[insn->src_reg],
          						    &other_branch_regs[insn->dst_reg],
          						    src_reg, dst_reg, opcode);
          			if (src_reg->id &&
          			    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
          				find_equal_scalars(this_branch, src_reg);
          				find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
          			}
          
          		}
          	} else if (dst_reg->type == SCALAR_VALUE) {
          		reg_set_min_max(&other_branch_regs[insn->dst_reg],
          					dst_reg, insn->imm, (u32)insn->imm,
          					opcode, is_jmp32);
          	}
          
          	if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
          	    !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
          		find_equal_scalars(this_branch, dst_reg);
          		find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
          	}
          
          	/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
          	 * NOTE: these optimizations below are related with pointer comparison
          	 *       which will never be JMP32.
          	 */
          	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
          	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
          	    reg_type_may_be_null(dst_reg->type)) {
          		/* Mark all identical registers in each branch as either
          		 * safe or unknown depending R == 0 or R != 0 conditional.
          		 */
                  /* A register of an _OR_NULL type is resolved per branch: presumably
                   * to a known-zero scalar on the NULL side and to the non-NULL pointer
                   * type on the other - confirm in mark_ptr_or_null_regs().
                   */
                  /* NOTE(review): reg->id is presumably the identifier used to find the
                   * "identical registers" mentioned above, not a ticket-spinlock style
                   * number - confirm.
                   */
          		mark_ptr_or_null_regs(this_branch, insn->dst_reg,
          				      opcode == BPF_JNE);
          		mark_ptr_or_null_regs(other_branch, insn->dst_reg,
          				      opcode == BPF_JEQ);
          	} else if (...) {
          		return -EACCES;
          	}
          	return 0;
          }
          • check branch taken 相關的 function is_branch_taken():

            /* compute branch direction of the expression "if (reg opcode val) goto target;"
             * and return:
             *  1 - branch will be taken and "goto target" will be executed
             *  0 - branch will not be taken and fall-through to next insn
             * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
             *      range [0,10]
             *
             * Pointer registers are handled here directly; scalar registers are
             * delegated to is_branch32_taken() / is_branch64_taken() depending on
             * is_jmp32.
             */
            static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
            			   bool is_jmp32)
            {
            	if (__is_pointer_value(false, reg) /* reg->type != SCALAR_VALUE */) {
                    /* Per the v5.13 source (not shown here), reg_type_not_null()
                     * is true for:
                     *   PTR_TO_SOCKET, PTR_TO_TCP_SOCK, PTR_TO_MAP_VALUE,
                     *   PTR_TO_MAP_KEY, PTR_TO_SOCK_COMMON
                     * Any other pointer type may be NULL, so nothing can be decided.
                     */
            		if (!reg_type_not_null(reg->type))
            			return -1;
            
            		/* If pointer is valid tests against zero will fail so we can
            		 * use this to direct branch taken.
            		 */
            		if (val != 0)
            			return -1;
            
            		switch (opcode) {
            		case BPF_JEQ:
            			return 0;
            		case BPF_JNE:
            			return 1;
            		default: /* ordering comparisons (<, >, ...) against 0 cannot be decided */
            			return -1;
            		}
            	}
            
            	if (is_jmp32)
            		return is_branch32_taken(reg, val, opcode);
            	return is_branch64_taken(reg, val, opcode);
            }
          • sanitize_speculative_path():

            /* Push an extra verifier state, flagged as speculative, that starts at
             * next_idx, so that a path the branch predictor might take is also
             * simulated.
             *
             * When insn is non-NULL, the registers it compares are marked unknown in
             * the pushed state: dst_reg for BPF_K, dst_reg and src_reg for BPF_X.
             *
             * Returns the pushed state, or NULL if push_stack() failed.
             */
            static struct bpf_verifier_state *
            sanitize_speculative_path(struct bpf_verifier_env *env,
            			  const struct bpf_insn *insn,
            			  u32 next_idx, u32 curr_idx)
            {
            	struct bpf_verifier_state *branch;
            	struct bpf_reg_state *regs;
            
            	/* last argument 'true' sets the speculative flag on the new state */
            	branch = push_stack(env, next_idx, curr_idx, true);
            	if (branch && insn) {
            		/* operate on the registers of the pushed state, not the current one */
            		regs = branch->frame[branch->curframe]->regs;
            		if (BPF_SRC(insn->code) == BPF_K) {
            			mark_reg_unknown(env, regs, insn->dst_reg);
            		} else if (BPF_SRC(insn->code) == BPF_X) {
            			mark_reg_unknown(env, regs, insn->dst_reg);
            			mark_reg_unknown(env, regs, insn->src_reg);
            		}
            	}
            	return branch;
            }
          • push_stack() 新增一個 bpf_verifier_stack_elem element:

            /* Push a copy of the current verifier state onto the stack of pending
             * branches (env->head), to be explored later starting at insn_idx.
             *
             * @insn_idx:      instruction index where the pushed state resumes
             * @prev_insn_idx: instruction index recorded as its predecessor
             * @speculative:   OR-ed into the copied state's speculative flag
             *
             * Returns a pointer to the copied state stored in the new stack element.
             * On any failure the current state is freed, env->cur_state is cleared,
             * the whole stack is popped and NULL is returned.
             */
            static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
            					     int insn_idx, int prev_insn_idx,
            					     bool speculative)
            {
            	struct bpf_verifier_state *cur = env->cur_state;
            	struct bpf_verifier_stack_elem *elem;
            	int err;
            
            	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
            	if (!elem)
            		goto err;
            
                /* fill in the element and link it at the head of the stack */
            	elem->insn_idx = insn_idx;
            	elem->prev_insn_idx = prev_insn_idx;
            	elem->next = env->head;
            	elem->log_pos = env->log.len_used;
            	env->head = elem;
            	env->stack_size++;
                /* copy the entire current struct bpf_verifier_state into the element */
            	err = copy_verifier_state(&elem->st, cur);
            	if (err)
            		goto err;
                /* elem->st is the state stored in this stack element */
            	elem->st.speculative |= speculative;
            	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { /* too many pending branches: jump sequence too complex */
            		goto err;
            	}
            	if (elem->st.parent) {
            		/* account for one more branch hanging off the parent state */
            		++elem->st.parent->branches;
            	}
            	return &elem->st;
            err:
            	free_verifier_state(env->cur_state, true);
            	env->cur_state = NULL;
            	/* pop all elements and return */
            	while (!pop_stack(env, NULL, NULL, false));
            	return NULL;
            }
    • 最後一個部分為 BPF_LD:

      		... 	
      		else if (class == BPF_LD) {
                  /* BPF_IMM   0x00
                   * BPF_ABS   0x20
                   * BPF_IND   0x40
                   * BPF_MEM   0x60
                   * BPF_LEN   0x80
                   * BPF_MSH   0xa0
                   */
      			u8 mode = BPF_MODE(insn->code);
                  
      			if (mode == BPF_ABS || mode == BPF_IND) {
      				err = check_ld_abs(env, insn);
      				if (err)
      					return err;
      
      			} else if (mode == BPF_IMM) {
      				err = check_ld_imm(env, insn);
      				if (err)
      					return err;
      
      				env->insn_idx++; /* BPF_LD_IMM64_RAW,所以下個 insn 不看 (存 imm 後 32 bit) */
      				sanitize_mark_insn_seen(env); /* mark 成看過的 */
      			} else { /* ABS IND IMM 都不是 */
      				return -EINVAL;
      			}
      		} else  /* 未知的 insn class (invalid) */
      			return -EINVAL;
      		}
      
      		env->insn_idx++;
      	}
      
      	return 0;
      }
      • 其中兩個 check_ld function check_ld_abs() 以及 check_ld_imm()

      • check_ld_abs() 確保以下三件事:

        • 當 ctx == skb (socket buffer) 才會出現 LD_ABS|LD_IND insn

        • 會使用到 R1-R5 regs

        • R6-R9 不會 touch,並且將 return value 放在 R0

        /* Verify a legacy BPF_LD | BPF_ABS / BPF_IND packet-load instruction.
         *
         * Checks visible below: the program type may access an skb, the reserved
         * instruction fields are unused, R6 is a readable and unmodified
         * PTR_TO_CTX, and no bpf_spin_lock is held. On success the caller-saved
         * registers are reset and R0 becomes an unknown scalar holding the loaded
         * value.
         *
         * Returns 0 on success, -EINVAL or a callee's error otherwise.
         */
        static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
        {
        	struct bpf_reg_state *regs = cur_regs(env);
        	static const int ctx_reg = BPF_REG_6;
        	u8 mode = BPF_MODE(insn->code);
        	int i, err;
        
            /* this program type is not allowed to use ABS|IND */
        	if (!may_access_skb(resolve_prog_type(env->prog))) {
        		return -EINVAL;
        	}
        
        	if (!env->ops->gen_ld_abs) { /* misconfigured */
        		return -EINVAL;
        	}
        
            /* uses reserved fields */
        	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
        	    BPF_SIZE(insn->code) == BPF_DW ||
        	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
        		return -EINVAL;
        	}
        
        	/* check whether implicit source operand (register R6) is readable */
            /* R6 holds the skb context; make sure it can be read (SRC_OP) */
        	err = check_reg_arg(env, ctx_reg, SRC_OP);
        	if (err)
        		return err;
        
        	err = check_reference_leak(env); /* NOTE(review): presumably ABS|IND cannot be mixed with held (socket) references - confirm */
        	if (err) {
        		return err;
        	}
        
            /* not allowed while a bpf_spin_lock is active */
        	if (env->cur_state->active_spin_lock) {
        		return -EINVAL;
        	}
        
        	if (regs[ctx_reg].type != PTR_TO_CTX) { /* R6 does not point to the ctx */
        		return -EINVAL;
        	}
        
        	if (mode == BPF_IND) {
                /* BPF_IND also reads src_reg, so it must be readable */
        		err = check_reg_arg(env, insn->src_reg, SRC_OP);
        		if (err)
        			return err;
        	}
        	/* reg->off is the offset applied to the ctx pointer.
             *
             * Make sure the ctx pointer is used in its unmodified form:
             * - reg->off == 0
             * - tnum_is_const(reg->var_off)
             * - !reg->var_off.value
             */
        	err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
        	if (err < 0)
        		return err;
        
        	/* reset caller saved regs to unreadable */
        	for (i = 0; i < CALLER_SAVED_REGS; i++) {
        		mark_reg_not_init(env, regs, caller_saved[i]);
        		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        	}
        
        	/* mark destination R0 register as readable, since it contains
        	 * the value fetched from the packet.
        	 * Already marked as written above.
        	 */
            /* NOTE(review): presumably an unknown scalar counts as readable, in
             * contrast to the not-init state set in the loop above - confirm.
             */
        	mark_reg_unknown(env, regs, BPF_REG_0);
        	/* ld_abs load up to 32-bit skb data. */
        	regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
        	return 0;
        }
      • check_ld_imm():

        /* verify BPF_LD_IMM64 instruction
         *
         * The instruction occupies two struct bpf_insn slots; insn->src_reg
         * selects what is loaded into dst_reg:
         *   0                    - a plain 64-bit scalar constant
         *   BPF_PSEUDO_BTF_ID    - BTF-typed variable (handling abridged in these notes)
         *   BPF_PSEUDO_FUNC      - a static subprogram, as PTR_TO_FUNC
         *   BPF_PSEUDO_MAP_VALUE - a pointer into a map value
         *   BPF_PSEUDO_MAP_FD    - a constant pointer to the map itself
         *
         * Returns 0 on success, -EINVAL (or check_reg_arg()'s error) otherwise.
         */
        static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
        {
        	struct bpf_insn_aux_data *aux = cur_aux(env); /* &env->insn_aux_data[env->insn_idx] */
            /* cur_regs(env) is cur_func(env)->regs, where
             * cur_func: env->cur_state->frame[env->cur_state->curframe]
             */
        	struct bpf_reg_state *regs = cur_regs(env); 
        	struct bpf_reg_state *dst_reg;
        	struct bpf_map *map;
        	int err;
        
        	if (BPF_SIZE(insn->code) != BPF_DW) { /* must be a 64-bit immediate load */
        		return -EINVAL;
        	}
            /* reserved field */
        	if (insn->off != 0) {
        		return -EINVAL;
        	}
        	
            /* make sure dst_reg may be written */
        	err = check_reg_arg(env, insn->dst_reg, DST_OP);
        	if (err)
        		return err;
        
        	dst_reg = &regs[insn->dst_reg];
        	if (insn->src_reg == 0) {
                /* the 64-bit value is split over two insns: low 32 bits in this
                 * one's imm, high 32 bits in the next one's imm
                 */
        		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
        
        		dst_reg->type = SCALAR_VALUE;
        		__mark_reg_known(&regs[insn->dst_reg], imm);
        		return 0;
        	}
        
        	if (insn->src_reg == BPF_PSEUDO_BTF_ID /* 3 */) {
        		mark_reg_known_zero(env, regs, insn->dst_reg);
        		/* NOTE: BTF_ID handling is not explored further in these notes */
        		return 0;
        	}
        
        	if (insn->src_reg == BPF_PSEUDO_FUNC) {
        		/* shadows the outer 'aux' (insn aux data) inside this scope */
        		struct bpf_prog_aux *aux = env->prog->aux;
        		u32 subprogno = insn[1].imm;
        
        		if (!aux->func_info) { /* missing func_info */
        			return -EINVAL;
        		}
                /* the callback function must have static linkage */
                /* NOTE(review): aux->func_info_aux[subprogno] presumably describes the
                 * callback subprogram - confirm.
                 */
        		if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
        			return -EINVAL;
        		}
        
        		dst_reg->type = PTR_TO_FUNC;
        		dst_reg->subprogno = subprogno;
        		return 0;
        	}
        
        	map = env->used_maps[aux->map_index];
        	mark_reg_known_zero(env, regs, insn->dst_reg); /* presumably resets most of the register state to zero - see mark_reg_known_zero() */
        	dst_reg->map_ptr = map;
        
            /* MAP_VALUE: the register points into the map value at offset
             * aux->map_off and gets a fresh id when the map value contains a spin
             * lock. MAP_FD: the register points to the map itself.
             */
        	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
        		dst_reg->type = PTR_TO_MAP_VALUE;
        		dst_reg->off = aux->map_off;
        		if (map_value_has_spin_lock(map))
        			dst_reg->id = ++env->id_gen;
        	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
        		dst_reg->type = CONST_PTR_TO_MAP; /* BPF_PSEUDO_MAP_FD is used to load the map address */
        	} else {
        		return -EINVAL;
        	}
        
        	return 0;
        }
        • env->insn_aux_data[env->insn_idx] 為 insn 的 metadata

Adjust min max

adjust min max 系列一共有三個 function:

  • adjust_reg_min_max_vals() - register,不過只有在 check_alu_op() 被執行
  • adjust_ptr_min_max_vals() - pointer,在 adjust_reg_min_max_vals() 被執行
  • adjust_scalar_min_max_vals() - scalar,在 adjust_reg_min_max_vals() 被執行

adjust_reg_min_max_vals():

/* Dispatch an ALU instruction to the right range-tracking routine based on
 * whether its operands are pointers or scalars:
 *   pointer op pointer -> only BPF_SUB with env->allow_ptr_leaks is accepted
 *                         (result becomes an unknown scalar), else -EACCES
 *   scalar  op pointer -> adjust_ptr_min_max_vals() with the operands swapped
 *   pointer op scalar  -> adjust_ptr_min_max_vals()
 *   pointer op K       -> adjust_ptr_min_max_vals() with a fake constant reg
 *   scalar  op scalar  -> adjust_scalar_min_max_vals()
 *
 * Returns 0 on success or a negative error code.
 */
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
				   struct bpf_insn *insn)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
	/* off_reg is a stack-local stand-in register for an immediate operand */
	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
	u8 opcode = BPF_OP(insn->code);
	int err;

	dst_reg = &regs[insn->dst_reg];
	src_reg = NULL;
	if (dst_reg->type != SCALAR_VALUE)
		ptr_reg = dst_reg;
	else
		/* Make sure ID is cleared otherwise dst_reg min/max could be
		 * incorrectly propagated into other registers by find_equal_scalars()
		 */
		dst_reg->id = 0;
	if (BPF_SRC(insn->code) == BPF_X) {
		src_reg = &regs[insn->src_reg];
		if (src_reg->type != SCALAR_VALUE) {
			if (dst_reg->type != SCALAR_VALUE) {
				/* Combining two pointers by any ALU op yields
				 * an arbitrary scalar. Disallow all math except
				 * pointer subtraction
				 */
				if (opcode == BPF_SUB && env->allow_ptr_leaks) {
					mark_reg_unknown(env, regs, insn->dst_reg);
					return 0;
				}
				verbose(env, "R%d pointer %s pointer prohibited\n",
					insn->dst_reg,
					bpf_alu_string[opcode >> 4]);
				return -EACCES;
			} else {
				/* scalar += pointer
				 * This is legal, but we have to reverse our
				 * src/dest handling in computing the range
				 */
				err = mark_chain_precision(env, insn->dst_reg);
				if (err)
					return err;
				return adjust_ptr_min_max_vals(env, insn,
							       src_reg, dst_reg);
			}
		} else if (ptr_reg) {
			/* pointer += scalar */
			err = mark_chain_precision(env, insn->src_reg);
			if (err)
				return err;
			return adjust_ptr_min_max_vals(env, insn,
						       dst_reg, src_reg);
		}
	} else {
		/* Pretend the src is a reg with a known value, since we only
		 * need to be able to read from this state.
		 */
		off_reg.type = SCALAR_VALUE;
		__mark_reg_known(&off_reg, insn->imm);
		src_reg = &off_reg;
		if (ptr_reg) /* pointer += K */
			return adjust_ptr_min_max_vals(env, insn,
						       ptr_reg, src_reg);
	}

	/* Got here implies adding two SCALAR_VALUEs */
	if (WARN_ON_ONCE(ptr_reg)) {
		print_verifier_state(env, state);
		verbose(env, "verifier internal error: unexpected ptr_reg\n");
		return -EINVAL;
	}
	if (WARN_ON(!src_reg)) {
		print_verifier_state(env, state);
		verbose(env, "verifier internal error: no src_reg\n");
		return -EINVAL;
	}
	/* note: the source register state is passed by value here */
	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}

insn 的 struct bpf_insn:

/* One eBPF instruction. A 64-bit immediate load (BPF_LD_IMM64) spans two
 * consecutive instructions, with the upper 32 bits taken from the second
 * instruction's imm field.
 */
struct bpf_insn {
	__u8	code;		/* opcode */
	__u8	dst_reg:4;	/* dest register */
	__u8	src_reg:4;	/* source register */
	__s16	off;		/* signed offset */
	__s32	imm;		/* signed immediate constant */
};

insn 的輔助 struct bpf_insn_aux_data:

/* Per-instruction metadata kept by the verifier alongside the program; the
 * entry for the current instruction is env->insn_aux_data[env->insn_idx].
 * The leading anonymous union is overlaid storage: which member is
 * meaningful depends on the kind of instruction.
 */
struct bpf_insn_aux_data {
	union {
		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
		unsigned long map_ptr_state;	/* pointer/poison value for maps */
		s32 call_imm;			/* saved imm field of call insn */
		u32 alu_limit;			/* limit for add/sub register with pointer */
		/* used by ld_imm64 map loads (read in check_ld_imm()) */
		struct {
			u32 map_index;		/* index into used_maps[] */
			u32 map_off;		/* offset from value base address */
		};
		struct {
			enum bpf_reg_type reg_type;	/* type of pseudo_btf_id */
			union {
				struct {
					struct btf *btf;
					u32 btf_id;	/* btf_id for struct typed var */
				};
				u32 mem_size;	/* mem_size for non-struct typed var */
			};
		} btf_var;
	};
	u64 map_key_state; /* constant (32 bit) key tracking for maps */
	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
	bool zext_dst; /* this insn zero extends dst reg */
	u8 alu_state; /* used in combination with alu_limit */

	/* below fields are initialized once */
	unsigned int orig_idx; /* original instruction index */
	bool prune_point;
};

state list 的 struct bpf_verifier_state_list:

/* linked list of verifier states used to prune search
 *
 * Each node embeds a full verifier state by value and links to the next
 * node through 'next'.
 */
struct bpf_verifier_state_list {
	struct bpf_verifier_state state;	/* the recorded state itself */
	struct bpf_verifier_state_list *next;	/* next node in the list */
	/* presumably counters of failed / successful matches against this
	 * state during pruning - confirm against upstream
	 */
	int miss_cnt, hit_cnt;
};

bpf_verifier_state 則是 (src 有提供一些 comment 幫助理解):

/* Maximum number of register states that can exist at once */
#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE)
#define MAX_CALL_FRAMES 8
/* Verifier state for one explored path; push_stack() copies it for every
 * pending branch.
 */
struct bpf_verifier_state {
	/* one function state per call frame; the active one is frame[curframe] */
	struct bpf_func_state *frame[MAX_CALL_FRAMES];
	struct bpf_verifier_state *parent;
	/* incremented on the parent by push_stack() for each branch it pushes */
	u32 branches;
	u32 insn_idx;
	u32 curframe;
	/* non-zero while a bpf_spin_lock is held; check_ld_abs() rejects that */
	u32 active_spin_lock;
	/* set when the state was pushed to simulate a speculative path */
	bool speculative;
	u32 first_insn_idx;
	u32 last_insn_idx;
	struct bpf_idx_pair *jmp_history;
	u32 jmp_history_cnt;
};

bpf_check() 下半段:

	...
	skip_full_check:
	kvfree(env->explored_states);

	if (ret == 0)
		ret = check_max_stack_depth(env);

	/* instruction rewrites happen after this point */
	if (is_priv) {
		if (ret == 0)
			opt_hard_wire_dead_code_branches(env);
		if (ret == 0)
			ret = opt_remove_dead_code(env);
		if (ret == 0)
			ret = opt_remove_nops(env);
	} else {
		if (ret == 0)
			sanitize_dead_code(env);
	}

	if (ret == 0)
		/* program is valid, convert *(u32*)(ctx + off) accesses */
		ret = convert_ctx_accesses(env);

	if (ret == 0)
		ret = do_misc_fixups(env);

	/* do 32-bit optimization after insn patching has done so those patched
	 * insns could be handled correctly.
	 */
	if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
		ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
		env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
								     : false;
	}

	if (ret == 0)
		ret = fixup_call_args(env);

	env->verification_time = ktime_get_ns() - start_time;
	print_verification_stats(env);

	if (log->level && bpf_verifier_log_full(log))
		ret = -ENOSPC;
	if (log->level && !log->ubuf) {
		ret = -EFAULT;
		goto err_release_maps;
	}

	if (ret)
		goto err_release_maps;

	if (env->used_map_cnt) {
		/* if program passed verifier, update used_maps in bpf_prog_info */
		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
							  sizeof(env->used_maps[0]),
							  GFP_KERNEL);

		if (!env->prog->aux->used_maps) {
			ret = -ENOMEM;
			goto err_release_maps;
		}

		memcpy(env->prog->aux->used_maps, env->used_maps,
		       sizeof(env->used_maps[0]) * env->used_map_cnt);
		env->prog->aux->used_map_cnt = env->used_map_cnt;
	}
	if (env->used_btf_cnt) {
		/* if program passed verifier, update used_btfs in bpf_prog_aux */
		env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
							  sizeof(env->used_btfs[0]),
							  GFP_KERNEL);
		if (!env->prog->aux->used_btfs) {
			ret = -ENOMEM;
			goto err_release_maps;
		}

		memcpy(env->prog->aux->used_btfs, env->used_btfs,
		       sizeof(env->used_btfs[0]) * env->used_btf_cnt);
		env->prog->aux->used_btf_cnt = env->used_btf_cnt;
	}
	if (env->used_map_cnt || env->used_btf_cnt) {
		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
		 * bpf_ld_imm64 instructions
		 */
		convert_pseudo_ld_imm64(env);
	}

	adjust_btf_func(env);

err_release_maps:
	if (!env->prog->aux->used_maps)
		/* if we didn't copy map pointers into bpf_prog_info, release
		 * them now. Otherwise free_used_maps() will release them.
		 */
		release_maps(env);
	if (!env->prog->aux->used_btfs)
		release_btfs(env);

	/* extension progs temporarily inherit the attach_type of their targets
	   for verification purposes, so set it back to zero before returning
	 */
	if (env->prog->type == BPF_PROG_TYPE_EXT)
		env->prog->expected_attach_type = 0;

	*prog = env->prog;
err_unlock:
	if (!is_priv)
		mutex_unlock(&bpf_verifier_lock);
	vfree(env->insn_aux_data);
err_free_env:
	kfree(env);
	return ret;
}