Skip to content

Commit

Permalink
bpf: BPF for lightweight tunnel infrastructure
Browse files Browse the repository at this point in the history
Registers new BPF program types which correspond to the LWT hooks:
  - BPF_PROG_TYPE_LWT_IN   => dst_input()
  - BPF_PROG_TYPE_LWT_OUT  => dst_output()
  - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the
capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and
   may only read the data of an skb. This prevent modification and
   possible invalidation of already validated packet headers on receive
   and the construction of illegal headers while the IP headers are
   still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet
   content as well as prepending an L2 header via a newly introduced
   helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is
   invoked after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

Signed-off-by: Thomas Graf <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: Daniel Borkmann <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
tgraf authored and davem330 committed Dec 2, 2016
1 parent efd8570 commit 3a0af8f
Show file tree
Hide file tree
Showing 9 changed files with 646 additions and 5 deletions.
2 changes: 1 addition & 1 deletion include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ struct xdp_buff {
};

/* compute the linear packet data range [data, data_end) which
* will be accessed by cls_bpf and act_bpf programs
* will be accessed by cls_bpf, act_bpf and lwt programs
*/
static inline void bpf_compute_data_end(struct sk_buff *skb)
{
Expand Down
32 changes: 31 additions & 1 deletion include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ enum bpf_prog_type {
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
BPF_PROG_TYPE_CGROUP_SKB,
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
};

enum bpf_attach_type {
Expand Down Expand Up @@ -409,6 +412,16 @@ union bpf_attr {
*
* int bpf_get_numa_node_id()
* Return: Id of current NUMA node.
*
* int bpf_skb_change_head()
* Grows headroom of skb and adjusts MAC header offset accordingly.
* Will extends/reallocae as required automatically.
* May change skb data pointer and will thus invalidate any check
* performed for direct packet access.
* @skb: pointer to skb
* @len: length of header to be pushed in front
* @flags: Flags (unused for now)
* Return: 0 on success or negative error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
Expand Down Expand Up @@ -453,7 +466,8 @@ union bpf_attr {
FN(skb_pull_data), \
FN(csum_update), \
FN(set_hash_invalid), \
FN(get_numa_node_id),
FN(get_numa_node_id), \
FN(skb_change_head),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
Expand Down Expand Up @@ -537,6 +551,22 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};

/* Generic BPF return codes which all BPF program types may support.
* The values are binary compatible with their TC_ACT_* counter-part to
* provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
* programs.
*
* XDP is handled seprately, see XDP_*.
*/
enum bpf_ret_code {
BPF_OK = 0,
/* 1 reserved */
BPF_DROP = 2,
/* 3-6 reserved */
BPF_REDIRECT = 7,
/* >127 are reserved for prog type specific return codes */
};

/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
Expand Down
23 changes: 23 additions & 0 deletions include/uapi/linux/lwtunnel.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
LWTUNNEL_ENCAP_ILA,
LWTUNNEL_ENCAP_IP6,
LWTUNNEL_ENCAP_SEG6,
LWTUNNEL_ENCAP_BPF,
__LWTUNNEL_ENCAP_MAX,
};

Expand Down Expand Up @@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {

#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)

enum {
LWT_BPF_PROG_UNSPEC,
LWT_BPF_PROG_FD,
LWT_BPF_PROG_NAME,
__LWT_BPF_PROG_MAX,
};

#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)

enum {
LWT_BPF_UNSPEC,
LWT_BPF_IN,
LWT_BPF_OUT,
LWT_BPF_XMIT,
LWT_BPF_XMIT_HEADROOM,
__LWT_BPF_MAX,
};

#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)

#define LWT_BPF_MAX_HEADROOM 256

#endif /* _UAPI_LWTUNNEL_H_ */
14 changes: 11 additions & 3 deletions kernel/bpf/verifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
#define MAX_PACKET_OFF 0xffff

static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta)
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
{
switch (env->prog->type) {
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_XMIT:
if (meta)
return meta->pkt_access;

Expand Down Expand Up @@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
err = check_stack_read(state, off, size, value_regno);
}
} else if (state->regs[regno].type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
}
Expand Down Expand Up @@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}

if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
if (type == PTR_TO_PACKET &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose("helper access to the packet is not allowed\n");
return -EACCES;
}
Expand Down
8 changes: 8 additions & 0 deletions net/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,14 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.

config LWTUNNEL_BPF
bool "Execute BPF program as route nexthop action"
depends on LWTUNNEL
default y if LWTUNNEL=y
---help---
Allows to run BPF programs as a nexthop action following a route
lookup for incoming and outgoing packets.

config DST_CACHE
bool
default n
Expand Down
1 change: 1 addition & 0 deletions net/core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
173 changes: 173 additions & 0 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
/* Verify that a link layer header is carried */
if (unlikely(skb->mac_header >= skb->network_header)) {
kfree_skb(skb);
return -ERANGE;
}

bpf_push_mac_rcsum(skb);
return flags & BPF_F_INGRESS ?
__bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
Expand Down Expand Up @@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};

BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
u32 max_len = __bpf_skb_max_len(skb);
u32 new_len = skb->len + head_room;
int ret;

if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
new_len < skb->len))
return -EINVAL;

ret = skb_cow(skb, head_room);
if (likely(!ret)) {
/* Idea for this helper is that we currently only
* allow to expand on mac header. This means that
* skb->protocol network header, etc, stay as is.
* Compared to bpf_skb_change_tail(), we're more
* flexible due to not needing to linearize or
* reset GSO. Intention for this helper is to be
* used by an L3 skb that needs to push mac header
* for redirection into L2 device.
*/
__skb_push(skb, head_room);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
}

bpf_compute_data_end(skb);
return 0;
}

static const struct bpf_func_proto bpf_skb_change_head_proto = {
.func = bpf_skb_change_head,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};

bool bpf_helper_changes_skb_data(void *func)
{
if (func == bpf_skb_vlan_push ||
func == bpf_skb_vlan_pop ||
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
func == bpf_skb_change_head ||
func == bpf_skb_change_tail ||
func == bpf_skb_pull_data ||
func == bpf_l3_csum_replace ||
Expand Down Expand Up @@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
}
}

static const struct bpf_func_proto *
lwt_inout_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
case BPF_FUNC_skb_pull_data:
return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
return &bpf_csum_diff_proto;
case BPF_FUNC_get_cgroup_classid:
return &bpf_get_cgroup_classid_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
return &bpf_get_hash_recalc_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
return sk_filter_func_proto(func_id);
}
}

static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_skb_get_tunnel_opt:
return &bpf_skb_get_tunnel_opt_proto;
case BPF_FUNC_skb_set_tunnel_opt:
return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
case BPF_FUNC_clone_redirect:
return &bpf_clone_redirect_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &bpf_skb_change_head_proto;
case BPF_FUNC_skb_store_bytes:
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
default:
return lwt_inout_func_proto(func_id);
}
}

static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
Expand Down Expand Up @@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size,
return __is_valid_access(off, size, type);
}

static bool lwt_is_valid_access(int off, int size,
enum bpf_access_type type,
enum bpf_reg_type *reg_type)
{
switch (off) {
case offsetof(struct __sk_buff, tc_classid):
return false;
}

if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, mark):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
offsetof(struct __sk_buff, cb[4]):
break;
default:
return false;
}
}

switch (off) {
case offsetof(struct __sk_buff, data):
*reg_type = PTR_TO_PACKET;
break;
case offsetof(struct __sk_buff, data_end):
*reg_type = PTR_TO_PACKET_END;
break;
}

return __is_valid_access(off, size, type);
}

static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog)
{
Expand Down Expand Up @@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = {
.convert_ctx_access = sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops lwt_inout_ops = {
.get_func_proto = lwt_inout_func_proto,
.is_valid_access = lwt_is_valid_access,
.convert_ctx_access = sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops lwt_xmit_ops = {
.get_func_proto = lwt_xmit_func_proto,
.is_valid_access = lwt_is_valid_access,
.convert_ctx_access = sk_filter_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
Expand All @@ -3032,13 +3187,31 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
.type = BPF_PROG_TYPE_CGROUP_SKB,
};

static struct bpf_prog_type_list lwt_in_type __read_mostly = {
.ops = &lwt_inout_ops,
.type = BPF_PROG_TYPE_LWT_IN,
};

static struct bpf_prog_type_list lwt_out_type __read_mostly = {
.ops = &lwt_inout_ops,
.type = BPF_PROG_TYPE_LWT_OUT,
};

static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
.ops = &lwt_xmit_ops,
.type = BPF_PROG_TYPE_LWT_XMIT,
};

static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
bpf_register_prog_type(&cg_skb_type);
bpf_register_prog_type(&lwt_in_type);
bpf_register_prog_type(&lwt_out_type);
bpf_register_prog_type(&lwt_xmit_type);

return 0;
}
Expand Down
Loading

0 comments on commit 3a0af8f

Please sign in to comment.