Skip to content

Commit

Permalink
refactor: optimize TCP and UDP checksum handling with eBPF (#220)
Browse files Browse the repository at this point in the history
udp and tcp ingress:
1. use `bpf_l4_csum_replace` `bpf_l3_csum_replace` to recalculate the
checksums;
2. and `bpf_skb_store_bytes` to update dest ip and port;

Signed-off-by: tzssangglass <[email protected]>
  • Loading branch information
k8s-ci-robot authored May 30, 2024
2 parents 29d1e6c + d69044a commit 4b8278c
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 70 deletions.
1 change: 1 addition & 0 deletions dataplane/ebpf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions dataplane/ebpf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ publish = false
[dependencies]
aya-ebpf = { git = "https://github.com/aya-rs/aya" }
aya-log-ebpf = { git = "https://github.com/aya-rs/aya" }
aya-ebpf-cty = { git = "https://github.com/aya-rs/aya" }
common = { path = "../common" }
memoffset = "0.9"
network-types = "0.0.5"
Expand Down
84 changes: 37 additions & 47 deletions dataplane/ebpf/src/ingress/tcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,22 @@ SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)

use core::mem;

use aya_ebpf::{
bindings::TC_ACT_OK,
helpers::{bpf_csum_diff, bpf_redirect_neigh},
programs::TcContext,
};
use aya_ebpf::{bindings::TC_ACT_OK, helpers::bpf_redirect_neigh, programs::TcContext};
use aya_log_ebpf::{debug, info};

use memoffset::offset_of;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, tcp::TcpHdr};

use crate::{
utils::{csum_fold_helper, ptr_at, update_tcp_conns},
utils::{ptr_at, set_ipv4_dest_port, set_ipv4_ip_dst, update_tcp_conns},
BACKENDS, GATEWAY_INDEXES, LB_CONNECTIONS,
};
use common::{
Backend, BackendKey, ClientKey, LoadBalancerMapping, TCPState, BACKENDS_ARRAY_CAPACITY,
};

const TCP_CSUM_OFF: u32 = (EthHdr::LEN + Ipv4Hdr::LEN + offset_of!(TcpHdr, check)) as u32;

pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
let ip_hdr: *mut Ipv4Hdr = unsafe { ptr_at(&ctx, EthHdr::LEN)? };

Expand All @@ -30,6 +30,7 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
let tcp_hdr: *mut TcpHdr = unsafe { ptr_at(&ctx, tcp_header_offset) }?;

let original_daddr = unsafe { (*ip_hdr).dst_addr };
let original_dport = unsafe { (*tcp_hdr).dest };

// The source identifier
let client_key = ClientKey {
Expand All @@ -56,7 +57,7 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {

backend_key = BackendKey {
ip: u32::from_be(original_daddr),
port: (u16::from_be(unsafe { (*tcp_hdr).dest })) as u32,
port: (u16::from_be(original_dport)) as u32,
};
let backend_list = unsafe { BACKENDS.get(&backend_key) }.ok_or(TC_ACT_OK)?;
let backend_index = unsafe { GATEWAY_INDEXES.get(&backend_key) }.ok_or(TC_ACT_OK)?;
Expand Down Expand Up @@ -104,33 +105,40 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
u16::from_be(unsafe { (*tcp_hdr).dest })
);

// DNAT the ip address
unsafe {
(*ip_hdr).dst_addr = backend.daddr.to_be();
}
// DNAT the port
unsafe { (*tcp_hdr).dest = (backend.dport as u16).to_be() };

if (ctx.data() + EthHdr::LEN + Ipv4Hdr::LEN) > ctx.data_end() {
info!(&ctx, "Iphdr is out of bounds");
return Ok(TC_ACT_OK);
}

// Calculate l3 cksum
// TODO(astoycos) use l3_cksum_replace instead
unsafe { (*ip_hdr).check = 0 };
let full_cksum = unsafe {
bpf_csum_diff(
mem::MaybeUninit::zeroed().assume_init(),
0,
ip_hdr as *mut u32,
Ipv4Hdr::LEN as u32,
0,
)
} as u64;
unsafe { (*ip_hdr).check = csum_fold_helper(full_cksum) };
// FIXME
unsafe { (*tcp_hdr).check = 0 };
let tcp_hdr_ref = unsafe { tcp_hdr.as_ref().ok_or(TC_ACT_OK)? };

// If the packet has the RST flag set, it means the connection is being terminated, so remove it
// from our map.
if tcp_hdr_ref.rst() == 1 {
unsafe {
LB_CONNECTIONS.remove(&client_key)?;
}
}

let mut lb_mapping = LoadBalancerMapping {
backend,
backend_key,
tcp_state,
};

update_tcp_conns(tcp_hdr_ref, &client_key, &mut lb_mapping)?;

let backend_ip = backend.daddr.to_be();
let ret = set_ipv4_ip_dst(&ctx, TCP_CSUM_OFF, &original_daddr, backend_ip);
if ret != 0 {
return Ok(TC_ACT_OK);
}

let backend_port = (backend.dport as u16).to_be();
let ret = set_ipv4_dest_port(&ctx, TCP_CSUM_OFF, &original_dport, backend_port);
if ret != 0 {
return Ok(TC_ACT_OK);
}

let action = unsafe {
bpf_redirect_neigh(
Expand All @@ -141,12 +149,6 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
)
};

let mut lb_mapping = LoadBalancerMapping {
backend,
backend_key,
tcp_state,
};

// If the connection is new, then record it in our map for future tracking.
if new_conn {
unsafe {
Expand All @@ -158,18 +160,6 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
return Ok(action as i32);
}

let tcp_hdr_ref = unsafe { tcp_hdr.as_ref().ok_or(TC_ACT_OK)? };

// If the packet has the RST flag set, it means the connection is being terminated, so remove it
// from our map.
if tcp_hdr_ref.rst() == 1 {
unsafe {
LB_CONNECTIONS.remove(&client_key)?;
}
}

update_tcp_conns(tcp_hdr_ref, &client_key, &mut lb_mapping)?;

info!(&ctx, "redirect action: {}", action);
Ok(action as i32)
}
38 changes: 17 additions & 21 deletions dataplane/ebpf/src/ingress/udp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@ SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)

use core::mem;

use aya_ebpf::{
bindings::TC_ACT_PIPE,
helpers::{bpf_csum_diff, bpf_redirect_neigh},
programs::TcContext,
};
use aya_ebpf::{bindings::TC_ACT_PIPE, helpers::bpf_redirect_neigh, programs::TcContext};
use aya_log_ebpf::{debug, info};

use memoffset::offset_of;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, udp::UdpHdr};

use crate::{
utils::{csum_fold_helper, ptr_at},
utils::{ptr_at, set_ipv4_dest_port, set_ipv4_ip_dst},
BACKENDS, GATEWAY_INDEXES, LB_CONNECTIONS,
};
use common::{BackendKey, ClientKey, LoadBalancerMapping, BACKENDS_ARRAY_CAPACITY};

const UDP_CSUM_OFF: u32 = (EthHdr::LEN + Ipv4Hdr::LEN + offset_of!(UdpHdr, check)) as u32;

pub fn handle_udp_ingress(ctx: TcContext) -> Result<i32, i64> {
let ip_hdr: *mut Ipv4Hdr = unsafe { ptr_at(&ctx, EthHdr::LEN)? };

Expand Down Expand Up @@ -95,21 +95,17 @@ pub fn handle_udp_ingress(ctx: TcContext) -> Result<i32, i64> {
return Ok(TC_ACT_PIPE);
}

// Calculate l3 cksum
// TODO(astoycos) use l3_cksum_replace instead
unsafe { (*ip_hdr).check = 0 };
let full_cksum = unsafe {
bpf_csum_diff(
mem::MaybeUninit::zeroed().assume_init(),
0,
ip_hdr as *mut u32,
Ipv4Hdr::LEN as u32,
0,
)
} as u64;
unsafe { (*ip_hdr).check = csum_fold_helper(full_cksum) };
// Kernel allows UDP packet with unset checksums
unsafe { (*udp_hdr).check = 0 };
let backend_ip = backend.daddr.to_be();
let ret = set_ipv4_ip_dst(&ctx, UDP_CSUM_OFF, &original_daddr, backend_ip);
if ret != 0 {
return Ok(TC_ACT_PIPE);
}

let backend_port = (backend.dport as u16).to_be();
let ret = set_ipv4_dest_port(&ctx, UDP_CSUM_OFF, &original_dport, backend_port);
if ret != 0 {
return Ok(TC_ACT_PIPE);
}

let action = unsafe {
bpf_redirect_neigh(
Expand Down
121 changes: 119 additions & 2 deletions dataplane/ebpf/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,25 @@ Copyright 2023 The Kubernetes Authors.
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
*/

use aya_ebpf::{bindings::TC_ACT_OK, programs::TcContext};
use aya_ebpf::{
bindings::TC_ACT_OK,
helpers::{bpf_l3_csum_replace, bpf_l4_csum_replace, bpf_skb_store_bytes},
programs::TcContext,
};
use aya_ebpf_cty::{c_long, c_void};
use aya_log_ebpf::info;
use core::mem;
use network_types::tcp::TcpHdr;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, tcp::TcpHdr};

use crate::LB_CONNECTIONS;
use common::{ClientKey, LoadBalancerMapping, TCPState};

use memoffset::offset_of;

const IP_CSUM_OFF: u32 = (EthHdr::LEN + offset_of!(Ipv4Hdr, check)) as u32;
const IP_DST_OFF: u32 = (EthHdr::LEN + offset_of!(Ipv4Hdr, dst_addr)) as u32;
const IS_PSEUDO: u64 = 0x10;

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -123,3 +135,108 @@ pub fn update_tcp_conns(
}
Ok(())
}

// inspired by https://github.com/torvalds/linux/blob/master/samples/bpf/tcbpf1_kern.c
// update dst_addr in the ip_hdr
// recalculate the checksums
pub fn set_ipv4_ip_dst(ctx: &TcContext, l4_csum_offset: u32, old_ip: &u32, new_dip: u32) -> c_long {
let mut ret: c_long;
unsafe {
ret = bpf_l4_csum_replace(
ctx.skb.skb,
l4_csum_offset,
*old_ip as u64,
new_dip as u64,
IS_PSEUDO | (mem::size_of_val(&new_dip) as u64),
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the TCP checksum after modifying the destination IP"
);
return ret;
}

unsafe {
ret = bpf_l3_csum_replace(
ctx.skb.skb,
IP_CSUM_OFF,
*old_ip as u64,
new_dip as u64,
mem::size_of_val(&new_dip) as u64,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the IP header checksum after modifying the destination IP"
);
return ret;
}

unsafe {
ret = bpf_skb_store_bytes(
ctx.skb.skb,
IP_DST_OFF,
&new_dip as *const u32 as *const c_void,
mem::size_of_val(&new_dip) as u32,
0,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the destination IP address in the packet header"
);
return ret;
}

ret
}

// update destination port in the tcp_hdr
// recalculate the checksums
pub fn set_ipv4_dest_port(
ctx: &TcContext,
l4_csum_offset: u32,
old_port: &u16,
new_port: u16,
) -> c_long {
let mut ret: c_long;
unsafe {
ret = bpf_l4_csum_replace(
ctx.skb.skb,
l4_csum_offset,
*old_port as u64,
new_port as u64,
mem::size_of_val(&new_port) as u64,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the TCP checksum after modifying the destination port"
);
return ret;
}

unsafe {
ret = bpf_skb_store_bytes(
ctx.skb.skb,
l4_csum_offset,
&new_port as *const u16 as *const c_void,
mem::size_of_val(&new_port) as u32,
0,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the destination port in the packet header"
);
return ret;
}

ret
}

0 comments on commit 4b8278c

Please sign in to comment.