Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: optimize TCP and UDP checksum handling with eBPF #220

Merged
merged 5 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dataplane/ebpf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions dataplane/ebpf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ publish = false
[dependencies]
aya-ebpf = { git = "https://github.com/aya-rs/aya" }
aya-log-ebpf = { git = "https://github.com/aya-rs/aya" }
aya-ebpf-cty = { git = "https://github.com/aya-rs/aya" }
common = { path = "../common" }
memoffset = "0.9"
network-types = "0.0.5"
Expand Down
84 changes: 37 additions & 47 deletions dataplane/ebpf/src/ingress/tcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,22 @@ SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)

use core::mem;

use aya_ebpf::{
bindings::TC_ACT_OK,
helpers::{bpf_csum_diff, bpf_redirect_neigh},
programs::TcContext,
};
use aya_ebpf::{bindings::TC_ACT_OK, helpers::bpf_redirect_neigh, programs::TcContext};
use aya_log_ebpf::{debug, info};

use memoffset::offset_of;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, tcp::TcpHdr};

use crate::{
utils::{csum_fold_helper, ptr_at, update_tcp_conns},
utils::{ptr_at, set_ipv4_dest_port, set_ipv4_ip_dst, update_tcp_conns},
BACKENDS, GATEWAY_INDEXES, LB_CONNECTIONS,
};
use common::{
Backend, BackendKey, ClientKey, LoadBalancerMapping, TCPState, BACKENDS_ARRAY_CAPACITY,
};

const TCP_CSUM_OFF: u32 = (EthHdr::LEN + Ipv4Hdr::LEN + offset_of!(TcpHdr, check)) as u32;

pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
let ip_hdr: *mut Ipv4Hdr = unsafe { ptr_at(&ctx, EthHdr::LEN)? };

Expand All @@ -30,6 +30,7 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
let tcp_hdr: *mut TcpHdr = unsafe { ptr_at(&ctx, tcp_header_offset) }?;

let original_daddr = unsafe { (*ip_hdr).dst_addr };
let original_dport = unsafe { (*tcp_hdr).dest };

// The source identifier
let client_key = ClientKey {
Expand All @@ -56,7 +57,7 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {

backend_key = BackendKey {
ip: u32::from_be(original_daddr),
port: (u16::from_be(unsafe { (*tcp_hdr).dest })) as u32,
port: (u16::from_be(original_dport)) as u32,
};
let backend_list = unsafe { BACKENDS.get(&backend_key) }.ok_or(TC_ACT_OK)?;
let backend_index = unsafe { GATEWAY_INDEXES.get(&backend_key) }.ok_or(TC_ACT_OK)?;
Expand Down Expand Up @@ -104,33 +105,40 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
u16::from_be(unsafe { (*tcp_hdr).dest })
);

// DNAT the ip address
unsafe {
(*ip_hdr).dst_addr = backend.daddr.to_be();
}
// DNAT the port
unsafe { (*tcp_hdr).dest = (backend.dport as u16).to_be() };

if (ctx.data() + EthHdr::LEN + Ipv4Hdr::LEN) > ctx.data_end() {
info!(&ctx, "Iphdr is out of bounds");
return Ok(TC_ACT_OK);
}

// Calculate l3 cksum
// TODO(astoycos) use l3_cksum_replace instead
unsafe { (*ip_hdr).check = 0 };
let full_cksum = unsafe {
bpf_csum_diff(
mem::MaybeUninit::zeroed().assume_init(),
0,
ip_hdr as *mut u32,
Ipv4Hdr::LEN as u32,
0,
)
} as u64;
unsafe { (*ip_hdr).check = csum_fold_helper(full_cksum) };
// FIXME
unsafe { (*tcp_hdr).check = 0 };
let tcp_hdr_ref = unsafe { tcp_hdr.as_ref().ok_or(TC_ACT_OK)? };

// If the packet has the RST flag set, it means the connection is being terminated, so remove it
// from our map.
if tcp_hdr_ref.rst() == 1 {
unsafe {
LB_CONNECTIONS.remove(&client_key)?;
}
}

let mut lb_mapping = LoadBalancerMapping {
backend,
backend_key,
tcp_state,
};

update_tcp_conns(tcp_hdr_ref, &client_key, &mut lb_mapping)?;

let backend_ip = backend.daddr.to_be();
let ret = set_ipv4_ip_dst(&ctx, TCP_CSUM_OFF, &original_daddr, backend_ip);
if ret != 0 {
return Ok(TC_ACT_OK);
}

let backend_port = (backend.dport as u16).to_be();
let ret = set_ipv4_dest_port(&ctx, TCP_CSUM_OFF, &original_dport, backend_port);
if ret != 0 {
return Ok(TC_ACT_OK);
}

let action = unsafe {
bpf_redirect_neigh(
Expand All @@ -141,12 +149,6 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
)
};

let mut lb_mapping = LoadBalancerMapping {
backend,
backend_key,
tcp_state,
};

// If the connection is new, then record it in our map for future tracking.
if new_conn {
unsafe {
Expand All @@ -158,18 +160,6 @@ pub fn handle_tcp_ingress(ctx: TcContext) -> Result<i32, i64> {
return Ok(action as i32);
}

let tcp_hdr_ref = unsafe { tcp_hdr.as_ref().ok_or(TC_ACT_OK)? };

// If the packet has the RST flag set, it means the connection is being terminated, so remove it
// from our map.
if tcp_hdr_ref.rst() == 1 {
unsafe {
LB_CONNECTIONS.remove(&client_key)?;
}
}

update_tcp_conns(tcp_hdr_ref, &client_key, &mut lb_mapping)?;

info!(&ctx, "redirect action: {}", action);
Ok(action as i32)
}
38 changes: 17 additions & 21 deletions dataplane/ebpf/src/ingress/udp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@ SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)

use core::mem;

use aya_ebpf::{
bindings::TC_ACT_PIPE,
helpers::{bpf_csum_diff, bpf_redirect_neigh},
programs::TcContext,
};
use aya_ebpf::{bindings::TC_ACT_PIPE, helpers::bpf_redirect_neigh, programs::TcContext};
use aya_log_ebpf::{debug, info};

use memoffset::offset_of;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, udp::UdpHdr};

use crate::{
utils::{csum_fold_helper, ptr_at},
utils::{ptr_at, set_ipv4_dest_port, set_ipv4_ip_dst},
BACKENDS, GATEWAY_INDEXES, LB_CONNECTIONS,
};
use common::{BackendKey, ClientKey, LoadBalancerMapping, BACKENDS_ARRAY_CAPACITY};

const UDP_CSUM_OFF: u32 = (EthHdr::LEN + Ipv4Hdr::LEN + offset_of!(UdpHdr, check)) as u32;

pub fn handle_udp_ingress(ctx: TcContext) -> Result<i32, i64> {
let ip_hdr: *mut Ipv4Hdr = unsafe { ptr_at(&ctx, EthHdr::LEN)? };

Expand Down Expand Up @@ -95,21 +95,17 @@ pub fn handle_udp_ingress(ctx: TcContext) -> Result<i32, i64> {
return Ok(TC_ACT_PIPE);
}

// Calculate l3 cksum
// TODO(astoycos) use l3_cksum_replace instead
unsafe { (*ip_hdr).check = 0 };
let full_cksum = unsafe {
bpf_csum_diff(
mem::MaybeUninit::zeroed().assume_init(),
0,
ip_hdr as *mut u32,
Ipv4Hdr::LEN as u32,
0,
)
} as u64;
unsafe { (*ip_hdr).check = csum_fold_helper(full_cksum) };
// Kernel allows UDP packet with unset checksums
unsafe { (*udp_hdr).check = 0 };
let backend_ip = backend.daddr.to_be();
let ret = set_ipv4_ip_dst(&ctx, UDP_CSUM_OFF, &original_daddr, backend_ip);
if ret != 0 {
return Ok(TC_ACT_PIPE);
}

let backend_port = (backend.dport as u16).to_be();
let ret = set_ipv4_dest_port(&ctx, UDP_CSUM_OFF, &original_dport, backend_port);
if ret != 0 {
return Ok(TC_ACT_PIPE);
}

let action = unsafe {
bpf_redirect_neigh(
Expand Down
121 changes: 119 additions & 2 deletions dataplane/ebpf/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,25 @@ Copyright 2023 The Kubernetes Authors.
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
*/

use aya_ebpf::{bindings::TC_ACT_OK, programs::TcContext};
use aya_ebpf::{
bindings::TC_ACT_OK,
helpers::{bpf_l3_csum_replace, bpf_l4_csum_replace, bpf_skb_store_bytes},
programs::TcContext,
};
use aya_ebpf_cty::{c_long, c_void};
use aya_log_ebpf::info;
use core::mem;
use network_types::tcp::TcpHdr;
use network_types::{eth::EthHdr, ip::Ipv4Hdr, tcp::TcpHdr};

use crate::LB_CONNECTIONS;
use common::{ClientKey, LoadBalancerMapping, TCPState};

use memoffset::offset_of;

const IP_CSUM_OFF: u32 = (EthHdr::LEN + offset_of!(Ipv4Hdr, check)) as u32;
const IP_DST_OFF: u32 = (EthHdr::LEN + offset_of!(Ipv4Hdr, dst_addr)) as u32;
const IS_PSEUDO: u64 = 0x10;

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -123,3 +135,108 @@ pub fn update_tcp_conns(
}
Ok(())
}

// inspired by https://github.com/torvalds/linux/blob/master/samples/bpf/tcbpf1_kern.c
// update dst_addr in the ip_hdr
// recalculate the checksums
pub fn set_ipv4_ip_dst(ctx: &TcContext, l4_csum_offset: u32, old_ip: &u32, new_dip: u32) -> c_long {
let mut ret: c_long;
unsafe {
ret = bpf_l4_csum_replace(
ctx.skb.skb,
l4_csum_offset,
*old_ip as u64,
new_dip as u64,
IS_PSEUDO | (mem::size_of_val(&new_dip) as u64),
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the TCP checksum after modifying the destination IP"
);
return ret;
}

unsafe {
ret = bpf_l3_csum_replace(
ctx.skb.skb,
IP_CSUM_OFF,
*old_ip as u64,
new_dip as u64,
mem::size_of_val(&new_dip) as u64,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the IP header checksum after modifying the destination IP"
);
return ret;
}

unsafe {
ret = bpf_skb_store_bytes(
ctx.skb.skb,
IP_DST_OFF,
&new_dip as *const u32 as *const c_void,
mem::size_of_val(&new_dip) as u32,
0,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the destination IP address in the packet header"
);
return ret;
}

ret
}

// update destination port in the tcp_hdr
// recalculate the checksums
pub fn set_ipv4_dest_port(
ctx: &TcContext,
l4_csum_offset: u32,
old_port: &u16,
new_port: u16,
) -> c_long {
let mut ret: c_long;
unsafe {
ret = bpf_l4_csum_replace(
ctx.skb.skb,
l4_csum_offset,
*old_port as u64,
new_port as u64,
mem::size_of_val(&new_port) as u64,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the TCP checksum after modifying the destination port"
);
return ret;
}

unsafe {
ret = bpf_skb_store_bytes(
ctx.skb.skb,
l4_csum_offset,
&new_port as *const u16 as *const c_void,
mem::size_of_val(&new_port) as u32,
0,
);
}
if ret != 0 {
info!(
ctx,
"Failed to update the destination port in the packet header"
);
return ret;
}

ret
}
Loading