Skip to content

Commit

Permalink
Redesign mapping storage and search
Browse files Browse the repository at this point in the history
Rather than keeping a large-ish fixed-size array for every single
process, use the longest-prefix match that the BPF trie data structure
provides. This has several benefits:

- Searching is no longer linear in the number of mappings;
- The max number of mappings is lifted, and the total pool of mapping
  entries is now shared among processes, so processes that have few
  mappings will use few entries, while processes with lots of mappings
  can have more entries than the previous per-process maximum, allowing
  lightswitch to profile more applications;

Future work
===========

There are no counters in BPF or userspace in case of errors. There are a
bunch of areas in the code that can panic, such as map operations that
will fail if the maps are full.

Test Plan
=========

Added unit and integration tests, as well as ran this commit for several
hours with many different processes without issues.
  • Loading branch information
javierhonduco committed Apr 15, 2024
1 parent d44df1e commit 0a13d07
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 79 deletions.
94 changes: 42 additions & 52 deletions src/bpf/profiler.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

// Executable memory mappings for every profiled process, stored in a
// single longest-prefix-match trie shared across all processes. The key
// is (pid, address) in big-endian (see struct exec_mappings_key), so a
// lookup with a full-length prefix finds the mapping covering a given
// program counter. NOTE(review): LPM tries require BPF_F_NO_PREALLOC.
// Capacity is a shared pool of MAX_PROCESSES * 200 entries rather than
// a fixed per-process array.
struct {
__uint(type, BPF_MAP_TYPE_LPM_TRIE);
__type(key, struct exec_mappings_key);
__type(value, mapping_t);
__uint(map_flags, BPF_F_NO_PREALLOC);
__uint(max_entries, MAX_PROCESSES * 200);
} exec_mappings SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_HASH);
Expand Down Expand Up @@ -140,7 +147,7 @@ static void bump_samples() {

// Binary search the unwind table to find the row index containing the unwind
// information for a given program counter (pc).
static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left,
static __always_inline u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left,
u64 right) {
u64 found = BINARY_SEARCH_DEFAULT;

Expand Down Expand Up @@ -181,11 +188,8 @@ static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left,

enum find_unwind_table_return {
FIND_UNWIND_SUCCESS = 1,

FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN = 2,
FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH = 3,
FIND_UNWIND_MAPPING_NOT_FOUND = 4,
FIND_UNWIND_CHUNK_NOT_FOUND = 5,
FIND_UNWIND_MAPPING_NOT_FOUND = 2,
FIND_UNWIND_CHUNK_NOT_FOUND = 3,

FIND_UNWIND_JITTED = 100,
FIND_UNWIND_SPECIAL = 200,
Expand All @@ -196,60 +200,42 @@ enum find_unwind_table_return {
// address.
static __always_inline enum find_unwind_table_return
find_unwind_table(chunk_info_t **chunk_info, pid_t per_process_id, u64 pc, u64 *offset) {
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &per_process_id);
// Appease the verifier.
if (proc_info == NULL) {
LOG("[error] should never happen proc_info");
return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN;
}

bool found = false;
u32 executable_id = 0;
u32 type = 0;
u64 load_address = 0;

// Find the mapping.
for (u32 i = 0; i < MAX_MAPPINGS_PER_PROCESS; i++) {
if (i > proc_info->len) {
LOG("[info] mapping not found, i (%d) > proc_info->len (%d) pc: %llx", i,
proc_info->len, pc);
return FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH;
}

// Appease the verifier.
if (i < 0 || i > MAX_MAPPINGS_PER_PROCESS) {
LOG("[error] should never happen, verifier");
return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN;
}
struct exec_mappings_key key = {};
key.prefix_len = PREFIX_LEN;
key.pid = __builtin_bswap32((u32) per_process_id);
key.data = __builtin_bswap64(pc);

if (proc_info->mappings[i].begin <= pc &&
pc <= proc_info->mappings[i].end) {
found = true;
executable_id = proc_info->mappings[i].executable_id;
load_address = proc_info->mappings[i].load_address;
bpf_printk("==== found load address %llx", load_address);
mapping_t *mapping = bpf_map_lookup_elem(&exec_mappings, &key);

type = proc_info->mappings[i].type;
break;
}
if (mapping == NULL) {
LOG("[error] :((( no mapping for ip=%llx", pc);
return FIND_UNWIND_MAPPING_NOT_FOUND;
}
if (pc < mapping->begin || pc >= mapping->end) {
bpf_printk("[error] mapping not found %llx %llx %llx %d %d", mapping->begin, pc, mapping->end, pc >= mapping->begin, pc < mapping->end);
return FIND_UNWIND_MAPPING_NOT_FOUND;
}

if (found) {
if (offset != NULL) {
*offset = load_address;
}
executable_id = mapping->executable_id;
load_address = mapping->load_address;
bpf_printk("==== found load address %llx", load_address);
type = mapping->type;

// "type" here is set in userspace in our `proc_info` map to indicate JITed
// and special sections, It is not something we get from procfs.
if (type == 1) {
return FIND_UNWIND_JITTED;
}
if (type == 2) {
return FIND_UNWIND_SPECIAL;
}
} else {
LOG("[warn] :((( no mapping for ip=%llx", pc);
return FIND_UNWIND_MAPPING_NOT_FOUND;
if (offset != NULL) {
*offset = load_address;
}

// "type" here is set in userspace in our `proc_info` map to indicate JITed
// and special sections, It is not something we get from procfs.
if (type == 1) {
return FIND_UNWIND_JITTED;
}
if (type == 2) {
return FIND_UNWIND_SPECIAL;
}

LOG("~about to check chunks, executable_id=%d", executable_id);
Expand Down Expand Up @@ -504,6 +490,10 @@ int dwarf_unwind(struct bpf_perf_event_data *ctx) {
break;
}

if (found_cfa_type == CFA_TYPE_OFFSET_DID_NOT_FIT) {
return 1;
}

if (found_rbp_type == RBP_TYPE_UNDEFINED_RETURN_ADDRESS) {
LOG("[info] null return address, end of stack", unwind_state->ip);
reached_bottom_of_stack = true;
Expand Down Expand Up @@ -761,7 +751,7 @@ int on_event(struct bpf_perf_event_data *ctx) {
process_info_t *proc_info =
bpf_map_lookup_elem(&process_info, &per_process_id);
if (proc_info) {
bpf_printk("== unwindin with per_process_id: %d", per_process_id);
bpf_printk("== unwinding with per_process_id: %d", per_process_id);
bump_samples();

u64 ip = 0;
Expand Down
16 changes: 14 additions & 2 deletions src/bpf/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ _Static_assert(1 << MAX_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE,
#define CFA_TYPE_EXPRESSION 3
// Special values.
#define CFA_TYPE_END_OF_FDE_MARKER 4
#define CFA_TYPE_OFFSET_DID_NOT_FIT 5

// Values for the unwind table's frame pointer type.
#define RBP_TYPE_UNCHANGED 0
Expand Down Expand Up @@ -132,11 +133,22 @@ typedef struct {
u64 end;
} mapping_t;

// Key for the longest-prefix-match trie (exec_mappings). Mirrors the
// kernel's struct bpf_lpm_trie_key layout: a prefix length in bits,
// followed by the data to match on. Here the data is the process id
// and a virtual address, both stored big-endian so that prefix
// matching walks from most- to least-significant bits.
struct exec_mappings_key {
  u32 prefix_len; // Number of significant bits in (pid, data), set per lookup/insert.
  u32 pid;        // Process id, big-endian.
  u64 data;       // Virtual address, big-endian.
};

// Full prefix size in bits, excluding the prefix_len field itself.
// Fully parenthesized and with no trailing semicolon so it is safe to
// use in any expression context (the previous definition ended in ';').
#define PREFIX_LEN ((sizeof(struct exec_mappings_key) - sizeof(u32)) * 8)

// Per-process metadata. The per-process fixed-size mappings array that
// used to live here has been replaced by the global exec_mappings LPM
// trie keyed by (pid, address); the commented-out remnants are removed.
typedef struct {
  // Non-zero when the process runs a JIT compiler, in which case some
  // mappings are not backed by unwind information from an ELF file.
  u32 is_jit_compiler;
} process_info_t;

// A row in the stack unwinding table for x86_64.
Expand Down
12 changes: 12 additions & 0 deletions src/bpf/profiler_bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@ unsafe impl Plain for Event {}
unsafe impl Plain for process_info_t {}
unsafe impl Plain for unwind_info_chunks_t {}
unsafe impl Plain for unwinder_stats_t {}
unsafe impl Plain for exec_mappings_key {}
unsafe impl Plain for mapping_t {}

impl exec_mappings_key {
    /// Builds a key for the `exec_mappings` LPM trie.
    ///
    /// * `pid` - process id; always contributes 32 bits to the prefix.
    /// * `address` - virtual address to match.
    /// * `prefix` - number of significant bits of `address`.
    ///
    /// The kernel's LPM trie matches big-endian data from the most
    /// significant bit, so both fields are byte-swapped to network
    /// order before insertion.
    pub fn new(pid: u32, address: u64, prefix: u32) -> Self {
        let pid_be = u32::to_be(pid);
        let address_be = u64::to_be(address);
        Self {
            prefix_len: prefix + 32,
            pid: pid_be,
            data: address_be,
        }
    }
}

impl Add for unwinder_stats_t {
type Output = Self;
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ pub mod profile;
pub mod profiler;
pub mod unwind_info;
pub mod usym;
pub mod util;
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ fn main() -> Result<(), Box<dyn Error>> {

let collector = Collector::new();

let mut p: Profiler<'_> = Profiler::new(false, args.duration, args.sample_freq);
let mut p: Profiler<'_> = Profiler::new(true, args.duration, args.sample_freq);
p.profile_pids(args.pids);
p.run(collector.clone());

Expand Down
47 changes: 23 additions & 24 deletions src/profiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use crate::collector::*;
use crate::object::{build_id, elf_load, is_dynamic, is_go, BuildId};
use crate::perf_events::setup_perf_event;
use crate::unwind_info::{in_memory_unwind_info, remove_redundant, remove_unnecesary_markers};
use crate::util::summarize_address_range;

// Some temporary data structures to get things going, this could use lots of
// improvements
Expand Down Expand Up @@ -471,7 +472,6 @@ impl Profiler<'_> {

// Local unwind info state
let mut mappings = Vec::with_capacity(MAX_MAPPINGS_PER_PROCESS as usize);
let mut num_mappings: u32 = 0;

// hack for kworkers and such
let mut got_some_unwind_info: bool = true;
Expand Down Expand Up @@ -501,7 +501,6 @@ impl Profiler<'_> {
executable_id: 0,
type_: 1, // jitted
});
num_mappings += 1;
continue;
}
MappingType::Vdso => {
Expand All @@ -512,7 +511,6 @@ impl Profiler<'_> {
executable_id: 0,
type_: 2, // vdso
});
num_mappings += 1;
continue;
}
MappingType::FileBacked => {
Expand Down Expand Up @@ -559,7 +557,6 @@ impl Profiler<'_> {
executable_id: *executable_id,
type_: 0, // normal i think
});
num_mappings += 1;
debug!("unwind info CACHED for executable");
continue;
}
Expand All @@ -578,7 +575,6 @@ impl Profiler<'_> {
executable_id: self.native_unwind_state.build_id_to_executable_id.len() as u32,
type_: 0, // normal i think
});
num_mappings += 1;

let build_id = mapping.build_id.clone().unwrap();
// This is not released (see note "deadlock")
Expand Down Expand Up @@ -764,25 +760,28 @@ impl Profiler<'_> {
self.native_unwind_state.dirty = true;

// Add process info
// "default"
mappings.resize(
MAX_MAPPINGS_PER_PROCESS as usize,
mapping_t {
load_address: 0,
begin: 0,
end: 0,
executable_id: 0,
type_: 0,
},
);
let boxed_slice = mappings.into_boxed_slice();
let boxed_array: Box<[mapping_t; MAX_MAPPINGS_PER_PROCESS as usize]> =
boxed_slice.try_into().expect("try into");
let proc_info = process_info_t {
is_jit_compiler: 0,
len: num_mappings,
mappings: *boxed_array, // check this doesn't go into the stack!!
};
for mapping in mappings {
for address_range in summarize_address_range(mapping.begin, mapping.end - 1) {
let key = exec_mappings_key::new(
pid.try_into().unwrap(),
address_range.addr,
address_range.range,
);
let value = mapping;

self.bpf
.maps()
.exec_mappings()
.update(
unsafe { plain::as_bytes(&key) },
unsafe { plain::as_bytes(&value) },
MapFlags::ANY,
)
.unwrap();
}
}

let proc_info = process_info_t { is_jit_compiler: 0 };
self.bpf
.maps()
.process_info()
Expand Down
Loading

0 comments on commit 0a13d07

Please sign in to comment.