bpfassets: add support to collect cpu frequency with BPF
Signed-off-by: Marcelo Amaral <[email protected]>
marceloamaral committed Dec 14, 2022
1 parent 0cadb2e commit ba931d3
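The frequency collection added here estimates the current CPU frequency from hardware counters: the reference/base frequency (CPU_REF_FREQ) is scaled by the ratio of unhalted cycles to reference cycles observed over the same interval (see get_on_cpu_avg_freq below). A minimal standalone sketch of the same arithmetic, with made-up counter deltas and assuming CPU_REF_FREQ is expressed in MHz:

#include <stdint.h>
#include <stdio.h>

#define CPU_REF_FREQ 2500 /* mirrors the BPF define; assumed here to be MHz */
#define KHZ 1000

int main(void)
{
    /* hypothetical counter deltas for one sampling interval */
    uint64_t cycles_delta = 3000000;     /* unhalted core cycles */
    uint64_t ref_cycles_delta = 2500000; /* fixed-rate reference cycles */

    /* same arithmetic as get_on_cpu_avg_freq */
    uint64_t freq = ((cycles_delta * CPU_REF_FREQ) / ref_cycles_delta) / KHZ;
    printf("estimated frequency: %llu\n", (unsigned long long)freq); /* prints 3 */
    return 0;
}

With a 2.5 GHz base and a core running at 1.2x the reference rate, the scaled value is 3000 MHz, which the final division by KHZ truncates to 3.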
Showing 5 changed files with 386 additions and 286 deletions.
275 changes: 161 additions & 114 deletions bpfassets/perf_event/perf_event.c
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>
// #include <linux/bpf.h>
// #include <linux/bpf_perf_event.h>

#ifndef NUM_CPUS
#define NUM_CPUS 128
#endif

#ifndef CPU_REF_FREQ
#define CPU_REF_FREQ 2500
#endif

#define KHZ 1000

typedef struct process_metrics_t
{
u64 cgroup_id;
u64 pid;
u64 process_run_time;
u64 cpu_cycles;
u64 cpu_instr;
u64 cache_miss;
char comm[16];
} process_metrics_t;

typedef struct pid_time_t
{
u32 pid;
u32 cpu;
} pid_time_t;

BPF_PERF_OUTPUT(events);

// processes and pid time
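// processes: per-PID aggregated metrics; pid_time: timestamp of the last sched switch
// recorded for a PID, used by get_on_cpu_time to compute the elapsed on-CPU time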
BPF_HASH(processes, u64, process_metrics_t);
BPF_HASH(pid_time, pid_time_t);

// perf counters
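// each hardware event is read through a perf-event array (*_hc_reader) and its last
// per-CPU reading is cached in a plain array so per-interval deltas can be computed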
BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_instr, u64, NUM_CPUS);

BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS);
BPF_ARRAY(cache_miss, u64, NUM_CPUS);

// cpu freq counters
BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS);

// On-CPU time (in ms) for the interval that just ended: the delta between now and the
// timestamp stored when prev_pid was switched in; cur_ts is then recorded for cur_pid.
static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts)
{
u64 cpu_time = 0;

// get pid time
pid_time_t prev_pid_key = {.pid = prev_pid};
u64 *prev_ts = pid_time.lookup(&prev_pid_key);
if (prev_ts != 0)
{
// Probably a clock issue where the recorded on-CPU event had a
// timestamp later than the recorded off-CPU event, or vice versa.
// But do not return, since the hardware counters can be collected.
if (cur_ts > *prev_ts)
{
cpu_time = (cur_ts - *prev_ts) / 1000000; /* milliseconds */
pid_time.delete(&prev_pid_key);
}
}
pid_time_t new_pid_key = {.pid = cur_pid};
pid_time.update(&new_pid_key, &cur_ts);

return cpu_time;
}

// Scale a (possibly multiplexed) perf counter reading to its estimated full value
// using the time the event was enabled versus the time it was actually running.
static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running)
{
if (*running > 0)
return *counter * *enabled / *running;
return *counter;
}

// Delta between the current reading and the previously stored one; returns 0 on the
// first reading or if the counter did not advance.
static inline u64 calc_delta(u64 *prev_val, u64 *val)
{
u64 delta = 0;
if (prev_val)
{
if (*val > *prev_val)
delta = *val - *prev_val;
}
return delta;
}

// although the get_on_cpu_* helpers have some code duplication, they are inlined and the compiler will optimize this
static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
u64 delta = 0;
struct bpf_perf_event_value c = {};
int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = normalize(&c.counter, &c.enabled, &c.running);
u64 *prev_val = cpu_cycles.lookup(cpu_id);
delta = calc_delta(prev_val, &val);
cpu_cycles.update(cpu_id, &val);
}
return delta;
}

static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
{
u64 delta = 0;
struct bpf_perf_event_value c = {};
int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = normalize(&c.counter, &c.enabled, &c.running);
u64 *prev_val = cpu_ref_cycles.lookup(cpu_id);
delta = calc_delta(prev_val, &val);
cpu_ref_cycles.update(cpu_id, &val);
}
return delta;
}

static inline u64 get_on_cpu_instr(u32 *cpu_id)
{
u64 delta = 0;
struct bpf_perf_event_value c = {};
int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = normalize(&c.counter, &c.enabled, &c.running);
u64 *prev_val = cpu_instr.lookup(cpu_id);
delta = calc_delta(prev_val, &val);
cpu_instr.update(cpu_id, &val);
}
return delta;
}

static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
{
u64 delta = 0;
struct bpf_perf_event_value c = {};
int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
if (error == 0)
{
u64 val = normalize(&c.counter, &c.enabled, &c.running);
u64 *prev_val = cache_miss.lookup(cpu_id);
delta = calc_delta(prev_val, &val);
cache_miss.update(cpu_id, &val);
}
val = cpu_instr.perf_read(CUR_CPU_IDENTIFIER);
if (((s64)val > 0) || ((s64)val < -256))
return delta;
}

// calculate the average cpu freq
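// The estimate scales the reference/base frequency (CPU_REF_FREQ) by the ratio of
// unhalted cycles to reference cycles over the sampling interval; the else branch
// is meant to average the new estimate with a previously seen value for this CPU.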
static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta)
{
u32 avg_freq = 0;
cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq);
if (avg_freq == 0)
{
avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
}
else
{
avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
avg_freq /= 2;
}
return avg_freq;
}

// Runs on every context switch via a kprobe on finish_task_switch: collects the
// on-CPU time and hardware-counter deltas for the interval that just ended and
// accumulates them into the per-process metrics map.
// int kprobe__finish_task_switch(switch_args *ctx)
int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev)
{
u64 cur_pid = bpf_get_current_pid_tgid() >> 32;
#ifdef SET_GROUP_ID
u64 cgroup_id = bpf_get_current_cgroup_id();
#else
u64 cgroup_id = 0;
#endif

u64 cur_ts = bpf_ktime_get_ns();
u32 cpu_id = bpf_get_smp_processor_id();
u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts);
u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id);
u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id);
u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id);
u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id);
u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta);

// store process metrics
struct process_metrics_t *process_metrics;
process_metrics = processes.lookup(&cur_pid);
if (process_metrics == 0)
{
process_metrics_t new_process = {};
new_process.pid = cur_pid;
new_process.cgroup_id = cgroup_id;
new_process.process_run_time = on_cpu_time_delta;
bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));

new_process.cpu_cycles = on_cpu_cycles_delta;
new_process.cpu_instr = on_cpu_instr_delta;
new_process.cache_miss = on_cpu_cache_miss_delta;

processes.update(&cur_pid, &new_process);
}
else
{
// update process metrics
process_metrics->process_run_time += on_cpu_time_delta;

process_metrics->cpu_cycles += on_cpu_cycles_delta;
process_metrics->cpu_instr += on_cpu_instr_delta;
process_metrics->cache_miss += on_cpu_cache_miss_delta;
}

return 0;