ebpf: Update CPU freq calculation to improve performance #427

Merged: 3 commits, Dec 14, 2022
275 changes: 161 additions & 114 deletions bpfassets/perf_event/perf_event.c
@@ -14,174 +14,221 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>
// #include <linux/bpf.h>
// #include <linux/bpf_perf_event.h>

#ifndef NUM_CPUS
#define NUM_CPUS 128
#endif

#ifndef CPU_REF_FREQ
#define CPU_REF_FREQ 2500
#endif

typedef struct switch_args
{
    u64 pad;
    char prev_comm[16];
    int prev_pid;
    int prev_prio;
    long long prev_state;
    char next_comm[16];
    int next_pid;
    int next_prio;
} switch_args;

#define KHZ 1000

typedef struct process_metrics_t
{
    u64 cgroup_id;
    u64 pid;
    u64 process_run_time;
    u64 cpu_cycles;
    u64 cpu_instr;
    u64 cache_miss;
    char comm[16];
} process_metrics_t;
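// Note: this struct is the value type of the per-pid "processes" hash below, and the
// user-space (Go) reader decodes it from that map, so field order and sizes are assumed
// to need a matching definition on that side.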

typedef struct pid_time_t
{
    u32 pid;
    u32 cpu;
} pid_time_t;

BPF_PERF_OUTPUT(events);

// processes and pid time
BPF_HASH(processes, u64, process_metrics_t);
BPF_HASH(pid_time, pid_time_t);

// perf counters
BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_instr, u64, NUM_CPUS);

BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS);
BPF_ARRAY(cache_miss, u64, NUM_CPUS);

// cpu freq counters
BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS);
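// Each hardware counter is tracked with a pair of maps: a BPF_PERF_ARRAY
// ("*_hc_reader") read with perf_counter_value() on the current CPU, and a plain
// BPF_ARRAY that caches the previous per-CPU reading so the helpers below can report
// deltas instead of absolute counts.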

static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts)
{
    u64 cpu_time = 0;

    // get pid time
    pid_time_t prev_pid_key = {.pid = prev_pid};
    u64 *prev_ts = pid_time.lookup(&prev_pid_key);
    if (prev_ts != 0)
    {
        // Probably a clock issue where the recorded on-CPU event had a
        // timestamp later than the recorded off-CPU event, or vice versa.
        // But do not return, since the hardware counters can still be collected.
        if (cur_ts > *prev_ts)
        {
            cpu_time = (cur_ts - *prev_ts) / 1000000; /* milliseconds */
            pid_time.delete(&prev_pid_key);
        }
    }
    pid_time_t new_pid_key = {.pid = cur_pid};
    pid_time.update(&new_pid_key, &cur_ts);

    return cpu_time;
}
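// Example with illustrative numbers: if the previous on-CPU timestamp for prev_pid was
// recorded 3,500,000 ns ago, the helper returns 3,500,000 / 1,000,000 = 3 ms and then
// re-keys the map with cur_pid at cur_ts.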

static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running)
{
    if (*running > 0)
        return *counter * *enabled / *running;
    return *counter;
}
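// normalize() scales a counter that was only scheduled on the PMU for part of the
// sampling window (perf event multiplexing). Example with illustrative numbers: a raw
// counter of 1,000,000 with enabled = 200,000 ns and running = 100,000 ns is reported
// as 1,000,000 * 200,000 / 100,000 = 2,000,000.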

static inline u64 calc_delta(u64 *prev_val, u64 *val)
{
    u64 delta = 0;
    if (prev_val)
    {
        if (*val > *prev_val)
            delta = *val - *prev_val;
    }
    return delta;
}

// although the "get_on_cpu_*" counter helpers have some code duplication, they are inline and the compiler will optimize this
static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_cycles.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_cycles.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_ref_cycles.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_ref_cycles.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_instr(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_instr.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_instr.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cache_miss.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cache_miss.update(cpu_id, &val);
    }
    return delta;
}

// calculate the average cpu freq
static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta)
{
    u32 avg_freq = 0;
    cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq);
    if (avg_freq == 0)
    {
        avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
    }
    else
    {
        avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
        avg_freq /= 2;
    }
    return avg_freq;
}
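// Example with illustrative numbers: with CPU_REF_FREQ = 2500 and deltas of 3,000,000
// cycles vs 2,500,000 ref cycles, one sample works out to
// ((3,000,000 * 2500) / 2,500,000) / KHZ = 3000 / 1000 = 3; when avg_freq already holds
// a non-zero value, the new estimate is averaged with it instead.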

// int kprobe__finish_task_switch(switch_args *ctx)
int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev)
{
    u64 cur_pid = bpf_get_current_pid_tgid() >> 32;
#ifdef SET_GROUP_ID
    u64 cgroup_id = bpf_get_current_cgroup_id();
#else
    u64 cgroup_id = 0;
#endif

    u64 cur_ts = bpf_ktime_get_ns();
    u32 cpu_id = bpf_get_smp_processor_id();
    u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts);
    u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id);
    u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id);
    u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id);
    u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id);
    u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta);

    // store process metrics
    struct process_metrics_t *process_metrics;
    process_metrics = processes.lookup(&cur_pid);
    if (process_metrics == 0)
    {
        process_metrics_t new_process = {};
        new_process.pid = cur_pid;
        new_process.cgroup_id = cgroup_id;
        new_process.process_run_time = on_cpu_time_delta;
        bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));

        new_process.cpu_cycles = on_cpu_cycles_delta;
        new_process.cpu_instr = on_cpu_instr_delta;
        new_process.cache_miss = on_cpu_cache_miss_delta;

        processes.update(&cur_pid, &new_process);
    }
    else
    {
        process_metrics->process_run_time += on_cpu_time_delta;

        process_metrics->cpu_cycles += on_cpu_cycles_delta;
        process_metrics->cpu_instr += on_cpu_instr_delta;
        process_metrics->cache_miss += on_cpu_cache_miss_delta;
    }

    return 0;