ebpf: Update CPU freq calculation to improve performance #427

Merged: 3 commits, Dec 14, 2022
275 changes: 161 additions & 114 deletions bpfassets/perf_event/perf_event.c
@@ -14,174 +14,221 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>
// #include <linux/bpf.h>
// #include <linux/bpf_perf_event.h>

#ifndef NUM_CPUS
#define NUM_CPUS 128
#endif

#ifndef CPU_REF_FREQ
#define CPU_REF_FREQ 2500
#endif

typedef struct switch_args
{
    u64 pad;
    char prev_comm[16];
    int prev_pid;
    int prev_prio;
    long long prev_state;
    char next_comm[16];
    int next_pid;
    int next_prio;
} switch_args;

#define KHZ 1000

typedef struct process_metrics_t
{
    u64 cgroup_id;
    u64 pid;
    u64 process_run_time;
    u64 cpu_cycles;
    u64 cpu_instr;
    u64 cache_miss;
    char comm[16];
} process_metrics_t;
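// Note: this struct is the value type of the per-pid "processes" hash below, and the
// user-space (Go) reader decodes it from that map, so field order and sizes are assumed
// to need a matching definition on that side.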

typedef struct pid_time_t
{
    u32 pid;
    u32 cpu;
} pid_time_t;

BPF_PERF_OUTPUT(events);

// processes and pid time
BPF_HASH(processes, u64, process_metrics_t);
BPF_HASH(pid_time, pid_time_t);

// perf counters
BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS);

BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS);
BPF_ARRAY(cpu_instr, u64, NUM_CPUS);

BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS);
BPF_ARRAY(cache_miss, u64, NUM_CPUS);

// cpu freq counters
BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS);
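// Each hardware counter is tracked with a pair of maps: a BPF_PERF_ARRAY
// ("*_hc_reader") read with perf_counter_value() on the current CPU, and a plain
// BPF_ARRAY that caches the previous per-CPU reading so the helpers below can report
// deltas instead of absolute counts.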

static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts)
{
    u64 cpu_time = 0;

    // get pid time
    pid_time_t prev_pid_key = {.pid = prev_pid};
    u64 *prev_ts = pid_time.lookup(&prev_pid_key);
    if (prev_ts != 0)
    {
        // Probably a clock issue where the recorded on-CPU event had a
        // timestamp later than the recorded off-CPU event, or vice versa.
        // But do not return, since the hardware counters can still be collected.
        if (cur_ts > *prev_ts)
        {
            cpu_time = (cur_ts - *prev_ts) / 1000000; /* milliseconds */
            pid_time.delete(&prev_pid_key);
        }
    }
    pid_time_t new_pid_key = {.pid = cur_pid};
    pid_time.update(&new_pid_key, &cur_ts);

    return cpu_time;
}
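// Example with illustrative numbers: if the previous on-CPU timestamp for prev_pid was
// recorded 3,500,000 ns ago, the helper returns 3,500,000 / 1,000,000 = 3 ms and then
// re-keys the map with cur_pid at cur_ts.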

static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running)
{
    if (*running > 0)
        return *counter * *enabled / *running;
    return *counter;
}
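// normalize() scales a counter that was only scheduled on the PMU for part of the
// sampling window (perf event multiplexing). Example with illustrative numbers: a raw
// counter of 1,000,000 with enabled = 200,000 ns and running = 100,000 ns is reported
// as 1,000,000 * 200,000 / 100,000 = 2,000,000.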

static inline u64 calc_delta(u64 *prev_val, u64 *val)
{
    u64 delta = 0;
    if (prev_val)
    {
        if (*val > *prev_val)
            delta = *val - *prev_val;
    }
    return delta;
}

// although the "get_on_cpu_*" counter helpers have some code duplication, they are inline and the compiler will optimize this
static inline u64 get_on_cpu_cycles(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_cycles.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_cycles.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_ref_cycles.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_ref_cycles.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_instr(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cpu_instr.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cpu_instr.update(cpu_id, &val);
    }
    return delta;
}

static inline u64 get_on_cpu_cache_miss(u32 *cpu_id)
{
    u64 delta = 0;
    struct bpf_perf_event_value c = {};
    int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value));
    if (error == 0)
    {
        u64 val = normalize(&c.counter, &c.enabled, &c.running);
        u64 *prev_val = cache_miss.lookup(cpu_id);
        delta = calc_delta(prev_val, &val);
        cache_miss.update(cpu_id, &val);
    }
    return delta;
}

// calculate the average cpu freq
static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta)
{
    u32 avg_freq = 0;
    cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq);
    if (avg_freq == 0)
    {
        avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
    }
    else
    {
        avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ;
        avg_freq /= 2;
    }
    return avg_freq;
}
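// Example with illustrative numbers: with CPU_REF_FREQ = 2500 and deltas of 3,000,000
// cycles vs 2,500,000 ref cycles, one sample works out to
// ((3,000,000 * 2500) / 2,500,000) / KHZ = 3000 / 1000 = 3; when avg_freq already holds
// a non-zero value, the new estimate is averaged with it instead.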

// int kprobe__finish_task_switch(switch_args *ctx)
int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev)
{
    u64 cur_pid = bpf_get_current_pid_tgid() >> 32;
#ifdef SET_GROUP_ID
    u64 cgroup_id = bpf_get_current_cgroup_id();
#else
    u64 cgroup_id = 0;
#endif

    u64 cur_ts = bpf_ktime_get_ns();
    u32 cpu_id = bpf_get_smp_processor_id();
    u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts);
    u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id);
    u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id);
    u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id);
    u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id);
    u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta);

    // store process metrics
    struct process_metrics_t *process_metrics;
    process_metrics = processes.lookup(&cur_pid);
    if (process_metrics == 0)
    {
        process_metrics_t new_process = {};
        new_process.pid = cur_pid;
        new_process.cgroup_id = cgroup_id;
        new_process.process_run_time = on_cpu_time_delta;
        bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm));

        new_process.cpu_cycles = on_cpu_cycles_delta;
        new_process.cpu_instr = on_cpu_instr_delta;
        new_process.cache_miss = on_cpu_cache_miss_delta;

        processes.update(&cur_pid, &new_process);
    }
    else
    {
        process_metrics->process_run_time += on_cpu_time_delta;

        process_metrics->cpu_cycles += on_cpu_cycles_delta;
        process_metrics->cpu_instr += on_cpu_instr_delta;
        process_metrics->cache_miss += on_cpu_cache_miss_delta;
    }

    return 0;