From b181ec3fe8265df6ce19f9ba832fd4c1ce2fd7a5 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 23:35:38 +0900 Subject: [PATCH 1/3] acpi: update to only read cpu freq files when BPF is disabled Signed-off-by: Marcelo Amaral --- pkg/power/acpi/acpi.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pkg/power/acpi/acpi.go b/pkg/power/acpi/acpi.go index 6ee73a398e..2f29bb8378 100644 --- a/pkg/power/acpi/acpi.go +++ b/pkg/power/acpi/acpi.go @@ -64,20 +64,23 @@ func NewACPIPowerMeter() *ACPI { return acpi } -func (a *ACPI) Run() { +func (a *ACPI) Run(isEBPFEnabled bool) { go func() { for { select { case <-a.stopChannel: return default: - cpuCoreFrequency := getCPUCoreFrequency() - a.mu.Lock() - for cpu, freq := range cpuCoreFrequency { - // average cpu frequency - a.cpuCoreFrequency[cpu] = (cpuCoreFrequency[cpu] + freq) / 2 + // in the case we cannot collect the cpu frequency using EBPF + if !isEBPFEnabled { + cpuCoreFrequency := getCPUCoreFrequency() + a.mu.Lock() + for cpu, freq := range cpuCoreFrequency { + // average cpu frequency + a.cpuCoreFrequency[cpu] = (cpuCoreFrequency[cpu] + freq) / 2 + } + a.mu.Unlock() } - a.mu.Unlock() if a.collectEnergy { if sensorPower, err := getPowerFromSensor(); err == nil { @@ -92,6 +95,11 @@ func (a *ACPI) Run() { } } + // stop the gorotime if there is nothing to do + if (isEBPFEnabled) && (!a.collectEnergy) { + return + } + time.Sleep(poolingInterval) } } From 0cadb2ee70255d06a410c1e3fb6b7c99ddb22d64 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 23:35:40 +0900 Subject: [PATCH 2/3] collector: remove expensive cpu freq calculation Signed-off-by: Marcelo Amaral --- pkg/collector/container_hc_collector.go | 62 +------------------- pkg/collector/container_hc_collector_test.go | 25 +------- pkg/collector/metric/container_metric.go | 24 +++----- pkg/collector/metric/stats.go | 2 - pkg/collector/metric/utils.go | 3 - pkg/collector/metric/utils_test.go | 23 ++------ pkg/collector/metric_collector.go | 2 +- pkg/collector/node_energy_collector.go | 14 +++++ pkg/collector/prometheus_collector.go | 22 ++++--- pkg/config/types.go | 1 + 10 files changed, 40 insertions(+), 138 deletions(-) diff --git a/pkg/collector/container_hc_collector.go b/pkg/collector/container_hc_collector.go index 710381cc95..2f1bb7eaa3 100644 --- a/pkg/collector/container_hc_collector.go +++ b/pkg/collector/container_hc_collector.go @@ -42,13 +42,11 @@ type ProcessBPFMetrics struct { CPUInstr uint64 CacheMisses uint64 Command [16]byte - CPUTime [C.CPU_VECTOR_SIZE]uint16 } // resetBPFTables reset BPF module's tables func (c *Collector) resetBPFTables() { c.bpfHCMeter.Table.DeleteAll() - c.bpfHCMeter.TimeTable.DeleteAll() } // updateBPFMetrics reads the BPF tables with process/pid/cgroupid metrics (CPU time, available HW counters) @@ -82,29 +80,14 @@ func (c *Collector) updateBPFMetrics() { c.ContainersMetrics[containerID].SetLatestProcess(ct.CGroupID, ct.PID, C.GoString(comm)) } - var activeCPUs []int32 - var avgFreq float64 - var totalCPUTime uint64 - if attacher.EnableCPUFreq { - avgFreq, totalCPUTime, activeCPUs = getAVGCPUFreqAndTotalCPUTime(c.NodeMetrics.CPUFrequency, &ct.CPUTime) - c.ContainersMetrics[containerID].AvgCPUFreq = avgFreq - } else { - totalCPUTime = ct.ProcessRunTime - activeCPUs = getActiveCPUs(&ct.CPUTime) - } - - for _, cpu := range activeCPUs { - c.ContainersMetrics[containerID].CurrCPUTimePerCPU[uint32(cpu)] += uint64(ct.CPUTime[cpu]) - } - - if err = 
c.ContainersMetrics[containerID].CPUTime.AddNewCurr(totalCPUTime); err != nil { + if err = c.ContainersMetrics[containerID].CPUTime.AddNewCurr(ct.ProcessRunTime); err != nil { klog.V(5).Infoln(err) } for _, counterKey := range collector_metric.AvailableCounters { var val uint64 switch counterKey { - case attacher.CPUCycleLable: + case attacher.CPUCycleLabel: val = ct.CPUCycles case attacher.CPUInstructionLabel: val = ct.CPUInstr @@ -136,47 +119,6 @@ func (c *Collector) updateBPFMetrics() { c.handleInactiveContainers(foundContainer) } -// getAVGCPUFreqAndTotalCPUTime calculates the weighted cpu frequency average -func getAVGCPUFreqAndTotalCPUTime(cpuFrequency map[int32]uint64, cpuTime *[C.CPU_VECTOR_SIZE]uint16) (avgFreq float64, totalCPUTime uint64, activeCPUs []int32) { - totalFreq := float64(0) - totalFreqWithoutWeight := float64(0) - for cpu, freq := range cpuFrequency { - if int(cpu) > len((*cpuTime))-1 { - break - } - totalCPUTime += uint64(cpuTime[cpu]) - totalFreqWithoutWeight += float64(freq) - } - if totalCPUTime == 0 { - if len(cpuFrequency) == 0 { - return - } - avgFreq = totalFreqWithoutWeight / float64(len(cpuFrequency)) - } else { - for cpu, freq := range cpuFrequency { - if int(cpu) > len((*cpuTime))-1 { - break - } - if cpuTime[cpu] != 0 { - totalFreq += float64(freq) * (float64(cpuTime[cpu]) / float64(totalCPUTime)) - activeCPUs = append(activeCPUs, cpu) - } - } - avgFreq = totalFreqWithoutWeight / float64(len(cpuFrequency)) - } - return -} - -// getActiveCPUs returns active cpu(vcpu) (in case that frequency is not active) -func getActiveCPUs(cpuTime *[C.CPU_VECTOR_SIZE]uint16) (activeCPUs []int32) { - for cpu := range cpuTime { - if cpuTime[cpu] != 0 { - activeCPUs = append(activeCPUs, int32(cpu)) - } - } - return -} - // handleInactiveContainers func (c *Collector) handleInactiveContainers(foundContainer map[string]bool) { numOfInactive := len(c.ContainersMetrics) - len(foundContainer) diff --git a/pkg/collector/container_hc_collector_test.go b/pkg/collector/container_hc_collector_test.go index d62bbd42cb..d07ddfa74c 100644 --- a/pkg/collector/container_hc_collector_test.go +++ b/pkg/collector/container_hc_collector_test.go @@ -2,30 +2,7 @@ package collector import ( . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" ) -var _ = Describe("Test for active containers", func() { - It("Get Active CPUs", func() { - var cpuTime [128]uint16 - cpuTime[3] = 1 - cpuTime[5] = 1 - cpuTime[12] = 1 - ac := getActiveCPUs(&cpuTime) - Expect(3).To(Equal(len(ac))) - Expect(int32(3)).To(Equal(ac[0])) - Expect(int32(5)).To(Equal(ac[1])) - Expect(int32(12)).To(Equal(ac[2])) - }) - - It("getAVGCPUFreqAndTotalCPUTime", func() { - var cpuTime [128]uint16 - cpuTime[3] = 1 - cpuFrequency := make(map[int32]uint64) - cpuFrequency[int32(3)] = 1 - avgFreq, totalCPUTime, activeCPUs := getAVGCPUFreqAndTotalCPUTime(cpuFrequency, &cpuTime) - Expect(float64(1)).To(Equal(avgFreq)) - Expect(uint64(1)).To(Equal(totalCPUTime)) - Expect(int32(3)).To(Equal(activeCPUs[0])) - }) +var _ = Describe("Test hc collector", func() { }) diff --git a/pkg/collector/metric/container_metric.go b/pkg/collector/metric/container_metric.go index 49122276e8..c27724fad9 100644 --- a/pkg/collector/metric/container_metric.go +++ b/pkg/collector/metric/container_metric.go @@ -48,7 +48,6 @@ type ContainerMetrics struct { // TODO: we should consider deprecate the command information Command string - AvgCPUFreq float64 CurrProcesses int Disks int @@ -62,8 +61,6 @@ type ContainerMetrics struct { BytesRead *UInt64StatCollection BytesWrite *UInt64StatCollection - CurrCPUTimePerCPU map[uint32]uint64 - EnergyInCore *UInt64Stat EnergyInDRAM *UInt64Stat EnergyInUncore *UInt64Stat @@ -90,14 +87,13 @@ func NewContainerMetrics(containerName, podName, podNamespace string) *Container BytesWrite: &UInt64StatCollection{ Stat: make(map[string]*UInt64Stat), }, - CurrCPUTimePerCPU: make(map[uint32]uint64), - EnergyInCore: &UInt64Stat{}, - EnergyInDRAM: &UInt64Stat{}, - EnergyInUncore: &UInt64Stat{}, - EnergyInPkg: &UInt64Stat{}, - EnergyInOther: &UInt64Stat{}, - EnergyInGPU: &UInt64Stat{}, - DynEnergy: &UInt64Stat{}, + EnergyInCore: &UInt64Stat{}, + EnergyInDRAM: &UInt64Stat{}, + EnergyInUncore: &UInt64Stat{}, + EnergyInPkg: &UInt64Stat{}, + EnergyInOther: &UInt64Stat{}, + EnergyInGPU: &UInt64Stat{}, + DynEnergy: &UInt64Stat{}, } for _, metricName := range AvailableCounters { c.CounterStats[metricName] = &UInt64Stat{} @@ -133,7 +129,6 @@ func (c *ContainerMetrics) ResetCurr() { for kubeletKey := range c.KubeletStats { c.KubeletStats[kubeletKey].ResetCurr() } - c.CurrCPUTimePerCPU = make(map[uint32]uint64) c.EnergyInCore.ResetCurr() c.EnergyInDRAM.ResetCurr() c.EnergyInUncore.ResetCurr() @@ -237,9 +232,6 @@ func (c *ContainerMetrics) ToPrometheusValue(metric string) string { if metric == "block_devices_used" { return strconv.FormatUint(uint64(c.Disks), 10) } - if metric == "avg_cpu_frequency" { - return fmt.Sprintf("%f", c.AvgCPUFreq) - } if curr, aggr, err := c.getFloatCurrAndAggrValue(metric); err == nil { if currentValue { return fmt.Sprintf("%f", curr) @@ -285,7 +277,6 @@ func (c *ContainerMetrics) String() string { "\tcgrouppid: %d pid: %d comm: %s\n"+ "\tePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s \n"+ "\teDyn (mJ): %s \n"+ - "\tavgFreq: %.2f\n"+ "\tCPUTime: %d (%d)\n"+ "\tcounters: %v\n"+ "\tcgroupfs: %v\n"+ @@ -294,7 +285,6 @@ func (c *ContainerMetrics) String() string { c.CGroupPID, c.PIDS, c.Command, c.EnergyInPkg, c.EnergyInCore, c.EnergyInDRAM, c.EnergyInUncore, c.EnergyInGPU, c.EnergyInOther, c.DynEnergy, - c.AvgCPUFreq/1000, /*MHZ*/ c.CPUTime.Curr, c.CPUTime.Aggr, c.CounterStats, c.CgroupFSStats, diff --git a/pkg/collector/metric/stats.go b/pkg/collector/metric/stats.go index 25293094c3..db0885059f 100644 
--- a/pkg/collector/metric/stats.go +++ b/pkg/collector/metric/stats.go @@ -12,8 +12,6 @@ import ( ) const ( - freqMetricLabel = config.CPUFrequency - // TO-DO: merge to cgroup stat ByteReadLabel = config.BytesReadIO ByteWriteLabel = config.BytesWriteIO diff --git a/pkg/collector/metric/utils.go b/pkg/collector/metric/utils.go index e75aa1f3e1..6f620feec9 100644 --- a/pkg/collector/metric/utils.go +++ b/pkg/collector/metric/utils.go @@ -59,9 +59,6 @@ func getPrometheusMetrics() []string { for _, feature := range ContainerFeaturesNames { labels = append(labels, CurrPrefix+feature, AggrPrefix+feature) } - if attacher.EnableCPUFreq { - labels = append(labels, freqMetricLabel) - } // TO-DO: remove this hard code metric labels = append(labels, blockDeviceLabel) return labels diff --git a/pkg/collector/metric/utils_test.go b/pkg/collector/metric/utils_test.go index 41f381eda9..3a95277959 100644 --- a/pkg/collector/metric/utils_test.go +++ b/pkg/collector/metric/utils_test.go @@ -4,7 +4,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/sustainable-computing-io/kepler/pkg/bpfassets/attacher" + "github.com/sustainable-computing-io/kepler/pkg/config" ) func clearPlatformDependentAvailability() { @@ -33,9 +33,6 @@ var _ = Describe("Test Metric Unit", func() { clearPlatformDependentAvailability() exp := []string{"curr_cpu_time", "total_cpu_time", "curr_bytes_read", "total_bytes_read", "curr_bytes_writes", "total_bytes_writes", "block_devices_used"} - if attacher.EnableCPUFreq { - exp = []string{"curr_cpu_time", "total_cpu_time", "curr_bytes_read", "total_bytes_read", "curr_bytes_writes", "total_bytes_writes", "avg_cpu_frequency", "block_devices_used"} - } cur := getPrometheusMetrics() Expect(exp).To(Equal(cur)) }) @@ -61,22 +58,10 @@ var _ = Describe("Test Metric Unit", func() { }) It("Test setEnabledMetrics", func() { + config.ExposeHardwareCounterMetrics = false clearPlatformDependentAvailability() cur := setEnabledMetrics() - - if attacher.EnableCPUFreq { - expMap := make(map[string]int, 7) - exp := []string{"cpu_time", "cpu_instr", "cache_miss", "cpu_cycles", "bytes_read", "bytes_writes", "block_devices_used"} - for _, v := range exp { - expMap[v] = 1 - } - for _, vCur := range cur { - _, ok := expMap[vCur] - Expect(ok).To(BeTrue()) - } - } else { - exp := []string{"cpu_time", "bytes_read", "bytes_writes", "block_devices_used"} - Expect(exp).To(Equal(cur)) - } + exp := []string{"cpu_time", "bytes_read", "bytes_writes", "block_devices_used"} + Expect(exp).To(Equal(cur)) }) }) diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index edc958b367..8b18e4df46 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -80,7 +80,7 @@ func (c *Collector) Initialize() error { c.prePopulateContainerMetrics(pods) c.updateNodeEnergyMetrics() - c.acpiPowerMeter.Run() + c.acpiPowerMeter.Run(attacher.HardwareCountersEnabled) c.resetBPFTables() return nil diff --git a/pkg/collector/node_energy_collector.go b/pkg/collector/node_energy_collector.go index 36fe6b5cd5..33b6f58b13 100644 --- a/pkg/collector/node_energy_collector.go +++ b/pkg/collector/node_energy_collector.go @@ -17,6 +17,9 @@ limitations under the License. 
package collector import ( + "encoding/binary" + + "github.com/sustainable-computing-io/kepler/pkg/bpfassets/attacher" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model" "github.com/sustainable-computing-io/kepler/pkg/power/accelerator" @@ -69,6 +72,17 @@ func (c *Collector) updateNodeGPUEnergy() { // updateNodeAvgCPUFrequency updates the average CPU frequency in each core func (c *Collector) updateNodeAvgCPUFrequency() { + // update the cpu frequency using hardware counters when available because reading files can be very expensive + if attacher.HardwareCountersEnabled { + cpuFreq := map[int32]uint64{} + for it := c.bpfHCMeter.CPUFreqTable.Iter(); it.Next(); { + cpu := int32(binary.LittleEndian.Uint32(it.Key())) + freq := uint64(binary.LittleEndian.Uint32(it.Leaf())) + cpuFreq[cpu] = freq + } + c.NodeMetrics.CPUFrequency = cpuFreq + return + } c.NodeMetrics.CPUFrequency = c.acpiPowerMeter.GetCPUCoreFrequency() } diff --git a/pkg/collector/prometheus_collector.go b/pkg/collector/prometheus_collector.go index f05388b129..1fdc824e04 100644 --- a/pkg/collector/prometheus_collector.go +++ b/pkg/collector/prometheus_collector.go @@ -350,8 +350,8 @@ func (p *PrometheusCollector) newContainerMetrics() { // Additional metrics (gauge) containerCPUTime := prometheus.NewDesc( prometheus.BuildFQName(namespace, "container", "cpu_cpu_time_us"), - "Current CPU time per CPU", - []string{"pod_name", "container_name", "container_namespace", "cpu"}, nil, + "Aggregated CPU time", + []string{"pod_name", "container_name", "container_namespace"}, nil, ) p.containerDesc = &ContainerDesc{ @@ -524,14 +524,12 @@ func (p *PrometheusCollector) UpdatePodMetrics(wg *sync.WaitGroup, ch chan<- pro float64(container.Curr()), podEnergyStatusLabelValues..., ) - for cpu, cpuTime := range container.CurrCPUTimePerCPU { - ch <- prometheus.MustNewConstMetric( - p.containerDesc.containerCPUTime, - prometheus.GaugeValue, - float64(cpuTime), - container.PodName, container.ContainerName, container.Namespace, strconv.Itoa(int(cpu)), - ) - } + ch <- prometheus.MustNewConstMetric( + p.containerDesc.containerCPUTime, + prometheus.CounterValue, + float64(container.CPUTime.Aggr), + container.PodName, container.ContainerName, container.Namespace, + ) ch <- prometheus.MustNewConstMetric( p.containerDesc.containerCoreJoulesTotal, prometheus.CounterValue, @@ -581,11 +579,11 @@ func (p *PrometheusCollector) UpdatePodMetrics(wg *sync.WaitGroup, ch chan<- pro container.PodName, container.ContainerName, container.Namespace, containerCommand, ) if collector_metric.CPUHardwareCounterEnabled { - if container.CounterStats[attacher.CPUCycleLable] != nil { + if container.CounterStats[attacher.CPUCycleLabel] != nil { ch <- prometheus.MustNewConstMetric( p.containerDesc.containerCPUCyclesTotal, prometheus.CounterValue, - float64(container.CounterStats[attacher.CPUCycleLable].Aggr), + float64(container.CounterStats[attacher.CPUCycleLabel].Aggr), container.PodName, container.ContainerName, container.Namespace, containerCommand, ) } diff --git a/pkg/config/types.go b/pkg/config/types.go index b74f0884f0..4af1c3ad48 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -19,6 +19,7 @@ package config const ( // counter - attacher package CPUCycle = "cpu_cycles" + CPURefCycle = "cpu_ref_cycles" CPUInstruction = "cpu_instr" CacheMiss = "cache_miss" From ba931d323bb3be5d2d02240b090c81b98091ec54 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 
23:35:40 +0900 Subject: [PATCH 3/3] bpfassets: add support to collect cpu frequency with BPF Signed-off-by: Marcelo Amaral --- bpfassets/perf_event/perf_event.c | 275 ++++++++++------- pkg/bpfassets/attacher/bcc_attacher.go | 50 ++-- pkg/bpfassets/attacher/bcc_attacher_stub.go | 33 ++- pkg/bpfassets/attacher/bpf_perf.go | 2 +- pkg/bpfassets/perf_event_bindata.go | 312 +++++++++++--------- 5 files changed, 386 insertions(+), 286 deletions(-) diff --git a/bpfassets/perf_event/perf_event.c b/bpfassets/perf_event/perf_event.c index 4899f5c31d..b702b838ad 100644 --- a/bpfassets/perf_event/perf_event.c +++ b/bpfassets/perf_event/perf_event.c @@ -14,174 +14,221 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include +// #include +// #include #ifndef NUM_CPUS #define NUM_CPUS 128 #endif -// we cannot define it dynamically as NUM_CPUS because the golang needs to know this -// size at compiler time for decoding -#define CPU_VECTOR_SIZE 128 +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif -typedef struct switch_args -{ - u64 pad; - char prev_comm[16]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[16]; - int next_pid; - int next_prio; -} switch_args; - -typedef struct process_time_t +#define KHZ 1000 + +typedef struct process_metrics_t { u64 cgroup_id; u64 pid; u64 process_run_time; u64 cpu_cycles; u64 cpu_instr; - u64 cache_misses; + u64 cache_miss; char comm[16]; - //u64 pad; - // the max eBPF stack limit is 512 bytes, which is a vector of u16 with 128 elements - // the time is calculated in miliseconds, uint16 max size is 65K, ~1mim - u16 cpu_time[CPU_VECTOR_SIZE]; -} process_time_t; +} process_metrics_t; typedef struct pid_time_t { - int pid; + u32 pid; + u32 cpu; } pid_time_t; -BPF_PERF_OUTPUT(events); - // processes and pid time -BPF_HASH(processes, u64, process_time_t); +BPF_HASH(processes, u64, process_metrics_t); BPF_HASH(pid_time, pid_time_t); // perf counters -BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS); -BPF_PERF_ARRAY(cpu_instr, NUM_CPUS); -BPF_PERF_ARRAY(cache_miss, NUM_CPUS); +BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_cycles, u64, NUM_CPUS); -// tracking counters -BPF_ARRAY(prev_cpu_cycles, u64, NUM_CPUS); -BPF_ARRAY(prev_cpu_instr, u64, NUM_CPUS); -BPF_ARRAY(prev_cache_miss, u64, NUM_CPUS); +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS); -static void safe_array_add(u32 idx, u16 *array, u16 value) +BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_instr, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS); +BPF_ARRAY(cache_miss, u64, NUM_CPUS); + +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS); + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) { -#pragma clang loop unroll(full) - for (int array_index = 0; array_index < CPU_VECTOR_SIZE-1; array_index++) + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts = pid_time.lookup(&prev_pid_key); + if (prev_ts != 0) { - if (array_index == idx) + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + // But do not return, since the hardware counters can be collected. 
+ if (cur_ts > *prev_ts) { - array[array_index] += value; - break; + cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/ + pid_time.delete(&prev_pid_key); } } + pid_time_t new_pid_key = {.pid = cur_pid}; + pid_time.update(&new_pid_key, &cur_ts); + + return cpu_time; } -int sched_switch(switch_args *ctx) +static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running) { - u64 pid = bpf_get_current_pid_tgid() >> 32; -#ifdef SET_GROUP_ID - u64 cgroup_id = bpf_get_current_cgroup_id(); -#else - u64 cgroup_id = 0; -#endif + if (*running > 0) + return *counter * *enabled / *running; + return *counter; +} - u64 time = bpf_ktime_get_ns(); +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ u64 delta = 0; - u32 cpu_id = bpf_get_smp_processor_id(); - pid_time_t new_pid, old_pid; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} - // get pid time - old_pid.pid = ctx->prev_pid; - u64 *last_time = pid_time.lookup(&old_pid); - if (last_time != 0) +// altough the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - delta = (time - *last_time) / 1000000; /*milisecond*/ - // return if the process did not use any cpu time yet - if (delta == 0) - { - return 0; - } - pid_time.delete(&old_pid); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_cycles.update(cpu_id, &val); } + return delta; +} - new_pid.pid = ctx->next_pid; - pid_time.lookup_or_try_init(&new_pid, &time); +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_ref_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_ref_cycles.update(cpu_id, &val); + } + return delta; +} - u64 cpu_cycles_delta = 0; - u64 cpu_instr_delta = 0; - u64 cache_miss_delta = 0; - u64 *prev; +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_instr.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_instr.update(cpu_id, &val); + } + return delta; +} - u64 val = cpu_cycles.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - prev = prev_cpu_cycles.lookup(&cpu_id); - if (prev) - { - cpu_cycles_delta = val - *prev; - } - prev_cpu_cycles.update(&cpu_id, &val); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cache_miss.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cache_miss.update(cpu_id, &val); } - val = cpu_instr.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 
0) || ((s64)val < -256)) + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq); + if (avg_freq == 0) { - prev = prev_cpu_instr.lookup(&cpu_id); - if (prev) - { - cpu_instr_delta = val - *prev; - } - prev_cpu_instr.update(&cpu_id, &val); + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ; } - val = cache_miss.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) + else { - prev = prev_cache_miss.lookup(&cpu_id); - if (prev) - { - cache_miss_delta = val - *prev; - } - prev_cache_miss.update(&cpu_id, &val); + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ; + avg_freq /= 2; } + return avg_freq; +} - // init process time - struct process_time_t *process_time; - process_time = processes.lookup(&pid); - if (process_time == 0) +// int kprobe__finish_task_switch(switch_args *ctx) +int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) +{ + u64 cur_pid = bpf_get_current_pid_tgid() >> 32; +#ifdef SET_GROUP_ID + u64 cgroup_id = bpf_get_current_cgroup_id(); +#else + u64 cgroup_id = 0; +#endif + + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts); + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = processes.lookup(&cur_pid); + if (process_metrics == 0) { - process_time_t new_process = {}; - new_process.pid = pid; + process_metrics_t new_process = {}; + new_process.pid = cur_pid; new_process.cgroup_id = cgroup_id; - new_process.cpu_cycles = cpu_cycles_delta; - new_process.cpu_instr = cpu_instr_delta; - new_process.cache_misses = cache_miss_delta; - new_process.process_run_time += delta; -#ifdef CPU_FREQ - //FIXME: for certain reason, hyper-v seems to always get a cpu_id that is same as NUM_CPUS and cause stack overrun - safe_array_add(cpu_id, new_process.cpu_time, delta); -#endif + new_process.process_run_time = on_cpu_time_delta; bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); - processes.update(&pid, &new_process); + + new_process.cpu_cycles = on_cpu_cycles_delta; + new_process.cpu_instr = on_cpu_instr_delta; + new_process.cache_miss = on_cpu_cache_miss_delta; + + processes.update(&cur_pid, &new_process); } else { // update process time - process_time->cpu_cycles += cpu_cycles_delta; - process_time->cpu_instr += cpu_instr_delta; - process_time->cache_misses += cache_miss_delta; - process_time->process_run_time += delta; -#ifdef CPU_FREQ - safe_array_add(cpu_id, process_time->cpu_time, delta); -#endif + process_metrics->process_run_time += on_cpu_time_delta; + + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; } return 0; diff --git a/pkg/bpfassets/attacher/bcc_attacher.go b/pkg/bpfassets/attacher/bcc_attacher.go index ffbc561f82..5f4b4ac17d 100644 --- a/pkg/bpfassets/attacher/bcc_attacher.go +++ 
b/pkg/bpfassets/attacher/bcc_attacher.go @@ -41,24 +41,27 @@ type perfCounter struct { } type BpfModuleTables struct { - Module *bpf.Module - Table *bpf.Table - TimeTable *bpf.Table + Module *bpf.Module + Table *bpf.Table + CPUFreqTable *bpf.Table } const ( - CPUCycleLable = config.CPUCycle + CPUCycleLabel = config.CPUCycle + CPURefCycleLabel = config.CPURefCycle CPUInstructionLabel = config.CPUInstruction CacheMissLabel = config.CacheMiss + bpfPerfArrayPrefix = "_hc_reader" ) var ( Counters = map[string]perfCounter{ - CPUCycleLable: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CPU_CYCLES, true}, + CPUCycleLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CPU_CYCLES, true}, + CPURefCycleLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_REF_CPU_CYCLES, true}, CPUInstructionLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_INSTRUCTIONS, true}, CacheMissLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CACHE_MISSES, true}, } - EnableCPUFreq = true + HardwareCountersEnabled = false ) func loadModule(objProg []byte, options []string) (m *bpf.Module, err error) { @@ -70,24 +73,28 @@ func loadModule(objProg []byte, options []string) (m *bpf.Module, err error) { }() m = bpf.NewModule(string(objProg), options) // TODO make all entrypoints yaml-declarable - sched_switch, err := m.LoadTracepoint("sched_switch") + ftswitch, err := m.LoadKprobe("kprobe__finish_task_switch") if err != nil { - return nil, fmt.Errorf("failed to load sched_switch: %s", err) + return nil, fmt.Errorf("failed to load kprobe__finish_task_switch: %s", err) } - err = m.AttachTracepoint("sched:sched_switch", sched_switch) + err = m.AttachKprobe("finish_task_switch", ftswitch, -1) if err != nil { - return nil, fmt.Errorf("failed to attach sched_switch: %s", err) + err = m.AttachKprobe("finish_task_switch.isra.0", ftswitch, -1) + if err != nil { + return nil, fmt.Errorf("failed to attach finish_task_switch: %s", err) + } } for arrayName, counter := range Counters { - t := bpf.NewTable(m.TableId(arrayName), m) + bpfPerfArrayName := arrayName + bpfPerfArrayPrefix + t := bpf.NewTable(m.TableId(bpfPerfArrayName), m) if t == nil { - return nil, fmt.Errorf("failed to find perf array: %s", arrayName) + return nil, fmt.Errorf("failed to find perf array: %s", bpfPerfArrayName) } perfErr := openPerfEvent(t, counter.evType, counter.evConfig) if perfErr != nil { // some hypervisors don't expose perf counters - klog.Infof("failed to attach perf event %s: %v\n", arrayName, perfErr) + klog.Infof("failed to attach perf event %s: %v\n", bpfPerfArrayName, perfErr) counter.enabled = false } } @@ -103,30 +110,23 @@ func AttachBPFAssets() (*BpfModuleTables, error) { } options := []string{ "-DNUM_CPUS=" + strconv.Itoa(runtime.NumCPU()), - "-DCPU_FREQ", } if config.EnabledEBPFCgroupID { options = append(options, "-DSET_GROUP_ID") } + // TODO: verify if ebpf can run in the VM without hardware counter support, if not, we can disable the HC part and only collect the cpu time m, err := loadModule(objProg, options) if err != nil { - klog.Warningf("failed to attach perf module with options %v: %v, Hardware counter related metrics does not exist\n", options, err) - options = []string{"-DNUM_CPUS=" + strconv.Itoa(runtime.NumCPU())} - EnableCPUFreq = false - m, err = loadModule(objProg, options) - if err != nil { - klog.Infof("failed to attach perf module with options %v: %v, not able to load eBPF modules\n", options, err) - // at this time, there is not much we can do with the eBPF module - return nil, err - } + klog.Infof("failed to attach perf module 
with options %v: %v, not able to load eBPF modules\n", options, err) + return nil, err } table := bpf.NewTable(m.TableId("processes"), m) - timeTable := bpf.NewTable(m.TableId("pid_time"), m) + cpuFreqTable := bpf.NewTable(m.TableId("cpu_freq_array"), m) bpfModules.Module = m bpfModules.Table = table - bpfModules.TimeTable = timeTable + bpfModules.CPUFreqTable = cpuFreqTable klog.Infof("Successfully load eBPF module with option: %s", options) diff --git a/pkg/bpfassets/attacher/bcc_attacher_stub.go b/pkg/bpfassets/attacher/bcc_attacher_stub.go index b787b57ea2..5f80da3ab5 100644 --- a/pkg/bpfassets/attacher/bcc_attacher_stub.go +++ b/pkg/bpfassets/attacher/bcc_attacher_stub.go @@ -21,32 +21,49 @@ package attacher import ( "fmt" + + "github.com/sustainable-computing-io/kepler/pkg/config" ) const ( - CPUCycleLable = "cpu_cycles" - CPUInstructionLabel = "cpu_instr" - CacheMissLabel = "cache_miss" + CPUCycleLabel = config.CPUCycle + CPURefCycleLabel = config.CPURefCycle + CPUInstructionLabel = config.CPUInstruction + CacheMissLabel = config.CacheMiss ) type perfCounter struct{} type ModuleStub struct{} +// Table references a BPF table. type Table struct { } + +// TableIterator contains the current position for iteration over a *bcc.Table and provides methods for iteration. type TableIterator struct { leaf []byte + key []byte } +// Iter returns an iterator to list all table entries available as raw bytes. func (table *Table) Iter() *TableIterator { return &TableIterator{} } +// Next looks up the next element and return true if one is available. func (it *TableIterator) Next() bool { return false } +// Key returns the current key value of the iterator, if the most recent call to Next returned true. +// The slice is valid only until the next call to Next. +func (it *TableIterator) Key() []byte { + return it.key +} + +// Leaf returns the current leaf value of the iterator, if the most recent call to Next returned true. +// The slice is valid only until the next call to Next. func (it *TableIterator) Leaf() []byte { return it.leaf } @@ -55,14 +72,14 @@ func (table *Table) DeleteAll() { } type BpfModuleTables struct { - Module ModuleStub - Table *Table - TimeTable *Table + Module ModuleStub + Table *Table + CPUFreqTable *Table } var ( - Counters = map[string]perfCounter{} - EnableCPUFreq = false + Counters = map[string]perfCounter{} + HardwareCountersEnabled = false ) func AttachBPFAssets() (*BpfModuleTables, error) { diff --git a/pkg/bpfassets/attacher/bpf_perf.go b/pkg/bpfassets/attacher/bpf_perf.go index 86f022038c..9c1ff9b8d1 100644 --- a/pkg/bpfassets/attacher/bpf_perf.go +++ b/pkg/bpfassets/attacher/bpf_perf.go @@ -64,7 +64,7 @@ func openPerfEvent(table *bpf.Table, typ, config int) error { leafSize := table.Config()["leaf_size"].(uint64) if keySize != 4 || leafSize != 4 { - return fmt.Errorf("passed table has wrong size") + return fmt.Errorf("passed table has wrong size: key_size=%d, leaf_size=%d", keySize, leafSize) } res := []int{} diff --git a/pkg/bpfassets/perf_event_bindata.go b/pkg/bpfassets/perf_event_bindata.go index c1535ebf42..f9503d0bf7 100644 --- a/pkg/bpfassets/perf_event_bindata.go +++ b/pkg/bpfassets/perf_event_bindata.go @@ -1,8 +1,8 @@ -// Code generated by go-bindata. (@generated) DO NOT EDIT. - -// Package bpfassets generated by go-bindata. +// Code generated by go-bindata. // sources: // bpfassets/perf_event/perf_event.c +// DO NOT EDIT! 
+ package bpfassets import ( @@ -26,32 +26,21 @@ type bindataFileInfo struct { modTime time.Time } -// Name return file name func (fi bindataFileInfo) Name() string { return fi.name } - -// Size return file size func (fi bindataFileInfo) Size() int64 { return fi.size } - -// Mode return file mode func (fi bindataFileInfo) Mode() os.FileMode { return fi.mode } - -// ModTime return file modify time func (fi bindataFileInfo) ModTime() time.Time { return fi.modTime } - -// IsDir return file whether a directory func (fi bindataFileInfo) IsDir() bool { - return fi.mode&os.ModeDir != 0 + return false } - -// Sys return file is sys mode func (fi bindataFileInfo) Sys() interface{} { return nil } @@ -72,174 +61,221 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include +// #include +// #include #ifndef NUM_CPUS #define NUM_CPUS 128 #endif -// we cannot define it dynamically as NUM_CPUS because the golang needs to know this -// size at compiler time for decoding -#define CPU_VECTOR_SIZE 128 +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif -typedef struct switch_args -{ - u64 pad; - char prev_comm[16]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[16]; - int next_pid; - int next_prio; -} switch_args; - -typedef struct process_time_t +#define CPUHZ 1000 + +typedef struct process_metrics_t { u64 cgroup_id; u64 pid; u64 process_run_time; u64 cpu_cycles; u64 cpu_instr; - u64 cache_misses; + u64 cache_miss; char comm[16]; - //u64 pad; - // the max eBPF stack limit is 512 bytes, which is a vector of u16 with 128 elements - // the time is calculated in miliseconds, uint16 max size is 65K, ~1mim - u16 cpu_time[CPU_VECTOR_SIZE]; -} process_time_t; +} process_metrics_t; typedef struct pid_time_t { - int pid; + u32 pid; + u32 cpu; } pid_time_t; -BPF_PERF_OUTPUT(events); - // processes and pid time -BPF_HASH(processes, u64, process_time_t); +BPF_HASH(processes, u64, process_metrics_t); BPF_HASH(pid_time, pid_time_t); // perf counters -BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS); -BPF_PERF_ARRAY(cpu_instr, NUM_CPUS); -BPF_PERF_ARRAY(cache_miss, NUM_CPUS); +BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_cycles, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_instr, u64, NUM_CPUS); -// tracking counters -BPF_ARRAY(prev_cpu_cycles, u64, NUM_CPUS); -BPF_ARRAY(prev_cpu_instr, u64, NUM_CPUS); -BPF_ARRAY(prev_cache_miss, u64, NUM_CPUS); +BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS); +BPF_ARRAY(cache_miss, u64, NUM_CPUS); -static void safe_array_add(u32 idx, u16 *array, u16 value) +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS); + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) { -#pragma clang loop unroll(full) - for (int array_index = 0; array_index < CPU_VECTOR_SIZE-1; array_index++) + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts = pid_time.lookup(&prev_pid_key); + if (prev_ts != 0) { - if (array_index == idx) + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + // But do not return, since the hardware counters can be collected. 
+ if (cur_ts > *prev_ts) { - array[array_index] += value; - break; + cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/ + pid_time.delete(&prev_pid_key); } } + pid_time_t new_pid_key = {.pid = cur_pid}; + pid_time.update(&new_pid_key, &cur_ts); + + return cpu_time; } -int sched_switch(switch_args *ctx) +static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running) { - u64 pid = bpf_get_current_pid_tgid() >> 32; -#ifdef SET_GROUP_ID - u64 cgroup_id = bpf_get_current_cgroup_id(); -#else - u64 cgroup_id = 0; -#endif + if (*running > 0) + return *counter * *enabled / *running; + return *counter; +} - u64 time = bpf_ktime_get_ns(); +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ u64 delta = 0; - u32 cpu_id = bpf_get_smp_processor_id(); - pid_time_t new_pid, old_pid; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} - // get pid time - old_pid.pid = ctx->prev_pid; - u64 *last_time = pid_time.lookup(&old_pid); - if (last_time != 0) +// altough the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - delta = (time - *last_time) / 1000000; /*milisecond*/ - // return if the process did not use any cpu time yet - if (delta == 0) - { - return 0; - } - pid_time.delete(&old_pid); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_cycles.update(cpu_id, &val); } + return delta; +} - new_pid.pid = ctx->next_pid; - pid_time.lookup_or_try_init(&new_pid, &time); +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_ref_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_ref_cycles.update(cpu_id, &val); + } + return delta; +} - u64 cpu_cycles_delta = 0; - u64 cpu_instr_delta = 0; - u64 cache_miss_delta = 0; - u64 *prev; +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_instr.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_instr.update(cpu_id, &val); + } + return delta; +} - u64 val = cpu_cycles.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - prev = prev_cpu_cycles.lookup(&cpu_id); - if (prev) - { - cpu_cycles_delta = val - *prev; - } - prev_cpu_cycles.update(&cpu_id, &val); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cache_miss.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cache_miss.update(cpu_id, &val); } - val = cpu_instr.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 
0) || ((s64)val < -256)) + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq); + if (avg_freq == 0) { - prev = prev_cpu_instr.lookup(&cpu_id); - if (prev) - { - cpu_instr_delta = val - *prev; - } - prev_cpu_instr.update(&cpu_id, &val); + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / CPUHZ; } - val = cache_miss.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) + else { - prev = prev_cache_miss.lookup(&cpu_id); - if (prev) - { - cache_miss_delta = val - *prev; - } - prev_cache_miss.update(&cpu_id, &val); + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / CPUHZ; + avg_freq /= 2; } + return avg_freq; +} - // init process time - struct process_time_t *process_time; - process_time = processes.lookup(&pid); - if (process_time == 0) +// int kprobe__finish_task_switch(switch_args *ctx) +int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) +{ + u64 cur_pid = bpf_get_current_pid_tgid() >> 32; +#ifdef SET_GROUP_ID + u64 cgroup_id = bpf_get_current_cgroup_id(); +#else + u64 cgroup_id = 0; +#endif + + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts); + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = processes.lookup(&cur_pid); + if (process_metrics == 0) { - process_time_t new_process = {}; - new_process.pid = pid; + process_metrics_t new_process = {}; + new_process.pid = cur_pid; new_process.cgroup_id = cgroup_id; - new_process.cpu_cycles = cpu_cycles_delta; - new_process.cpu_instr = cpu_instr_delta; - new_process.cache_misses = cache_miss_delta; - new_process.process_run_time += delta; -#ifdef CPU_FREQ - //FIXME: for certain reason, hyper-v seems to always get a cpu_id that is same as NUM_CPUS and cause stack overrun - safe_array_add(cpu_id, new_process.cpu_time, delta); -#endif + new_process.process_run_time = on_cpu_time_delta; bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); - processes.update(&pid, &new_process); + + new_process.cpu_cycles = on_cpu_cycles_delta; + new_process.cpu_instr = on_cpu_instr_delta; + new_process.cache_miss = on_cpu_cache_miss_delta; + + processes.update(&cur_pid, &new_process); } else { // update process time - process_time->cpu_cycles += cpu_cycles_delta; - process_time->cpu_instr += cpu_instr_delta; - process_time->cache_misses += cache_miss_delta; - process_time->process_run_time += delta; -#ifdef CPU_FREQ - safe_array_add(cpu_id, process_time->cpu_time, delta); -#endif + process_metrics->process_run_time += on_cpu_time_delta; + + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; } return 0; @@ -265,8 +301,8 @@ func bpfassetsPerf_eventPerf_eventC() (*asset, error) { // It returns an error if the asset could not be found or // could not be loaded. 
func Asset(name string) ([]byte, error) { - canonicalName := strings.Replace(name, "\\", "/", -1) - if f, ok := _bindata[canonicalName]; ok { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { a, err := f() if err != nil { return nil, fmt.Errorf("Asset %s can't read by error: %v", name, err) @@ -291,8 +327,8 @@ func MustAsset(name string) []byte { // It returns an error if the asset could not be found or // could not be loaded. func AssetInfo(name string) (os.FileInfo, error) { - canonicalName := strings.Replace(name, "\\", "/", -1) - if f, ok := _bindata[canonicalName]; ok { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { a, err := f() if err != nil { return nil, fmt.Errorf("AssetInfo %s can't read by error: %v", name, err) @@ -327,13 +363,13 @@ var _bindata = map[string]func() (*asset, error){ // b.png // then AssetDir("data") would return []string{"foo.txt", "img"} // AssetDir("data/img") would return []string{"a.png", "b.png"} -// AssetDir("foo.txt") and AssetDir("nonexistent") would return an error +// AssetDir("foo.txt") and AssetDir("notexist") would return an error // AssetDir("") will return []string{"data"}. func AssetDir(name string) ([]string, error) { node := _bintree if len(name) != 0 { - canonicalName := strings.Replace(name, "\\", "/", -1) - pathList := strings.Split(canonicalName, "/") + cannonicalName := strings.Replace(name, "\\", "/", -1) + pathList := strings.Split(cannonicalName, "/") for _, p := range pathList { node = node.Children[p] if node == nil { @@ -407,6 +443,6 @@ func RestoreAssets(dir, name string) error { } func _filePath(dir, name string) string { - canonicalName := strings.Replace(name, "\\", "/", -1) - return filepath.Join(append([]string{dir}, strings.Split(canonicalName, "/")...)...) + cannonicalName := strings.Replace(name, "\\", "/", -1) + return filepath.Join(append([]string{dir}, strings.Split(cannonicalName, "/")...)...) }
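
Note (editorial, not part of the patches): the frequency estimate introduced in perf_event.c relies on cpu_cycles advancing at the effective clock while cpu_ref_cycles advance at the reference (base) clock, so their ratio times the base figure gives the effective frequency. The plain-Go sketch below only reproduces that arithmetic for illustration; the counter deltas are made up, the helper names (normalize, estimateFreq) are illustrative rather than taken from the patch, and CPU_REF_FREQ=2500 and KHZ=1000 are the defaults defined in the BPF program.

    // freq_sketch.go: user-space illustration of the frequency arithmetic in perf_event.c.
    package main

    import "fmt"

    const (
        cpuRefFreq = 2500 // default -DCPU_REF_FREQ in perf_event.c
        khz        = 1000 // KHZ divisor in perf_event.c
    )

    // normalize mirrors the BPF helper: scale a multiplexed perf counter by the
    // ratio of time the event was enabled vs. actually running on the PMU.
    func normalize(counter, enabled, running uint64) uint64 {
        if running > 0 {
            return counter * enabled / running
        }
        return counter
    }

    // estimateFreq mirrors the first-sample case of get_on_cpu_avg_freq:
    // (cycles / ref cycles) scales the reference frequency to the effective one.
    func estimateFreq(cyclesDelta, refCyclesDelta uint64) uint64 {
        if refCyclesDelta == 0 {
            return 0
        }
        return (cyclesDelta * cpuRefFreq / refCyclesDelta) / khz
    }

    func main() {
        // Hypothetical deltas between two task switches on one CPU.
        cycles := normalize(3_000_000_000, 10_000_000, 10_000_000)   // counted the whole interval
        refCycles := normalize(1_250_000_000, 10_000_000, 5_000_000) // multiplexed half the time, so doubled to 2.5e9

        // (3e9 * 2500 / 2.5e9) / 1000 = 3: the cycles/ref-cycles ratio (1.2)
        // applied to the reference figure, then scaled down by KHZ, which is
        // the value the BPF program stores into cpu_freq_array for this CPU.
        fmt.Println("estimated frequency:", estimateFreq(cycles, refCycles))
    }

On later samples the BPF code keeps a running average by adding the new estimate to the stored one and halving the sum, which is why cpu_freq_array is read as a per-CPU value rather than recomputed in user space.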