From b181ec3fe8265df6ce19f9ba832fd4c1ce2fd7a5 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 23:35:38 +0900 Subject: [PATCH 1/3] acpi: update to only read cpu freq files when BPF is disabled Signed-off-by: Marcelo Amaral --- pkg/power/acpi/acpi.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pkg/power/acpi/acpi.go b/pkg/power/acpi/acpi.go index 6ee73a398e..2f29bb8378 100644 --- a/pkg/power/acpi/acpi.go +++ b/pkg/power/acpi/acpi.go @@ -64,20 +64,23 @@ func NewACPIPowerMeter() *ACPI { return acpi } -func (a *ACPI) Run() { +func (a *ACPI) Run(isEBPFEnabled bool) { go func() { for { select { case <-a.stopChannel: return default: - cpuCoreFrequency := getCPUCoreFrequency() - a.mu.Lock() - for cpu, freq := range cpuCoreFrequency { - // average cpu frequency - a.cpuCoreFrequency[cpu] = (cpuCoreFrequency[cpu] + freq) / 2 + // in the case we cannot collect the cpu frequency using EBPF + if !isEBPFEnabled { + cpuCoreFrequency := getCPUCoreFrequency() + a.mu.Lock() + for cpu, freq := range cpuCoreFrequency { + // average cpu frequency + a.cpuCoreFrequency[cpu] = (cpuCoreFrequency[cpu] + freq) / 2 + } + a.mu.Unlock() } - a.mu.Unlock() if a.collectEnergy { if sensorPower, err := getPowerFromSensor(); err == nil { @@ -92,6 +95,11 @@ func (a *ACPI) Run() { } } + // stop the gorotime if there is nothing to do + if (isEBPFEnabled) && (!a.collectEnergy) { + return + } + time.Sleep(poolingInterval) } } From 0cadb2ee70255d06a410c1e3fb6b7c99ddb22d64 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 23:35:40 +0900 Subject: [PATCH 2/3] collector: remove expensive cpu freq calculation Signed-off-by: Marcelo Amaral --- pkg/collector/container_hc_collector.go | 62 +------------------- pkg/collector/container_hc_collector_test.go | 25 +------- pkg/collector/metric/container_metric.go | 24 +++----- pkg/collector/metric/stats.go | 2 - pkg/collector/metric/utils.go | 3 - pkg/collector/metric/utils_test.go | 23 ++------ pkg/collector/metric_collector.go | 2 +- pkg/collector/node_energy_collector.go | 14 +++++ pkg/collector/prometheus_collector.go | 22 ++++--- pkg/config/types.go | 1 + 10 files changed, 40 insertions(+), 138 deletions(-) diff --git a/pkg/collector/container_hc_collector.go b/pkg/collector/container_hc_collector.go index 710381cc95..2f1bb7eaa3 100644 --- a/pkg/collector/container_hc_collector.go +++ b/pkg/collector/container_hc_collector.go @@ -42,13 +42,11 @@ type ProcessBPFMetrics struct { CPUInstr uint64 CacheMisses uint64 Command [16]byte - CPUTime [C.CPU_VECTOR_SIZE]uint16 } // resetBPFTables reset BPF module's tables func (c *Collector) resetBPFTables() { c.bpfHCMeter.Table.DeleteAll() - c.bpfHCMeter.TimeTable.DeleteAll() } // updateBPFMetrics reads the BPF tables with process/pid/cgroupid metrics (CPU time, available HW counters) @@ -82,29 +80,14 @@ func (c *Collector) updateBPFMetrics() { c.ContainersMetrics[containerID].SetLatestProcess(ct.CGroupID, ct.PID, C.GoString(comm)) } - var activeCPUs []int32 - var avgFreq float64 - var totalCPUTime uint64 - if attacher.EnableCPUFreq { - avgFreq, totalCPUTime, activeCPUs = getAVGCPUFreqAndTotalCPUTime(c.NodeMetrics.CPUFrequency, &ct.CPUTime) - c.ContainersMetrics[containerID].AvgCPUFreq = avgFreq - } else { - totalCPUTime = ct.ProcessRunTime - activeCPUs = getActiveCPUs(&ct.CPUTime) - } - - for _, cpu := range activeCPUs { - c.ContainersMetrics[containerID].CurrCPUTimePerCPU[uint32(cpu)] += uint64(ct.CPUTime[cpu]) - } - - if err = 
c.ContainersMetrics[containerID].CPUTime.AddNewCurr(totalCPUTime); err != nil { + if err = c.ContainersMetrics[containerID].CPUTime.AddNewCurr(ct.ProcessRunTime); err != nil { klog.V(5).Infoln(err) } for _, counterKey := range collector_metric.AvailableCounters { var val uint64 switch counterKey { - case attacher.CPUCycleLable: + case attacher.CPUCycleLabel: val = ct.CPUCycles case attacher.CPUInstructionLabel: val = ct.CPUInstr @@ -136,47 +119,6 @@ func (c *Collector) updateBPFMetrics() { c.handleInactiveContainers(foundContainer) } -// getAVGCPUFreqAndTotalCPUTime calculates the weighted cpu frequency average -func getAVGCPUFreqAndTotalCPUTime(cpuFrequency map[int32]uint64, cpuTime *[C.CPU_VECTOR_SIZE]uint16) (avgFreq float64, totalCPUTime uint64, activeCPUs []int32) { - totalFreq := float64(0) - totalFreqWithoutWeight := float64(0) - for cpu, freq := range cpuFrequency { - if int(cpu) > len((*cpuTime))-1 { - break - } - totalCPUTime += uint64(cpuTime[cpu]) - totalFreqWithoutWeight += float64(freq) - } - if totalCPUTime == 0 { - if len(cpuFrequency) == 0 { - return - } - avgFreq = totalFreqWithoutWeight / float64(len(cpuFrequency)) - } else { - for cpu, freq := range cpuFrequency { - if int(cpu) > len((*cpuTime))-1 { - break - } - if cpuTime[cpu] != 0 { - totalFreq += float64(freq) * (float64(cpuTime[cpu]) / float64(totalCPUTime)) - activeCPUs = append(activeCPUs, cpu) - } - } - avgFreq = totalFreqWithoutWeight / float64(len(cpuFrequency)) - } - return -} - -// getActiveCPUs returns active cpu(vcpu) (in case that frequency is not active) -func getActiveCPUs(cpuTime *[C.CPU_VECTOR_SIZE]uint16) (activeCPUs []int32) { - for cpu := range cpuTime { - if cpuTime[cpu] != 0 { - activeCPUs = append(activeCPUs, int32(cpu)) - } - } - return -} - // handleInactiveContainers func (c *Collector) handleInactiveContainers(foundContainer map[string]bool) { numOfInactive := len(c.ContainersMetrics) - len(foundContainer) diff --git a/pkg/collector/container_hc_collector_test.go b/pkg/collector/container_hc_collector_test.go index d62bbd42cb..d07ddfa74c 100644 --- a/pkg/collector/container_hc_collector_test.go +++ b/pkg/collector/container_hc_collector_test.go @@ -2,30 +2,7 @@ package collector import ( . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" ) -var _ = Describe("Test for active containers", func() { - It("Get Active CPUs", func() { - var cpuTime [128]uint16 - cpuTime[3] = 1 - cpuTime[5] = 1 - cpuTime[12] = 1 - ac := getActiveCPUs(&cpuTime) - Expect(3).To(Equal(len(ac))) - Expect(int32(3)).To(Equal(ac[0])) - Expect(int32(5)).To(Equal(ac[1])) - Expect(int32(12)).To(Equal(ac[2])) - }) - - It("getAVGCPUFreqAndTotalCPUTime", func() { - var cpuTime [128]uint16 - cpuTime[3] = 1 - cpuFrequency := make(map[int32]uint64) - cpuFrequency[int32(3)] = 1 - avgFreq, totalCPUTime, activeCPUs := getAVGCPUFreqAndTotalCPUTime(cpuFrequency, &cpuTime) - Expect(float64(1)).To(Equal(avgFreq)) - Expect(uint64(1)).To(Equal(totalCPUTime)) - Expect(int32(3)).To(Equal(activeCPUs[0])) - }) +var _ = Describe("Test hc collector", func() { }) diff --git a/pkg/collector/metric/container_metric.go b/pkg/collector/metric/container_metric.go index 49122276e8..c27724fad9 100644 --- a/pkg/collector/metric/container_metric.go +++ b/pkg/collector/metric/container_metric.go @@ -48,7 +48,6 @@ type ContainerMetrics struct { // TODO: we should consider deprecate the command information Command string - AvgCPUFreq float64 CurrProcesses int Disks int @@ -62,8 +61,6 @@ type ContainerMetrics struct { BytesRead *UInt64StatCollection BytesWrite *UInt64StatCollection - CurrCPUTimePerCPU map[uint32]uint64 - EnergyInCore *UInt64Stat EnergyInDRAM *UInt64Stat EnergyInUncore *UInt64Stat @@ -90,14 +87,13 @@ func NewContainerMetrics(containerName, podName, podNamespace string) *Container BytesWrite: &UInt64StatCollection{ Stat: make(map[string]*UInt64Stat), }, - CurrCPUTimePerCPU: make(map[uint32]uint64), - EnergyInCore: &UInt64Stat{}, - EnergyInDRAM: &UInt64Stat{}, - EnergyInUncore: &UInt64Stat{}, - EnergyInPkg: &UInt64Stat{}, - EnergyInOther: &UInt64Stat{}, - EnergyInGPU: &UInt64Stat{}, - DynEnergy: &UInt64Stat{}, + EnergyInCore: &UInt64Stat{}, + EnergyInDRAM: &UInt64Stat{}, + EnergyInUncore: &UInt64Stat{}, + EnergyInPkg: &UInt64Stat{}, + EnergyInOther: &UInt64Stat{}, + EnergyInGPU: &UInt64Stat{}, + DynEnergy: &UInt64Stat{}, } for _, metricName := range AvailableCounters { c.CounterStats[metricName] = &UInt64Stat{} @@ -133,7 +129,6 @@ func (c *ContainerMetrics) ResetCurr() { for kubeletKey := range c.KubeletStats { c.KubeletStats[kubeletKey].ResetCurr() } - c.CurrCPUTimePerCPU = make(map[uint32]uint64) c.EnergyInCore.ResetCurr() c.EnergyInDRAM.ResetCurr() c.EnergyInUncore.ResetCurr() @@ -237,9 +232,6 @@ func (c *ContainerMetrics) ToPrometheusValue(metric string) string { if metric == "block_devices_used" { return strconv.FormatUint(uint64(c.Disks), 10) } - if metric == "avg_cpu_frequency" { - return fmt.Sprintf("%f", c.AvgCPUFreq) - } if curr, aggr, err := c.getFloatCurrAndAggrValue(metric); err == nil { if currentValue { return fmt.Sprintf("%f", curr) @@ -285,7 +277,6 @@ func (c *ContainerMetrics) String() string { "\tcgrouppid: %d pid: %d comm: %s\n"+ "\tePkg (mJ): %s (eCore: %s eDram: %s eUncore: %s) eGPU (mJ): %s eOther (mJ): %s \n"+ "\teDyn (mJ): %s \n"+ - "\tavgFreq: %.2f\n"+ "\tCPUTime: %d (%d)\n"+ "\tcounters: %v\n"+ "\tcgroupfs: %v\n"+ @@ -294,7 +285,6 @@ func (c *ContainerMetrics) String() string { c.CGroupPID, c.PIDS, c.Command, c.EnergyInPkg, c.EnergyInCore, c.EnergyInDRAM, c.EnergyInUncore, c.EnergyInGPU, c.EnergyInOther, c.DynEnergy, - c.AvgCPUFreq/1000, /*MHZ*/ c.CPUTime.Curr, c.CPUTime.Aggr, c.CounterStats, c.CgroupFSStats, diff --git a/pkg/collector/metric/stats.go b/pkg/collector/metric/stats.go index 25293094c3..db0885059f 100644 
--- a/pkg/collector/metric/stats.go +++ b/pkg/collector/metric/stats.go @@ -12,8 +12,6 @@ import ( ) const ( - freqMetricLabel = config.CPUFrequency - // TO-DO: merge to cgroup stat ByteReadLabel = config.BytesReadIO ByteWriteLabel = config.BytesWriteIO diff --git a/pkg/collector/metric/utils.go b/pkg/collector/metric/utils.go index e75aa1f3e1..6f620feec9 100644 --- a/pkg/collector/metric/utils.go +++ b/pkg/collector/metric/utils.go @@ -59,9 +59,6 @@ func getPrometheusMetrics() []string { for _, feature := range ContainerFeaturesNames { labels = append(labels, CurrPrefix+feature, AggrPrefix+feature) } - if attacher.EnableCPUFreq { - labels = append(labels, freqMetricLabel) - } // TO-DO: remove this hard code metric labels = append(labels, blockDeviceLabel) return labels diff --git a/pkg/collector/metric/utils_test.go b/pkg/collector/metric/utils_test.go index 41f381eda9..3a95277959 100644 --- a/pkg/collector/metric/utils_test.go +++ b/pkg/collector/metric/utils_test.go @@ -4,7 +4,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "github.com/sustainable-computing-io/kepler/pkg/bpfassets/attacher" + "github.com/sustainable-computing-io/kepler/pkg/config" ) func clearPlatformDependentAvailability() { @@ -33,9 +33,6 @@ var _ = Describe("Test Metric Unit", func() { clearPlatformDependentAvailability() exp := []string{"curr_cpu_time", "total_cpu_time", "curr_bytes_read", "total_bytes_read", "curr_bytes_writes", "total_bytes_writes", "block_devices_used"} - if attacher.EnableCPUFreq { - exp = []string{"curr_cpu_time", "total_cpu_time", "curr_bytes_read", "total_bytes_read", "curr_bytes_writes", "total_bytes_writes", "avg_cpu_frequency", "block_devices_used"} - } cur := getPrometheusMetrics() Expect(exp).To(Equal(cur)) }) @@ -61,22 +58,10 @@ var _ = Describe("Test Metric Unit", func() { }) It("Test setEnabledMetrics", func() { + config.ExposeHardwareCounterMetrics = false clearPlatformDependentAvailability() cur := setEnabledMetrics() - - if attacher.EnableCPUFreq { - expMap := make(map[string]int, 7) - exp := []string{"cpu_time", "cpu_instr", "cache_miss", "cpu_cycles", "bytes_read", "bytes_writes", "block_devices_used"} - for _, v := range exp { - expMap[v] = 1 - } - for _, vCur := range cur { - _, ok := expMap[vCur] - Expect(ok).To(BeTrue()) - } - } else { - exp := []string{"cpu_time", "bytes_read", "bytes_writes", "block_devices_used"} - Expect(exp).To(Equal(cur)) - } + exp := []string{"cpu_time", "bytes_read", "bytes_writes", "block_devices_used"} + Expect(exp).To(Equal(cur)) }) }) diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index edc958b367..8b18e4df46 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -80,7 +80,7 @@ func (c *Collector) Initialize() error { c.prePopulateContainerMetrics(pods) c.updateNodeEnergyMetrics() - c.acpiPowerMeter.Run() + c.acpiPowerMeter.Run(attacher.HardwareCountersEnabled) c.resetBPFTables() return nil diff --git a/pkg/collector/node_energy_collector.go b/pkg/collector/node_energy_collector.go index 36fe6b5cd5..33b6f58b13 100644 --- a/pkg/collector/node_energy_collector.go +++ b/pkg/collector/node_energy_collector.go @@ -17,6 +17,9 @@ limitations under the License. 
package collector import ( + "encoding/binary" + + "github.com/sustainable-computing-io/kepler/pkg/bpfassets/attacher" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/model" "github.com/sustainable-computing-io/kepler/pkg/power/accelerator" @@ -69,6 +72,17 @@ func (c *Collector) updateNodeGPUEnergy() { // updateNodeAvgCPUFrequency updates the average CPU frequency in each core func (c *Collector) updateNodeAvgCPUFrequency() { + // update the cpu frequency using hardware counters when available because reading files can be very expensive + if attacher.HardwareCountersEnabled { + cpuFreq := map[int32]uint64{} + for it := c.bpfHCMeter.CPUFreqTable.Iter(); it.Next(); { + cpu := int32(binary.LittleEndian.Uint32(it.Key())) + freq := uint64(binary.LittleEndian.Uint32(it.Leaf())) + cpuFreq[cpu] = freq + } + c.NodeMetrics.CPUFrequency = cpuFreq + return + } c.NodeMetrics.CPUFrequency = c.acpiPowerMeter.GetCPUCoreFrequency() } diff --git a/pkg/collector/prometheus_collector.go b/pkg/collector/prometheus_collector.go index f05388b129..1fdc824e04 100644 --- a/pkg/collector/prometheus_collector.go +++ b/pkg/collector/prometheus_collector.go @@ -350,8 +350,8 @@ func (p *PrometheusCollector) newContainerMetrics() { // Additional metrics (gauge) containerCPUTime := prometheus.NewDesc( prometheus.BuildFQName(namespace, "container", "cpu_cpu_time_us"), - "Current CPU time per CPU", - []string{"pod_name", "container_name", "container_namespace", "cpu"}, nil, + "Aggregated CPU time", + []string{"pod_name", "container_name", "container_namespace"}, nil, ) p.containerDesc = &ContainerDesc{ @@ -524,14 +524,12 @@ func (p *PrometheusCollector) UpdatePodMetrics(wg *sync.WaitGroup, ch chan<- pro float64(container.Curr()), podEnergyStatusLabelValues..., ) - for cpu, cpuTime := range container.CurrCPUTimePerCPU { - ch <- prometheus.MustNewConstMetric( - p.containerDesc.containerCPUTime, - prometheus.GaugeValue, - float64(cpuTime), - container.PodName, container.ContainerName, container.Namespace, strconv.Itoa(int(cpu)), - ) - } + ch <- prometheus.MustNewConstMetric( + p.containerDesc.containerCPUTime, + prometheus.CounterValue, + float64(container.CPUTime.Aggr), + container.PodName, container.ContainerName, container.Namespace, + ) ch <- prometheus.MustNewConstMetric( p.containerDesc.containerCoreJoulesTotal, prometheus.CounterValue, @@ -581,11 +579,11 @@ func (p *PrometheusCollector) UpdatePodMetrics(wg *sync.WaitGroup, ch chan<- pro container.PodName, container.ContainerName, container.Namespace, containerCommand, ) if collector_metric.CPUHardwareCounterEnabled { - if container.CounterStats[attacher.CPUCycleLable] != nil { + if container.CounterStats[attacher.CPUCycleLabel] != nil { ch <- prometheus.MustNewConstMetric( p.containerDesc.containerCPUCyclesTotal, prometheus.CounterValue, - float64(container.CounterStats[attacher.CPUCycleLable].Aggr), + float64(container.CounterStats[attacher.CPUCycleLabel].Aggr), container.PodName, container.ContainerName, container.Namespace, containerCommand, ) } diff --git a/pkg/config/types.go b/pkg/config/types.go index b74f0884f0..4af1c3ad48 100644 --- a/pkg/config/types.go +++ b/pkg/config/types.go @@ -19,6 +19,7 @@ package config const ( // counter - attacher package CPUCycle = "cpu_cycles" + CPURefCycle = "cpu_ref_cycles" CPUInstruction = "cpu_instr" CacheMiss = "cache_miss" From ba931d323bb3be5d2d02240b090c81b98091ec54 Mon Sep 17 00:00:00 2001 From: Marcelo Amaral Date: Wed, 14 Dec 2022 
23:35:40 +0900 Subject: [PATCH 3/3] bpfassets: add support to collect cpu frequency with BPF Signed-off-by: Marcelo Amaral --- bpfassets/perf_event/perf_event.c | 275 ++++++++++------- pkg/bpfassets/attacher/bcc_attacher.go | 50 ++-- pkg/bpfassets/attacher/bcc_attacher_stub.go | 33 ++- pkg/bpfassets/attacher/bpf_perf.go | 2 +- pkg/bpfassets/perf_event_bindata.go | 312 +++++++++++--------- 5 files changed, 386 insertions(+), 286 deletions(-) diff --git a/bpfassets/perf_event/perf_event.c b/bpfassets/perf_event/perf_event.c index 4899f5c31d..b702b838ad 100644 --- a/bpfassets/perf_event/perf_event.c +++ b/bpfassets/perf_event/perf_event.c @@ -14,174 +14,221 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include +// #include +// #include #ifndef NUM_CPUS #define NUM_CPUS 128 #endif -// we cannot define it dynamically as NUM_CPUS because the golang needs to know this -// size at compiler time for decoding -#define CPU_VECTOR_SIZE 128 +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif -typedef struct switch_args -{ - u64 pad; - char prev_comm[16]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[16]; - int next_pid; - int next_prio; -} switch_args; - -typedef struct process_time_t +#define KHZ 1000 + +typedef struct process_metrics_t { u64 cgroup_id; u64 pid; u64 process_run_time; u64 cpu_cycles; u64 cpu_instr; - u64 cache_misses; + u64 cache_miss; char comm[16]; - //u64 pad; - // the max eBPF stack limit is 512 bytes, which is a vector of u16 with 128 elements - // the time is calculated in miliseconds, uint16 max size is 65K, ~1mim - u16 cpu_time[CPU_VECTOR_SIZE]; -} process_time_t; +} process_metrics_t; typedef struct pid_time_t { - int pid; + u32 pid; + u32 cpu; } pid_time_t; -BPF_PERF_OUTPUT(events); - // processes and pid time -BPF_HASH(processes, u64, process_time_t); +BPF_HASH(processes, u64, process_metrics_t); BPF_HASH(pid_time, pid_time_t); // perf counters -BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS); -BPF_PERF_ARRAY(cpu_instr, NUM_CPUS); -BPF_PERF_ARRAY(cache_miss, NUM_CPUS); +BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_cycles, u64, NUM_CPUS); -// tracking counters -BPF_ARRAY(prev_cpu_cycles, u64, NUM_CPUS); -BPF_ARRAY(prev_cpu_instr, u64, NUM_CPUS); -BPF_ARRAY(prev_cache_miss, u64, NUM_CPUS); +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS); -static void safe_array_add(u32 idx, u16 *array, u16 value) +BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_instr, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS); +BPF_ARRAY(cache_miss, u64, NUM_CPUS); + +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS); + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) { -#pragma clang loop unroll(full) - for (int array_index = 0; array_index < CPU_VECTOR_SIZE-1; array_index++) + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts = pid_time.lookup(&prev_pid_key); + if (prev_ts != 0) { - if (array_index == idx) + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + // But do not return, since the hardware counters can be collected. 
+ if (cur_ts > *prev_ts) { - array[array_index] += value; - break; + cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/ + pid_time.delete(&prev_pid_key); } } + pid_time_t new_pid_key = {.pid = cur_pid}; + pid_time.update(&new_pid_key, &cur_ts); + + return cpu_time; } -int sched_switch(switch_args *ctx) +static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running) { - u64 pid = bpf_get_current_pid_tgid() >> 32; -#ifdef SET_GROUP_ID - u64 cgroup_id = bpf_get_current_cgroup_id(); -#else - u64 cgroup_id = 0; -#endif + if (*running > 0) + return *counter * *enabled / *running; + return *counter; +} - u64 time = bpf_ktime_get_ns(); +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ u64 delta = 0; - u32 cpu_id = bpf_get_smp_processor_id(); - pid_time_t new_pid, old_pid; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} - // get pid time - old_pid.pid = ctx->prev_pid; - u64 *last_time = pid_time.lookup(&old_pid); - if (last_time != 0) +// altough the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - delta = (time - *last_time) / 1000000; /*milisecond*/ - // return if the process did not use any cpu time yet - if (delta == 0) - { - return 0; - } - pid_time.delete(&old_pid); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_cycles.update(cpu_id, &val); } + return delta; +} - new_pid.pid = ctx->next_pid; - pid_time.lookup_or_try_init(&new_pid, &time); +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_ref_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_ref_cycles.update(cpu_id, &val); + } + return delta; +} - u64 cpu_cycles_delta = 0; - u64 cpu_instr_delta = 0; - u64 cache_miss_delta = 0; - u64 *prev; +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_instr.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_instr.update(cpu_id, &val); + } + return delta; +} - u64 val = cpu_cycles.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - prev = prev_cpu_cycles.lookup(&cpu_id); - if (prev) - { - cpu_cycles_delta = val - *prev; - } - prev_cpu_cycles.update(&cpu_id, &val); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cache_miss.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cache_miss.update(cpu_id, &val); } - val = cpu_instr.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 
0) || ((s64)val < -256)) + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq); + if (avg_freq == 0) { - prev = prev_cpu_instr.lookup(&cpu_id); - if (prev) - { - cpu_instr_delta = val - *prev; - } - prev_cpu_instr.update(&cpu_id, &val); + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ; } - val = cache_miss.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) + else { - prev = prev_cache_miss.lookup(&cpu_id); - if (prev) - { - cache_miss_delta = val - *prev; - } - prev_cache_miss.update(&cpu_id, &val); + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / KHZ; + avg_freq /= 2; } + return avg_freq; +} - // init process time - struct process_time_t *process_time; - process_time = processes.lookup(&pid); - if (process_time == 0) +// int kprobe__finish_task_switch(switch_args *ctx) +int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) +{ + u64 cur_pid = bpf_get_current_pid_tgid() >> 32; +#ifdef SET_GROUP_ID + u64 cgroup_id = bpf_get_current_cgroup_id(); +#else + u64 cgroup_id = 0; +#endif + + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts); + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = processes.lookup(&cur_pid); + if (process_metrics == 0) { - process_time_t new_process = {}; - new_process.pid = pid; + process_metrics_t new_process = {}; + new_process.pid = cur_pid; new_process.cgroup_id = cgroup_id; - new_process.cpu_cycles = cpu_cycles_delta; - new_process.cpu_instr = cpu_instr_delta; - new_process.cache_misses = cache_miss_delta; - new_process.process_run_time += delta; -#ifdef CPU_FREQ - //FIXME: for certain reason, hyper-v seems to always get a cpu_id that is same as NUM_CPUS and cause stack overrun - safe_array_add(cpu_id, new_process.cpu_time, delta); -#endif + new_process.process_run_time = on_cpu_time_delta; bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); - processes.update(&pid, &new_process); + + new_process.cpu_cycles = on_cpu_cycles_delta; + new_process.cpu_instr = on_cpu_instr_delta; + new_process.cache_miss = on_cpu_cache_miss_delta; + + processes.update(&cur_pid, &new_process); } else { // update process time - process_time->cpu_cycles += cpu_cycles_delta; - process_time->cpu_instr += cpu_instr_delta; - process_time->cache_misses += cache_miss_delta; - process_time->process_run_time += delta; -#ifdef CPU_FREQ - safe_array_add(cpu_id, process_time->cpu_time, delta); -#endif + process_metrics->process_run_time += on_cpu_time_delta; + + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; } return 0; diff --git a/pkg/bpfassets/attacher/bcc_attacher.go b/pkg/bpfassets/attacher/bcc_attacher.go index ffbc561f82..5f4b4ac17d 100644 --- a/pkg/bpfassets/attacher/bcc_attacher.go +++ 
b/pkg/bpfassets/attacher/bcc_attacher.go @@ -41,24 +41,27 @@ type perfCounter struct { } type BpfModuleTables struct { - Module *bpf.Module - Table *bpf.Table - TimeTable *bpf.Table + Module *bpf.Module + Table *bpf.Table + CPUFreqTable *bpf.Table } const ( - CPUCycleLable = config.CPUCycle + CPUCycleLabel = config.CPUCycle + CPURefCycleLabel = config.CPURefCycle CPUInstructionLabel = config.CPUInstruction CacheMissLabel = config.CacheMiss + bpfPerfArrayPrefix = "_hc_reader" ) var ( Counters = map[string]perfCounter{ - CPUCycleLable: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CPU_CYCLES, true}, + CPUCycleLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CPU_CYCLES, true}, + CPURefCycleLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_REF_CPU_CYCLES, true}, CPUInstructionLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_INSTRUCTIONS, true}, CacheMissLabel: {unix.PERF_TYPE_HARDWARE, unix.PERF_COUNT_HW_CACHE_MISSES, true}, } - EnableCPUFreq = true + HardwareCountersEnabled = false ) func loadModule(objProg []byte, options []string) (m *bpf.Module, err error) { @@ -70,24 +73,28 @@ func loadModule(objProg []byte, options []string) (m *bpf.Module, err error) { }() m = bpf.NewModule(string(objProg), options) // TODO make all entrypoints yaml-declarable - sched_switch, err := m.LoadTracepoint("sched_switch") + ftswitch, err := m.LoadKprobe("kprobe__finish_task_switch") if err != nil { - return nil, fmt.Errorf("failed to load sched_switch: %s", err) + return nil, fmt.Errorf("failed to load kprobe__finish_task_switch: %s", err) } - err = m.AttachTracepoint("sched:sched_switch", sched_switch) + err = m.AttachKprobe("finish_task_switch", ftswitch, -1) if err != nil { - return nil, fmt.Errorf("failed to attach sched_switch: %s", err) + err = m.AttachKprobe("finish_task_switch.isra.0", ftswitch, -1) + if err != nil { + return nil, fmt.Errorf("failed to attach finish_task_switch: %s", err) + } } for arrayName, counter := range Counters { - t := bpf.NewTable(m.TableId(arrayName), m) + bpfPerfArrayName := arrayName + bpfPerfArrayPrefix + t := bpf.NewTable(m.TableId(bpfPerfArrayName), m) if t == nil { - return nil, fmt.Errorf("failed to find perf array: %s", arrayName) + return nil, fmt.Errorf("failed to find perf array: %s", bpfPerfArrayName) } perfErr := openPerfEvent(t, counter.evType, counter.evConfig) if perfErr != nil { // some hypervisors don't expose perf counters - klog.Infof("failed to attach perf event %s: %v\n", arrayName, perfErr) + klog.Infof("failed to attach perf event %s: %v\n", bpfPerfArrayName, perfErr) counter.enabled = false } } @@ -103,30 +110,23 @@ func AttachBPFAssets() (*BpfModuleTables, error) { } options := []string{ "-DNUM_CPUS=" + strconv.Itoa(runtime.NumCPU()), - "-DCPU_FREQ", } if config.EnabledEBPFCgroupID { options = append(options, "-DSET_GROUP_ID") } + // TODO: verify if ebpf can run in the VM without hardware counter support, if not, we can disable the HC part and only collect the cpu time m, err := loadModule(objProg, options) if err != nil { - klog.Warningf("failed to attach perf module with options %v: %v, Hardware counter related metrics does not exist\n", options, err) - options = []string{"-DNUM_CPUS=" + strconv.Itoa(runtime.NumCPU())} - EnableCPUFreq = false - m, err = loadModule(objProg, options) - if err != nil { - klog.Infof("failed to attach perf module with options %v: %v, not able to load eBPF modules\n", options, err) - // at this time, there is not much we can do with the eBPF module - return nil, err - } + klog.Infof("failed to attach perf module 
with options %v: %v, not able to load eBPF modules\n", options, err) + return nil, err } table := bpf.NewTable(m.TableId("processes"), m) - timeTable := bpf.NewTable(m.TableId("pid_time"), m) + cpuFreqTable := bpf.NewTable(m.TableId("cpu_freq_array"), m) bpfModules.Module = m bpfModules.Table = table - bpfModules.TimeTable = timeTable + bpfModules.CPUFreqTable = cpuFreqTable klog.Infof("Successfully load eBPF module with option: %s", options) diff --git a/pkg/bpfassets/attacher/bcc_attacher_stub.go b/pkg/bpfassets/attacher/bcc_attacher_stub.go index b787b57ea2..5f80da3ab5 100644 --- a/pkg/bpfassets/attacher/bcc_attacher_stub.go +++ b/pkg/bpfassets/attacher/bcc_attacher_stub.go @@ -21,32 +21,49 @@ package attacher import ( "fmt" + + "github.com/sustainable-computing-io/kepler/pkg/config" ) const ( - CPUCycleLable = "cpu_cycles" - CPUInstructionLabel = "cpu_instr" - CacheMissLabel = "cache_miss" + CPUCycleLabel = config.CPUCycle + CPURefCycleLabel = config.CPURefCycle + CPUInstructionLabel = config.CPUInstruction + CacheMissLabel = config.CacheMiss ) type perfCounter struct{} type ModuleStub struct{} +// Table references a BPF table. type Table struct { } + +// TableIterator contains the current position for iteration over a *bcc.Table and provides methods for iteration. type TableIterator struct { leaf []byte + key []byte } +// Iter returns an iterator to list all table entries available as raw bytes. func (table *Table) Iter() *TableIterator { return &TableIterator{} } +// Next looks up the next element and return true if one is available. func (it *TableIterator) Next() bool { return false } +// Key returns the current key value of the iterator, if the most recent call to Next returned true. +// The slice is valid only until the next call to Next. +func (it *TableIterator) Key() []byte { + return it.key +} + +// Leaf returns the current leaf value of the iterator, if the most recent call to Next returned true. +// The slice is valid only until the next call to Next. func (it *TableIterator) Leaf() []byte { return it.leaf } @@ -55,14 +72,14 @@ func (table *Table) DeleteAll() { } type BpfModuleTables struct { - Module ModuleStub - Table *Table - TimeTable *Table + Module ModuleStub + Table *Table + CPUFreqTable *Table } var ( - Counters = map[string]perfCounter{} - EnableCPUFreq = false + Counters = map[string]perfCounter{} + HardwareCountersEnabled = false ) func AttachBPFAssets() (*BpfModuleTables, error) { diff --git a/pkg/bpfassets/attacher/bpf_perf.go b/pkg/bpfassets/attacher/bpf_perf.go index 86f022038c..9c1ff9b8d1 100644 --- a/pkg/bpfassets/attacher/bpf_perf.go +++ b/pkg/bpfassets/attacher/bpf_perf.go @@ -64,7 +64,7 @@ func openPerfEvent(table *bpf.Table, typ, config int) error { leafSize := table.Config()["leaf_size"].(uint64) if keySize != 4 || leafSize != 4 { - return fmt.Errorf("passed table has wrong size") + return fmt.Errorf("passed table has wrong size: key_size=%d, leaf_size=%d", keySize, leafSize) } res := []int{} diff --git a/pkg/bpfassets/perf_event_bindata.go b/pkg/bpfassets/perf_event_bindata.go index c1535ebf42..f9503d0bf7 100644 --- a/pkg/bpfassets/perf_event_bindata.go +++ b/pkg/bpfassets/perf_event_bindata.go @@ -1,8 +1,8 @@ -// Code generated by go-bindata. (@generated) DO NOT EDIT. - -// Package bpfassets generated by go-bindata. +// Code generated by go-bindata. // sources: // bpfassets/perf_event/perf_event.c +// DO NOT EDIT! 
+ package bpfassets import ( @@ -26,32 +26,21 @@ type bindataFileInfo struct { modTime time.Time } -// Name return file name func (fi bindataFileInfo) Name() string { return fi.name } - -// Size return file size func (fi bindataFileInfo) Size() int64 { return fi.size } - -// Mode return file mode func (fi bindataFileInfo) Mode() os.FileMode { return fi.mode } - -// ModTime return file modify time func (fi bindataFileInfo) ModTime() time.Time { return fi.modTime } - -// IsDir return file whether a directory func (fi bindataFileInfo) IsDir() bool { - return fi.mode&os.ModeDir != 0 + return false } - -// Sys return file is sys mode func (fi bindataFileInfo) Sys() interface{} { return nil } @@ -72,174 +61,221 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include +// #include +// #include #ifndef NUM_CPUS #define NUM_CPUS 128 #endif -// we cannot define it dynamically as NUM_CPUS because the golang needs to know this -// size at compiler time for decoding -#define CPU_VECTOR_SIZE 128 +#ifndef CPU_REF_FREQ +#define CPU_REF_FREQ 2500 +#endif -typedef struct switch_args -{ - u64 pad; - char prev_comm[16]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[16]; - int next_pid; - int next_prio; -} switch_args; - -typedef struct process_time_t +#define CPUHZ 1000 + +typedef struct process_metrics_t { u64 cgroup_id; u64 pid; u64 process_run_time; u64 cpu_cycles; u64 cpu_instr; - u64 cache_misses; + u64 cache_miss; char comm[16]; - //u64 pad; - // the max eBPF stack limit is 512 bytes, which is a vector of u16 with 128 elements - // the time is calculated in miliseconds, uint16 max size is 65K, ~1mim - u16 cpu_time[CPU_VECTOR_SIZE]; -} process_time_t; +} process_metrics_t; typedef struct pid_time_t { - int pid; + u32 pid; + u32 cpu; } pid_time_t; -BPF_PERF_OUTPUT(events); - // processes and pid time -BPF_HASH(processes, u64, process_time_t); +BPF_HASH(processes, u64, process_metrics_t); BPF_HASH(pid_time, pid_time_t); // perf counters -BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS); -BPF_PERF_ARRAY(cpu_instr, NUM_CPUS); -BPF_PERF_ARRAY(cache_miss, NUM_CPUS); +BPF_PERF_ARRAY(cpu_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_cycles, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cpu_ref_cycles_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_ref_cycles, u64, NUM_CPUS); + +BPF_PERF_ARRAY(cpu_instr_hc_reader, NUM_CPUS); +BPF_ARRAY(cpu_instr, u64, NUM_CPUS); -// tracking counters -BPF_ARRAY(prev_cpu_cycles, u64, NUM_CPUS); -BPF_ARRAY(prev_cpu_instr, u64, NUM_CPUS); -BPF_ARRAY(prev_cache_miss, u64, NUM_CPUS); +BPF_PERF_ARRAY(cache_miss_hc_reader, NUM_CPUS); +BPF_ARRAY(cache_miss, u64, NUM_CPUS); -static void safe_array_add(u32 idx, u16 *array, u16 value) +// cpu freq counters +BPF_ARRAY(cpu_freq_array, u32, NUM_CPUS); + +static inline u64 get_on_cpu_time(u32 cur_pid, u32 prev_pid, u64 cur_ts) { -#pragma clang loop unroll(full) - for (int array_index = 0; array_index < CPU_VECTOR_SIZE-1; array_index++) + u64 cpu_time = 0; + + // get pid time + pid_time_t prev_pid_key = {.pid = prev_pid}; + u64 *prev_ts = pid_time.lookup(&prev_pid_key); + if (prev_ts != 0) { - if (array_index == idx) + // Probably a clock issue where the recorded on-CPU event had a + // timestamp later than the recorded off-CPU event, or vice versa. + // But do not return, since the hardware counters can be collected. 
+ if (cur_ts > *prev_ts) { - array[array_index] += value; - break; + cpu_time = (cur_ts - *prev_ts) / 1000000; /*milisecond*/ + pid_time.delete(&prev_pid_key); } } + pid_time_t new_pid_key = {.pid = cur_pid}; + pid_time.update(&new_pid_key, &cur_ts); + + return cpu_time; } -int sched_switch(switch_args *ctx) +static inline u64 normalize(u64 *counter, u64 *enabled, u64 *running) { - u64 pid = bpf_get_current_pid_tgid() >> 32; -#ifdef SET_GROUP_ID - u64 cgroup_id = bpf_get_current_cgroup_id(); -#else - u64 cgroup_id = 0; -#endif + if (*running > 0) + return *counter * *enabled / *running; + return *counter; +} - u64 time = bpf_ktime_get_ns(); +static inline u64 calc_delta(u64 *prev_val, u64 *val) +{ u64 delta = 0; - u32 cpu_id = bpf_get_smp_processor_id(); - pid_time_t new_pid, old_pid; + if (prev_val) + { + if (*val > *prev_val) + delta = *val - *prev_val; + } + return delta; +} - // get pid time - old_pid.pid = ctx->prev_pid; - u64 *last_time = pid_time.lookup(&old_pid); - if (last_time != 0) +// altough the "get_on_cpu_counters" has some code duplications, it is inline code and the compiles will improve this +static inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - delta = (time - *last_time) / 1000000; /*milisecond*/ - // return if the process did not use any cpu time yet - if (delta == 0) - { - return 0; - } - pid_time.delete(&old_pid); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_cycles.update(cpu_id, &val); } + return delta; +} - new_pid.pid = ctx->next_pid; - pid_time.lookup_or_try_init(&new_pid, &time); +static inline u64 get_on_cpu_ref_cycles(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_ref_cycles_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_ref_cycles.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_ref_cycles.update(cpu_id, &val); + } + return delta; +} - u64 cpu_cycles_delta = 0; - u64 cpu_instr_delta = 0; - u64 cache_miss_delta = 0; - u64 *prev; +static inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cpu_instr_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) + { + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cpu_instr.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cpu_instr.update(cpu_id, &val); + } + return delta; +} - u64 val = cpu_cycles.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) +static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + u64 delta = 0; + struct bpf_perf_event_value c = {}; + int error = cache_miss_hc_reader.perf_counter_value(CUR_CPU_IDENTIFIER, &c, sizeof(struct bpf_perf_event_value)); + if (error == 0) { - prev = prev_cpu_cycles.lookup(&cpu_id); - if (prev) - { - cpu_cycles_delta = val - *prev; - } - prev_cpu_cycles.update(&cpu_id, &val); + u64 val = normalize(&c.counter, &c.enabled, &c.running); + u64 *prev_val = cache_miss.lookup(cpu_id); + delta = calc_delta(prev_val, &val); + cache_miss.update(cpu_id, &val); } - val = cpu_instr.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 
0) || ((s64)val < -256)) + return delta; +} + +// calculate the average cpu freq +static inline u64 get_on_cpu_avg_freq(u32 *cpu_id, u64 on_cpu_cycles_delta, u64 on_cpu_ref_cycles_delta) +{ + u32 avg_freq = 0; + cpu_freq_array.lookup_or_try_init(cpu_id, &avg_freq); + if (avg_freq == 0) { - prev = prev_cpu_instr.lookup(&cpu_id); - if (prev) - { - cpu_instr_delta = val - *prev; - } - prev_cpu_instr.update(&cpu_id, &val); + avg_freq = ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / CPUHZ; } - val = cache_miss.perf_read(CUR_CPU_IDENTIFIER); - if (((s64)val > 0) || ((s64)val < -256)) + else { - prev = prev_cache_miss.lookup(&cpu_id); - if (prev) - { - cache_miss_delta = val - *prev; - } - prev_cache_miss.update(&cpu_id, &val); + avg_freq += ((on_cpu_cycles_delta * CPU_REF_FREQ) / on_cpu_ref_cycles_delta) / CPUHZ; + avg_freq /= 2; } + return avg_freq; +} - // init process time - struct process_time_t *process_time; - process_time = processes.lookup(&pid); - if (process_time == 0) +// int kprobe__finish_task_switch(switch_args *ctx) +int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) +{ + u64 cur_pid = bpf_get_current_pid_tgid() >> 32; +#ifdef SET_GROUP_ID + u64 cgroup_id = bpf_get_current_cgroup_id(); +#else + u64 cgroup_id = 0; +#endif + + u64 cur_ts = bpf_ktime_get_ns(); + u32 cpu_id = bpf_get_smp_processor_id(); + u64 on_cpu_time_delta = get_on_cpu_time(cur_pid, prev->pid, cur_ts); + u64 on_cpu_cycles_delta = get_on_cpu_cycles(&cpu_id); + u64 on_cpu_ref_cycles_delta = get_on_cpu_ref_cycles(&cpu_id); + u64 on_cpu_instr_delta = get_on_cpu_instr(&cpu_id); + u64 on_cpu_cache_miss_delta = get_on_cpu_cache_miss(&cpu_id); + u64 on_cpu_avg_freq = get_on_cpu_avg_freq(&cpu_id, on_cpu_cycles_delta, on_cpu_ref_cycles_delta); + + // store process metrics + struct process_metrics_t *process_metrics; + process_metrics = processes.lookup(&cur_pid); + if (process_metrics == 0) { - process_time_t new_process = {}; - new_process.pid = pid; + process_metrics_t new_process = {}; + new_process.pid = cur_pid; new_process.cgroup_id = cgroup_id; - new_process.cpu_cycles = cpu_cycles_delta; - new_process.cpu_instr = cpu_instr_delta; - new_process.cache_misses = cache_miss_delta; - new_process.process_run_time += delta; -#ifdef CPU_FREQ - //FIXME: for certain reason, hyper-v seems to always get a cpu_id that is same as NUM_CPUS and cause stack overrun - safe_array_add(cpu_id, new_process.cpu_time, delta); -#endif + new_process.process_run_time = on_cpu_time_delta; bpf_get_current_comm(&new_process.comm, sizeof(new_process.comm)); - processes.update(&pid, &new_process); + + new_process.cpu_cycles = on_cpu_cycles_delta; + new_process.cpu_instr = on_cpu_instr_delta; + new_process.cache_miss = on_cpu_cache_miss_delta; + + processes.update(&cur_pid, &new_process); } else { // update process time - process_time->cpu_cycles += cpu_cycles_delta; - process_time->cpu_instr += cpu_instr_delta; - process_time->cache_misses += cache_miss_delta; - process_time->process_run_time += delta; -#ifdef CPU_FREQ - safe_array_add(cpu_id, process_time->cpu_time, delta); -#endif + process_metrics->process_run_time += on_cpu_time_delta; + + process_metrics->cpu_cycles += on_cpu_cycles_delta; + process_metrics->cpu_instr += on_cpu_instr_delta; + process_metrics->cache_miss += on_cpu_cache_miss_delta; } return 0; @@ -265,8 +301,8 @@ func bpfassetsPerf_eventPerf_eventC() (*asset, error) { // It returns an error if the asset could not be found or // could not be loaded. 
func Asset(name string) ([]byte, error) { - canonicalName := strings.Replace(name, "\\", "/", -1) - if f, ok := _bindata[canonicalName]; ok { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { a, err := f() if err != nil { return nil, fmt.Errorf("Asset %s can't read by error: %v", name, err) @@ -291,8 +327,8 @@ func MustAsset(name string) []byte { // It returns an error if the asset could not be found or // could not be loaded. func AssetInfo(name string) (os.FileInfo, error) { - canonicalName := strings.Replace(name, "\\", "/", -1) - if f, ok := _bindata[canonicalName]; ok { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { a, err := f() if err != nil { return nil, fmt.Errorf("AssetInfo %s can't read by error: %v", name, err) @@ -327,13 +363,13 @@ var _bindata = map[string]func() (*asset, error){ // b.png // then AssetDir("data") would return []string{"foo.txt", "img"} // AssetDir("data/img") would return []string{"a.png", "b.png"} -// AssetDir("foo.txt") and AssetDir("nonexistent") would return an error +// AssetDir("foo.txt") and AssetDir("notexist") would return an error // AssetDir("") will return []string{"data"}. func AssetDir(name string) ([]string, error) { node := _bintree if len(name) != 0 { - canonicalName := strings.Replace(name, "\\", "/", -1) - pathList := strings.Split(canonicalName, "/") + cannonicalName := strings.Replace(name, "\\", "/", -1) + pathList := strings.Split(cannonicalName, "/") for _, p := range pathList { node = node.Children[p] if node == nil { @@ -407,6 +443,6 @@ func RestoreAssets(dir, name string) error { } func _filePath(dir, name string) string { - canonicalName := strings.Replace(name, "\\", "/", -1) - return filepath.Join(append([]string{dir}, strings.Split(canonicalName, "/")...)...) + cannonicalName := strings.Replace(name, "\\", "/", -1) + return filepath.Join(append([]string{dir}, strings.Split(cannonicalName, "/")...)...) }
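
Note (editorial, not part of the patches): the frequency estimate introduced in perf_event.c relies on cpu_cycles advancing at the effective clock while cpu_ref_cycles advance at the reference (base) clock, so their ratio times the base figure gives the effective frequency. The plain-Go sketch below only reproduces that arithmetic for illustration; the counter deltas are made up, the helper names (normalize, estimateFreq) are illustrative rather than taken from the patch, and CPU_REF_FREQ=2500 and KHZ=1000 are the defaults defined in the BPF program.

    // freq_sketch.go: user-space illustration of the frequency arithmetic in perf_event.c.
    package main

    import "fmt"

    const (
        cpuRefFreq = 2500 // default -DCPU_REF_FREQ in perf_event.c
        khz        = 1000 // KHZ divisor in perf_event.c
    )

    // normalize mirrors the BPF helper: scale a multiplexed perf counter by the
    // ratio of time the event was enabled vs. actually running on the PMU.
    func normalize(counter, enabled, running uint64) uint64 {
        if running > 0 {
            return counter * enabled / running
        }
        return counter
    }

    // estimateFreq mirrors the first-sample case of get_on_cpu_avg_freq:
    // (cycles / ref cycles) scales the reference frequency to the effective one.
    func estimateFreq(cyclesDelta, refCyclesDelta uint64) uint64 {
        if refCyclesDelta == 0 {
            return 0
        }
        return (cyclesDelta * cpuRefFreq / refCyclesDelta) / khz
    }

    func main() {
        // Hypothetical deltas between two task switches on one CPU.
        cycles := normalize(3_000_000_000, 10_000_000, 10_000_000)   // counted the whole interval
        refCycles := normalize(1_250_000_000, 10_000_000, 5_000_000) // multiplexed half the time, so doubled to 2.5e9

        // (3e9 * 2500 / 2.5e9) / 1000 = 3: the cycles/ref-cycles ratio (1.2)
        // applied to the reference figure, then scaled down by KHZ, which is
        // the value the BPF program stores into cpu_freq_array for this CPU.
        fmt.Println("estimated frequency:", estimateFreq(cycles, refCycles))
    }

On later samples the BPF code keeps a running average by adding the new estimate to the stored one and halving the sum, which is why cpu_freq_array is read as a per-CPU value rather than recomputed in user space.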