From 18f46eb6f2afc5c92dbb530f908ed5ad020782ea Mon Sep 17 00:00:00 2001 From: Alston Tang Date: Tue, 23 Jan 2024 11:41:35 -0800 Subject: [PATCH] define flops events/metrics for AMD Zen4 (#217) Summary: Pull Request resolved: https://github.com/facebookincubator/dynolog/pull/217 define three new events `zen3/4::fp_ret_x87_fp_ops.all`, `zen3::fp_ret_sse_avx_ops.all`, and `zen4::fp_ret_sse_avx_ops.all` to count scalar/vector fp ops. also define a new metric `fp_ops_all` that uses these events on AMD Zen3 (Milan) and Zen4 (Bergamo, Genoa) hosts and falls back to Intel events on other hosts. Reviewed By: bigzachattack Differential Revision: D52861377 fbshipit-source-id: 4219c8b6483c6d97c639da80167abf1c46a04641 --- hbt/src/perf_event/AmdEvents.cpp | 26 +++++++ hbt/src/perf_event/BuiltinMetrics.cpp | 106 ++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/hbt/src/perf_event/AmdEvents.cpp b/hbt/src/perf_event/AmdEvents.cpp index 77e17dd3..d7b7313c 100644 --- a/hbt/src/perf_event/AmdEvents.cpp +++ b/hbt/src/perf_event/AmdEvents.cpp @@ -65,6 +65,32 @@ void addEvents(PmuDeviceManager& pmu_manager) { "L3 Cache misses", "L3 Cache misses"), std::vector({"l3-cache-misses"})); + + // FLOPs events for AMD Zen3/Zen4 + pmu_manager.addEvent( + std::make_shared( + PmuType::cpu, + "zen3/4::fp_ret_x87_fp_ops.all", + EventDef::Encoding{.code = amd_msr::kRetiredX87Flops.val}, + "Retired x87 floating-point ops of all types.", + "The number of all x87 floating-point Ops that have retired."), + std::vector({"zen3/4-ret-x87-fp-ops-all"})); + pmu_manager.addEvent( + std::make_shared( + PmuType::cpu, + "zen3::fp_ret_sse_avx_ops.all", + EventDef::Encoding{.code = amd_msr::kZen3RetiredSseAvxFlops.val}, + "Retired SSE and AVX floating-point ops of all types.", + "The number of all SSE/AVX floating-point Ops that have retired."), + std::vector({"zen3-ret-sse-avx-fp-ops-all"})); + pmu_manager.addEvent( + std::make_shared( + PmuType::cpu, + "zen4::fp_ret_sse_avx_ops.all", + EventDef::Encoding{.code = amd_msr::kZen4RetiredSseAvxFlops.val}, + 
"Retired SSE and AVX floating-point ops of all types.", + "The number of all SSE/AVX floating-point Ops that have retired."), + std::vector({"zen4-ret-sse-avx-fp-ops-all"})); } } // namespace milan diff --git a/hbt/src/perf_event/BuiltinMetrics.cpp b/hbt/src/perf_event/BuiltinMetrics.cpp index edd7f248..9010d495 100644 --- a/hbt/src/perf_event/BuiltinMetrics.cpp +++ b/hbt/src/perf_event/BuiltinMetrics.cpp @@ -783,6 +783,112 @@ std::shared_ptr makeAvailableMetrics() { System::Permissions{}, std::vector{})); + metrics->add(std::make_shared( + "fp_ops_all", + "Total floating points operations", + "Counts number of floating points operations of single precision type, double precision type, and bfloat types " + "executed by the processor. " + "For AMD, each event counts the # retired floating point operations. " + "For Intel, each event counts the # retired instructions " + "Multiply # of instructions by # of operations packed inside an instruction to calculate # operations.", + std::map{ + {CpuArch::MILAN, + EventRefs{ + EventRef{ + "flops_scalar", + PmuType::cpu, + "zen3/4::fp_ret_x87_fp_ops.all", + EventExtraAttr{}, + {}}, + EventRef{ + "flops_vector", + PmuType::cpu, + "zen3::fp_ret_sse_avx_ops.all", + EventExtraAttr{}, + {}}}}, + {CpuArch::BERGAMO, + EventRefs{ + EventRef{ + "flops_scalar", + PmuType::cpu, + "zen3/4::fp_ret_x87_fp_ops.all", + EventExtraAttr{}, + {}}, + EventRef{ + "flops_vector", + PmuType::cpu, + "zen4::fp_ret_sse_avx_ops.all", + EventExtraAttr{}, + {}}}}, + {CpuArch::GENOA, + EventRefs{ + EventRef{ + "flops_scalar", + PmuType::cpu, + "zen3/4::fp_ret_x87_fp_ops.all", + EventExtraAttr{}, + {}}, + EventRef{ + "flops_vector", + PmuType::cpu, + "zen4::fp_ret_sse_avx_ops.all", + EventExtraAttr{}, + {}}}}, + // Intel by default + {std::nullopt, + EventRefs{ + EventRef{ + "instr_dp_scalar", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_dp_128b_packed", + PmuType::cpu, + 
"FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_dp_256b_packed", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_dp_512b_packed", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_sp_scalar", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_sp_128b_packed", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_sp_256b_packed", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", + EventExtraAttr{}, + {}}, + EventRef{ + "instr_sp_512b_packed", + PmuType::cpu, + "FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", + EventExtraAttr{}, + {}}}}}, + 100'000'000, + System::Permissions{}, + std::vector{})); + metrics->add(std::make_shared( "cpu_clock", "High-resolution sys and user CPU clock",