From a451ce9caafe0a2c8188c5d1dac217f9feacc135 Mon Sep 17 00:00:00 2001
From: Chen Fu <1316708+chenfucn@users.noreply.github.com>
Date: Mon, 6 Feb 2023 11:28:14 -0800
Subject: [PATCH] Detect fp16 vector acceleration support on ARM at runtime

---
 onnxruntime/core/common/cpuid_info.cc  | 38 ++++++++++++++++++++++++++
 onnxruntime/core/common/cpuid_info.h   |  6 +++-
 onnxruntime/core/mlas/lib/halfgemm.cpp |  9 +-----
 onnxruntime/core/mlas/lib/mlasi.h      |  3 ++
 onnxruntime/core/mlas/lib/platform.cpp |  3 ++
 5 files changed, 50 insertions(+), 9 deletions(-)
diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc
index b950e4e734fa5..03460c9def5bd 100644
--- a/onnxruntime/core/common/cpuid_info.cc
+++ b/onnxruntime/core/common/cpuid_info.cc
@@ -143,6 +143,7 @@ void CPUIDInfo::ArmLinuxInit() {
   if (pytorch_cpuinfo_init_) {
     is_hybrid_ = cpuinfo_get_uarchs_count() > 1;
     has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot();
+    has_fp16_ = cpuinfo_has_arm_neon_fp16_arith();
     const uint32_t core_cnt = cpuinfo_get_cores_count();
     core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown);
     is_armv8_narrow_ld_.resize(core_cnt, false);
@@ -165,6 +166,7 @@ void CPUIDInfo::ArmLinuxInit() {
     }
   } else {
     has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0);
+    has_fp16_ |= has_arm_neon_dot_;
   }
 }
 
@@ -220,9 +222,45 @@ void CPUIDInfo::ArmWindowsInit() {
       lastUarch = uarch;
     }
   }
+
+  switch (lastUarch) {  // only the uarch recorded last by the loop above is consulted
+    case cpuinfo_uarch_cortex_a55:
+    case cpuinfo_uarch_cortex_a55r0:
+    case cpuinfo_uarch_cortex_a76:
+    case cpuinfo_uarch_neoverse_n1:
+    case cpuinfo_uarch_cortex_a77:
+    case cpuinfo_uarch_exynos_m4:
+    case cpuinfo_uarch_exynos_m5:
+      has_fp16_ = true;
+      break;
+    default:
+      break;
+  }
+  if (!has_fp16_) {
+    /*
+     * Fallback fp16 detection: read ID_AA64PFR0_EL1 for processor 0 from the registry.
+     * Only the first core is queried, on the assumption that all cores share one instruction set.
+     * Register encoding: Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0100), Op2(0b000) - presumably what "CP 4020" names.
+     */
+    uint64_t ID_AA64PFR0_EL1;
+    unsigned long valsize = sizeof(uint64_t);  // in: buffer size in bytes; out: bytes stored
+    auto retCode = ::RegGetValueA(
+        HKEY_LOCAL_MACHINE,
+        "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
+        "CP 4020", RRF_RT_REG_QWORD, nullptr,
+        &ID_AA64PFR0_EL1, &valsize);
+    if (retCode == ERROR_SUCCESS) {
+      // AdvSIMD field, bits [23:20]: 0b0001 means Advanced SIMD with half-precision arithmetic
+      auto advSimd = ID_AA64PFR0_EL1 >> 20;
+      if ((advSimd & 0xfULL) == 1) {
+        has_fp16_ = true;
+      }
+    }
+  }
 #endif /* Application Family or OneCore Family */
 
   has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0);
+  has_fp16_ |= has_arm_neon_dot_;
 }
 
 #endif /* (arm or arm64) and windows */
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
index 858f8595b8220..c413e0ca7ed5f 100644
--- a/onnxruntime/core/common/cpuid_info.h
+++ b/onnxruntime/core/common/cpuid_info.h
@@ -21,7 +21,7 @@ class CPUIDInfo {
   bool HasAVX512f() const { return has_avx512f_; }
   bool HasAVX512_BF16() const {return has_avx512_bf16_;}
   bool HasAVX512Skylake() const { return has_avx512_skylake_; }
-  bool HasF16C() const { return has_f16c_; }
+  bool HasF16C() const { return has_f16c_; }  // F16C: fp16 conversion instructions only, not fp16 arithmetic
   bool HasSSE3() const { return has_sse3_; }
   bool HasSSE4_1() const { return has_sse4_1_; }
   bool IsHybrid() const { return is_hybrid_; }
@@ -85,6 +85,9 @@ class CPUIDInfo {
     return is_armv8_narrow_ld_[coreIdx];
   }
 
+  bool HasFp16VectorAcceleration() const {  // reports has_fp16_, which the ARM init routines set when fp16 support is detected
+    return has_fp16_;
+  }
 
  private:
   CPUIDInfo() {
@@ -118,6 +121,7 @@ class CPUIDInfo {
   std::vector<bool> is_armv8_narrow_ld_;
 
   bool has_arm_neon_dot_{false};
+  bool has_fp16_{false};
 
 #ifdef CPUIDINFO_ARCH_X86
 
diff --git a/onnxruntime/core/mlas/lib/halfgemm.cpp b/onnxruntime/core/mlas/lib/halfgemm.cpp
index 837b8f1bbd2b8..778db2003d6c6 100644
--- a/onnxruntime/core/mlas/lib/halfgemm.cpp
+++ b/onnxruntime/core/mlas/lib/halfgemm.cpp
@@ -25,17 +25,10 @@ Module Name:
 bool MLASCALL
 MlasFp16AccelerationSupported()
 {
-#ifdef MLAS_NEON64_INTRINSICS
-    // TODO!! Only support for ARMv8.2
-    // TODO!! how to detect ARMv8.0 ???
-    return true;
-#else
-    return false;
-#endif
+    return MLAS_CPUIDINFO::GetCPUIDInfo().HasFp16VectorAcceleration();  // answered by the CPU-info singleton's fp16 flag
 }
 
 
-
 void
 MLASCALL
 MlasHalfGemmBatch(
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 495c2f06ea757..21949535cf63b 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -123,6 +123,8 @@ class MLASCPUIDInfo
     // ARM
     bool HasArmNeonDot() const { return has_arm_neon_dot_; }
 
+    bool HasFp16VectorAcceleration() const { return has_fp16_; }  // same accessor name as CPUIDInfo::HasFp16VectorAcceleration()
+
     uint32_t GetCurrentCoreIdx() const { return 0xFFFFFFFF; }
 
     int32_t GetCurrentUarch() const { return -1; }
@@ -137,6 +139,7 @@ class MLASCPUIDInfo
     MLASCPUIDInfo();
 
     bool has_arm_neon_dot_{false};
+    bool has_fp16_{false};
 };
 using MLAS_CPUIDINFO = MLASCPUIDInfo;
 
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 7d8624a32a218..32ada2ee4cfa9 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -36,6 +36,9 @@ Module Name:
 MLASCPUIDInfo::MLASCPUIDInfo()
 {
     has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0);
+
+    // Approximation: dot-product support is taken to imply fp16 support. TODO: probe fp16 directly, as CPUIDInfo does.
+    has_fp16_ = has_arm_neon_dot_;
 }
 #endif