From a451ce9caafe0a2c8188c5d1dac217f9feacc135 Mon Sep 17 00:00:00 2001 From: Chen Fu <1316708+chenfucn@users.noreply.github.com> Date: Mon, 6 Feb 2023 11:28:14 -0800 Subject: [PATCH] fp16 detect --- onnxruntime/core/common/cpuid_info.cc | 38 ++++++++++++++++++++++++++ onnxruntime/core/common/cpuid_info.h | 6 +++- onnxruntime/core/mlas/lib/halfgemm.cpp | 9 +----- onnxruntime/core/mlas/lib/mlasi.h | 3 ++ onnxruntime/core/mlas/lib/platform.cpp | 3 ++ 5 files changed, 50 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index b950e4e734fa5..03460c9def5bd 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -143,6 +143,7 @@ void CPUIDInfo::ArmLinuxInit() { if (pytorch_cpuinfo_init_) { is_hybrid_ = cpuinfo_get_uarchs_count() > 1; has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); + has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); const uint32_t core_cnt = cpuinfo_get_cores_count(); core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown); is_armv8_narrow_ld_.resize(core_cnt, false); @@ -165,6 +166,7 @@ void CPUIDInfo::ArmLinuxInit() { } } else { has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0); + has_fp16_ |= has_arm_neon_dot_; } } @@ -220,9 +222,45 @@ void CPUIDInfo::ArmWindowsInit() { lastUarch = uarch; } } + + switch (lastUarch) { + case cpuinfo_uarch_cortex_a55: + case cpuinfo_uarch_cortex_a55r0: + case cpuinfo_uarch_cortex_a76: + case cpuinfo_uarch_neoverse_n1: + case cpuinfo_uarch_cortex_a77: + case cpuinfo_uarch_exynos_m4: + case cpuinfo_uarch_exynos_m5: + has_fp16_ = true; + break; + default: + break; + } + if (!has_fp16_) { + /* + * Detecting fp16 support. Different cores should have the same instruction set. + * So we just check the first ID_AA64PFR0_EL1 + * Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0100), Op2(0b000), + */ + uint64_t ID_AA64PFR0_EL1; + unsigned long valsize = sizeof(uint64_t); + auto retCode = ::RegGetValueA( + HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "CP 4020", RRF_RT_REG_QWORD, nullptr, + &ID_AA64PFR0_EL1, &valsize); + if (retCode == ERROR_SUCCESS) { + // AdvSIMD, bits [23:20] + auto advSimd = ID_AA64PFR0_EL1 >> 20; + if ((advSimd & 0xfULL) == 1) { + has_fp16_ = true; + } + } + } #endif /* Application Family or OneCore Family */ has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); + has_fp16_ |= has_arm_neon_dot_; } #endif /* (arm or arm64) and windows */ diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 858f8595b8220..c413e0ca7ed5f 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -21,7 +21,7 @@ class CPUIDInfo { bool HasAVX512f() const { return has_avx512f_; } bool HasAVX512_BF16() const {return has_avx512_bf16_;} bool HasAVX512Skylake() const { return has_avx512_skylake_; } - bool HasF16C() const { return has_f16c_; } + bool HasF16C() const { return has_f16c_; } /*fp16 conversion inst*/ bool HasSSE3() const { return has_sse3_; } bool HasSSE4_1() const { return has_sse4_1_; } bool IsHybrid() const { return is_hybrid_; } @@ -85,6 +85,9 @@ class CPUIDInfo { return is_armv8_narrow_ld_[coreIdx]; } + bool HasFp16VectorAcceleration() const { + return has_fp16_; + } private: CPUIDInfo() { @@ -118,6 +121,7 @@ class CPUIDInfo { std::vector is_armv8_narrow_ld_; bool has_arm_neon_dot_{false}; + bool has_fp16_{false}; #ifdef CPUIDINFO_ARCH_X86 diff --git a/onnxruntime/core/mlas/lib/halfgemm.cpp b/onnxruntime/core/mlas/lib/halfgemm.cpp index 837b8f1bbd2b8..778db2003d6c6 100644 --- a/onnxruntime/core/mlas/lib/halfgemm.cpp +++ b/onnxruntime/core/mlas/lib/halfgemm.cpp @@ -25,17 +25,10 @@ Module Name: bool MLASCALL MlasFp16AccelerationSupported() { -#ifdef MLAS_NEON64_INTRINSICS - // TODO!! Only support for ARMv8.2 - // TODO!! how to detect ARMv8.0 ??? - return true; -#else - return false; -#endif + return MLAS_CPUIDINFO::GetCPUIDInfo().HasFp16VectorAcceleration(); } - void MLASCALL MlasHalfGemmBatch( diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 495c2f06ea757..21949535cf63b 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -123,6 +123,8 @@ class MLASCPUIDInfo // ARM bool HasArmNeonDot() const { return has_arm_neon_dot_; } + bool HasFp16VectorAcceleration() const { return has_fp16_; } + uint32_t GetCurrentCoreIdx() const { return 0xFFFFFFFF; } int32_t GetCurrentUarch() const { return -1; } @@ -137,6 +139,7 @@ class MLASCPUIDInfo MLASCPUIDInfo(); bool has_arm_neon_dot_{false}; + bool has_fp16_{false}; }; using MLAS_CPUIDINFO = MLASCPUIDInfo; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 7d8624a32a218..32ada2ee4cfa9 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -36,6 +36,9 @@ Module Name: MLASCPUIDInfo::MLASCPUIDInfo() { has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); + + // raw hack! Need CPUIDInfo implementation for more precise detection + has_fp16_ = has_arm_neon_dot_; } #endif