microsoft · chenfucn · Feb 25, 2022 · Feb 16, 2022 · Feb 24, 2022
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
@@ -2032,27 +2032,5 @@ enum MlasCoreType { mlas_core_unknown = 0, mlas_core_little = 2, mlas_core_big =
  *  @return 2 current core is little core with narrow memory load (e.g. ARMv8 a53)
  *          3 current core is big core with wider load (e.g. ARMv8 a72)
  */
-MLAS_FORCEINLINE
-int32_t
-MlasGetCoreUArch()
-{
-    thread_local int32_t core_type = mlas_core_unknown;
-    if (core_type == mlas_core_unknown) {
-        // initialization needed
-#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
-        auto uarch = MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch();
-        if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
-            uarch == cpuinfo_uarch_cortex_a55) {
-            core_type = mlas_core_little;
-        } else {
-            core_type = mlas_core_big;
-        }
-#else
-        core_type = mlas_core_big;
-#endif  // MLAS_TARGET_ARM64
-
-    }
-    return core_type;
-}
-
+extern MlasCoreType MlasGetCoreType();  // out-of-line; defined in platform.cpp
 
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -17,6 +17,9 @@ Module Name:
 
 #include "mlasi.h"
 
+#include <thread>
+#include <mutex>
+
 #if defined(MLAS_TARGET_POWER) && defined(__linux__)
 #include <sys/auxv.h>
 #endif
@@ -28,6 +31,12 @@ Module Name:
 #define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
 #endif
 #elif defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#if !defined(__NR_getcpu)
+#include <asm-generic/unistd.h>
+#endif
+
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 // N.B. Support building with older versions of asm/hwcap.h that do not define
@@ -425,3 +434,82 @@ Return Value:
     return MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT;
 #endif
 }
+
+#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
+// Per-CPU cache of detected core types, indexed by the CPU number reported
+// by getcpu. Allocated on first use; MlasGetCoreType never frees it.
+static MlasCoreType* mlas_coretype_tbl = nullptr;
+// Number of entries in mlas_coretype_tbl; 0 means the cache is unavailable.
+static uint32_t mlas_coretype_tbl_size = 0;
+// Guards the one-time allocation and initialization of the table.
+static std::once_flag mlas_init_coretype_tbl;
+#endif
+
+/**
+ * @brief Classify the core the calling thread is currently running on.
+ *
+ * On ARM64 Linux the result is cached per CPU number, so the uarch query is
+ * skipped once the entry for that CPU has been filled in. Every failure path
+ * (unknown CPU count, allocation failure, getcpu failure, CPU number out of
+ * range) and every other platform reports mlas_core_big.
+ *
+ * @return mlas_core_little when the detected uarch is Cortex-A53, A55r0 or
+ *         A55; mlas_core_big otherwise.
+ */
+MlasCoreType
+MlasGetCoreType()
+{
+
+#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
+    std::call_once(mlas_init_coretype_tbl, []() {
+        const uint32_t cpu_count = std::thread::hardware_concurrency();
+        if (cpu_count == 0) {
+            return;
+        }
+        auto* tbl = static_cast<MlasCoreType*>(malloc(sizeof(MlasCoreType) * cpu_count));
+        if (tbl == nullptr) {
+            // Allocation failed: leave the size at 0 so callers take the
+            // default path instead of dereferencing a null table.
+            return;
+        }
+        for (uint32_t i = 0; i < cpu_count; i++) {
+            tbl[i] = mlas_core_unknown;
+        }
+        // Publish only after the table is fully initialized.
+        mlas_coretype_tbl = tbl;
+        mlas_coretype_tbl_size = cpu_count;
+    });
+
+    if (mlas_coretype_tbl_size == 0) {
+        // functionality missing, return default
+        return mlas_core_big;
+    }
+
+    unsigned cpu = 0;
+    if (syscall(__NR_getcpu, &cpu, nullptr, nullptr) != 0) {
+        // failed to detect current core id. give up
+        return mlas_core_big;
+    }
+
+    if (cpu >= mlas_coretype_tbl_size) {
+        return mlas_core_big;
+    }
+
+    // NOTE(review): table entries are read and written without
+    // synchronization, and the thread may migrate between getcpu and the
+    // uarch query below. Confirm this is acceptable for concurrent callers.
+    auto core_type = mlas_coretype_tbl[cpu];
+    if (core_type == mlas_core_unknown) {
+        auto uarch = MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch();
+        if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
+            uarch == cpuinfo_uarch_cortex_a55) {
+            core_type = mlas_core_little;
+        } else {
+            core_type = mlas_core_big;
+        }
+        mlas_coretype_tbl[cpu] = core_type;
+    }
+    return core_type;
+
+#else
+    return mlas_core_big;
+#endif
+}
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -209,7 +209,7 @@ MlasSymmQgemmBatch(
     if (ThreadPool == nullptr) {
         // So our caller handles threaded job partition.
         // Call single threaded operation directly
-        auto uarch = MlasGetCoreUArch();
+        auto uarch = MlasGetCoreType();
         MLAS_SYMM_QGEMM_OPERATION* operation =
             uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;
 
@@ -260,7 +260,7 @@ MlasSymmQgemmBatch(
     ThreadsPerGemm = ThreadCountM * ThreadCountN;
 
     MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * BatchN, [&](ptrdiff_t tid) {
-        auto uarch = MlasGetCoreUArch();
+        auto uarch = MlasGetCoreType();
         MLAS_SYMM_QGEMM_OPERATION* operation =
             uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;