Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix bug: getting current cpu core type #10630

Merged
merged 2 commits into from
Feb 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 94 additions & 69 deletions onnxruntime/core/mlas/lib/mlasi.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ Module Name:
#include "core/common/cpuid_info.h"
using MLAS_CPUIDINFO = onnxruntime::CPUIDInfo;

#include <unistd.h>
#include <sys/syscall.h>
#if !defined(__NR_getcpu)
#include <asm-generic/unistd.h>
#endif

#endif // MLAS_TARGET_ARM64

#else // BUILD_MLAS_NO_ONNXRUNTIME
Expand All @@ -126,6 +132,8 @@ class MLASCPUIDInfo
// ARM
// Returns the value of the has_arm_neon_dot_ member.
bool HasArmNeonDot() const
{
    return has_arm_neon_dot_;
}

// Stub: this class performs no microarchitecture detection and always
// reports -1.
int32_t GetCurrentUarch()
{
    return -1;
}

private:
MLASCPUIDInfo();

Expand Down Expand Up @@ -774,6 +782,47 @@ struct MLAS_CONV_SYM_POST_PROCESS_PARAMS {
// Environment information class.
//

/**
 * @brief Identifiers of CPU microarchitectures.
 *
 * The values are duplicated from the cpuinfo package instead of being taken
 * from its headers, because using the cpuinfo definition directly causes
 * compilation issues on many of the platforms we support.
 */
enum MlasUArch {
    cpuinfo_uarch_unknown = 0,

    cpuinfo_uarch_cortex_a32 = 0x00300332,    // ARM Cortex-A32
    cpuinfo_uarch_cortex_a35 = 0x00300335,    // ARM Cortex-A35
    cpuinfo_uarch_cortex_a53 = 0x00300353,    // ARM Cortex-A53
    // ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared
    // to revision 1+)
    cpuinfo_uarch_cortex_a55r0 = 0x00300354,
    cpuinfo_uarch_cortex_a55 = 0x00300355,    // ARM Cortex-A55
    cpuinfo_uarch_cortex_a57 = 0x00300357,    // ARM Cortex-A57
    cpuinfo_uarch_cortex_a65 = 0x00300365,    // ARM Cortex-A65
    cpuinfo_uarch_cortex_a72 = 0x00300372,    // ARM Cortex-A72
    cpuinfo_uarch_cortex_a73 = 0x00300373,    // ARM Cortex-A73
    cpuinfo_uarch_cortex_a75 = 0x00300375,    // ARM Cortex-A75
    cpuinfo_uarch_cortex_a76 = 0x00300376,    // ARM Cortex-A76
    cpuinfo_uarch_cortex_a77 = 0x00300377,    // ARM Cortex-A77
    cpuinfo_uarch_cortex_a78 = 0x00300378,    // ARM Cortex-A78
};

/** Classification of a CPU core as big or little. */
enum MlasCoreType {
    mlas_core_unknown = 0,  // core type has not been determined
    mlas_core_little = 2,   // little core
    mlas_core_big = 3,      // big core
};


struct MLAS_PLATFORM {

MLAS_PLATFORM(void);
Expand Down Expand Up @@ -836,6 +885,51 @@ struct MLAS_PLATFORM {
static constexpr int32_t MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT;
#endif

#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
// TODO!! implement uarch detection in Windows
// Per-core cache of the detected core type, indexed by the CPU id reported by
// the getcpu syscall. Entries hold mlas_core_unknown until GetCoreType()
// classifies the corresponding core. Only present on ARM64 Linux builds.
std::vector<MlasCoreType> mlas_coretype_tbl;
#endif

/**
 * @brief Classify the CPU core the calling thread is currently running on.
 *
 * On ARM64 Linux the current CPU id is obtained with the getcpu syscall and
 * used as an index into mlas_coretype_tbl, which caches the classification of
 * each core. Cortex-A53, Cortex-A55r0 and Cortex-A55 are treated as little
 * cores; every other microarchitecture is treated as a big core.
 *
 * mlas_core_big is also returned when the table is empty, when the current
 * CPU id cannot be obtained, and on all other targets.
 *
 * @return mlas_core_little (2) current core is little core with narrow memory
 *                              load (e.g. ARMv8 a53)
 *         mlas_core_big (3)    current core is big core with wider load
 *                              (e.g. ARMv8 a72)
 */
MlasCoreType GetCoreType()
{
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)

    if (mlas_coretype_tbl.empty()) {
        // functionality missing, return default
        return mlas_core_big;
    }

    unsigned cpu = 0;
    if (syscall(__NR_getcpu, &cpu, nullptr, nullptr) != 0) {
        // failed to detect current core id. give up
        return mlas_core_big;
    }

    // The table must not be grown here: this method is called concurrently
    // from worker threads, and resizing the vector would reallocate storage
    // that other threads may be indexing at the same time. A CPU id beyond
    // the table is still classified, just without caching the result.
    const bool cacheable = cpu < mlas_coretype_tbl.size();
    if (cacheable) {
        const MlasCoreType cached = mlas_coretype_tbl[cpu];
        if (cached != mlas_core_unknown) {
            return cached;
        }
    }

    const auto uarch = MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch();
    const bool is_little = uarch == cpuinfo_uarch_cortex_a53 ||
                           uarch == cpuinfo_uarch_cortex_a55r0 ||
                           uarch == cpuinfo_uarch_cortex_a55;
    const MlasCoreType core_type = is_little ? mlas_core_little : mlas_core_big;

    if (cacheable) {
        mlas_coretype_tbl[cpu] = core_type;
    }
    return core_type;

#else
    return mlas_core_big;
#endif
}
};

inline
Expand Down Expand Up @@ -1987,72 +2081,3 @@ MlasReadTimeStampCounter(void)
#endif
#endif
}

/**
 * @brief Identifiers of CPU microarchitectures.
 *
 * The values are duplicated from the cpuinfo package instead of being taken
 * from its headers, because using the cpuinfo definition directly causes
 * compilation issues on many of the platforms we support.
 */
enum MlasUArch {
    cpuinfo_uarch_unknown = 0,

    cpuinfo_uarch_cortex_a32 = 0x00300332,    // ARM Cortex-A32
    cpuinfo_uarch_cortex_a35 = 0x00300335,    // ARM Cortex-A35
    cpuinfo_uarch_cortex_a53 = 0x00300353,    // ARM Cortex-A53
    // ARM Cortex-A55 revision 0 (restricted dual-issue capabilities compared
    // to revision 1+)
    cpuinfo_uarch_cortex_a55r0 = 0x00300354,
    cpuinfo_uarch_cortex_a55 = 0x00300355,    // ARM Cortex-A55
    cpuinfo_uarch_cortex_a57 = 0x00300357,    // ARM Cortex-A57
    cpuinfo_uarch_cortex_a65 = 0x00300365,    // ARM Cortex-A65
    cpuinfo_uarch_cortex_a72 = 0x00300372,    // ARM Cortex-A72
    cpuinfo_uarch_cortex_a73 = 0x00300373,    // ARM Cortex-A73
    cpuinfo_uarch_cortex_a75 = 0x00300375,    // ARM Cortex-A75
    cpuinfo_uarch_cortex_a76 = 0x00300376,    // ARM Cortex-A76
    cpuinfo_uarch_cortex_a77 = 0x00300377,    // ARM Cortex-A77
    cpuinfo_uarch_cortex_a78 = 0x00300378,    // ARM Cortex-A78
};

/** Classification of a CPU core as big or little. */
enum MlasCoreType {
    mlas_core_unknown = 0,  // core type has not been determined
    mlas_core_little = 2,   // little core
    mlas_core_big = 3,      // big core
};

/**
 * @brief Report whether the calling thread runs on a big or a little core.
 *
 * The answer is computed on the thread's first call and stored in a
 * thread_local variable; later calls on the same thread return the stored
 * value without querying the CPU again.
 *
 * @return 2 current core is little core with narrow memory load (e.g. ARMv8 a53)
 *         3 current core is big core with wider load (e.g. ARMv8 a72)
 */
MLAS_FORCEINLINE
int32_t
MlasGetCoreUArch()
{
    thread_local int32_t cached_type = mlas_core_unknown;

    // Already classified on this thread: nothing more to do.
    if (cached_type != mlas_core_unknown) {
        return cached_type;
    }

#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
    switch (MLAS_CPUIDINFO::GetCPUIDInfo().GetCurrentUarch()) {
        case cpuinfo_uarch_cortex_a53:
        case cpuinfo_uarch_cortex_a55r0:
        case cpuinfo_uarch_cortex_a55:
            cached_type = mlas_core_little;
            break;
        default:
            cached_type = mlas_core_big;
            break;
    }
#else
    // No detection on this target; treat every core as big.
    cached_type = mlas_core_big;
#endif  // MLAS_TARGET_ARM64

    return cached_type;
}


11 changes: 11 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ Module Name:

#include "mlasi.h"

#include <thread>
#include <mutex>

#if defined(MLAS_TARGET_POWER) && defined(__linux__)
#include <sys/auxv.h>
#endif
Expand Down Expand Up @@ -394,6 +397,14 @@ Return Value:
#endif
#endif

// Initialize the table describing the type (big or little) of each core
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
// TODO!! implement core uarch detection in Windows
auto tbl_size = std::thread::hardware_concurrency();
if (tbl_size > 0) {
mlas_coretype_tbl.resize(tbl_size, mlas_core_unknown);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[](http://example.com/codeflow?start=62&length=4)

nit: extra spaces

}
yufenglee marked this conversation as resolved.
Show resolved Hide resolved
#endif
}

size_t
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/mlas/lib/qgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ MlasSymmQgemmBatch(
if (ThreadPool == nullptr) {
// So our caller handles threaded job partition.
// Call single threaded operation directly
auto uarch = MlasGetCoreUArch();
auto uarch = GetMlasPlatform().GetCoreType();
MLAS_SYMM_QGEMM_OPERATION* operation =
uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;

Expand Down Expand Up @@ -260,7 +260,7 @@ MlasSymmQgemmBatch(
ThreadsPerGemm = ThreadCountM * ThreadCountN;

MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * BatchN, [&](ptrdiff_t tid) {
auto uarch = MlasGetCoreUArch();
auto uarch = GetMlasPlatform().GetCoreType();
MLAS_SYMM_QGEMM_OPERATION* operation =
uarch == mlas_core_little ? dispatch->LitOperation : dispatch->BigOperation;

Expand Down