From 0e05f82b65b06675522b920eac0cfc5f5001850d Mon Sep 17 00:00:00 2001 From: Sai Kishan Pampana Date: Tue, 12 Mar 2024 16:32:07 -0700 Subject: [PATCH 1/5] Update LLC Core Count calculation --- onnxruntime/core/platform/windows/env.cc | 6 +++--- .../platform/windows/hardware_core_enumerator.cc | 14 ++++++++------ winml/lib/Api/HardwareCoreEnumerator.cpp | 15 ++++++++------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 983cc6089bb4c..a56875518270e 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#if defined(_M_X64) && !defined(_M_ARM64EC) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) #include "core/platform/windows/hardware_core_enumerator.h" #endif #include @@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const { } // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) static constexpr std::array kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" #endif int WindowsEnv::DefaultNumCores() { @@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. 
-#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work // evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc index 121c59808ae59..644bd93faaab4 100644 --- a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc @@ -15,7 +15,8 @@ struct LogicalProcessorInformation { struct CoreCounter { uint32_t PhysicalCores = 0; - uint32_t SocDieCores = 0; + uint32_t LLCCores = 0; + }; static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { @@ -42,7 +43,7 @@ uint32_t CountSetBits(DWORD input) { return c; } -static CoreCounter GetNumberOPhysicalAndEngineeringCores() { +static CoreCounter GetCoreInfo() { auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll); CoreCounter cores; @@ -73,17 +74,18 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { read += currentProcessorInfo->Size; } + //Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC + cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); - cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); return cores; } uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of physical cores = # of P cores + 
# of E Cores + # of Soc Cores. // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. - auto cores = GetNumberOPhysicalAndEngineeringCores(); - // We want to use the number of physical cores, but exclude soc cores - return cores.PhysicalCores - cores.SocDieCores; + auto cores = GetCoreInfo(); + + return cores.LLCCores; } } // namespace onnxruntime diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index d04e276347170..040ecbaf5bae6 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -14,7 +14,7 @@ struct LogicalProcessorInformation { struct CoreCounter { uint32_t PhysicalCores = 0; - uint32_t Num2CacheCores = 0; + uint32_t LLCCores = 0; }; static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { @@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) { return c; } -static CoreCounter GetNumberOPhysicalAndEngineeringCores() { +static CoreCounter GetCoreInfo() { auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll); CoreCounter cores; @@ -64,6 +64,7 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { cores.PhysicalCores++; break; case RelationCache: + //Cache level masks count Logicial processors if (currentProcessorInfo->Cache.Level == 2) { dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask; } else if (currentProcessorInfo->Cache.Level == 3) { @@ -75,14 +76,15 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { read += currentProcessorInfo->Size; } - cores.Num2CacheCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); + cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); + return cores; } uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of physical cores = # of P cores + # of E Cores + # of Soc Cores. 
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. - auto cores = GetNumberOPhysicalAndEngineeringCores(); + auto cores = GetCoreInfo(); #if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" @@ -97,9 +99,8 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { auto isHybrid = (regs_leaf7[3] & (1 << 15)); if (isIntel && isHybrid) { - // We want to use the number of physical cores, but exclude soc cores - // On Intel Hybrid processors, numSocCores == cores.Num2CacheCores - return cores.PhysicalCores - cores.Num2CacheCores; + // We want to use the number of physical cores, but exclude cores without an LLC + return cores.LLCCores; } #endif From bb7271f61de5a7172e9814dfcda23a11c7f0bba8 Mon Sep 17 00:00:00 2001 From: Sai Kishan Pampana Date: Thu, 14 Mar 2024 12:15:14 -0700 Subject: [PATCH 2/5] Removing MTL specific flags --- onnxruntime/core/platform/windows/env.cc | 6 +++--- tools/ci_build/build.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index a56875518270e..c4a7e2d1f8f39 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) #include "core/platform/windows/hardware_core_enumerator.h" #endif #include @@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const { } // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. 
-#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) static constexpr std::array kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" #endif int WindowsEnv::DefaultNumCores() { @@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && !defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work // evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 1056c4ed84510..bcc90a6ce7235 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1533,8 +1533,7 @@ def generate_build_tree( ldflags = ["/profile", "/DYNAMICBASE"] # Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled. 
if not args.enable_address_sanitizer: - # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs - cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"] + cflags += ["/Qspectre"] if config == "Release": cflags += ["/O2", "/Ob2", "/DNDEBUG"] elif config == "RelWithDebInfo": From c5084492002a9cd1957ed7c73ab99ef0c0e4e716 Mon Sep 17 00:00:00 2001 From: spampana95 Date: Thu, 4 Apr 2024 13:28:44 -0700 Subject: [PATCH 3/5] Update env.cc to remove lintrunner issues --- onnxruntime/core/platform/windows/env.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index c4a7e2d1f8f39..0b2db3f33f4b9 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#if defined(_M_X64) && !defined(_M_ARM64EC) +#if defined(_M_X64) && !defined(_M_ARM64EC) #include "core/platform/windows/hardware_core_enumerator.h" #endif #include @@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const { } // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) static constexpr std::array kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" #endif int WindowsEnv::DefaultNumCores() { @@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. 
-#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work // evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number From 085d4a83f1182d70f0ace7cbeedb752963401d06 Mon Sep 17 00:00:00 2001 From: spampana95 Date: Thu, 4 Apr 2024 13:35:17 -0700 Subject: [PATCH 4/5] Update hardware_core_enumerator.cc to remove lintrunner issues --- onnxruntime/core/platform/windows/hardware_core_enumerator.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc index 644bd93faaab4..bf3b53afbd7d3 100644 --- a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc @@ -16,7 +16,6 @@ struct LogicalProcessorInformation { struct CoreCounter { uint32_t PhysicalCores = 0; uint32_t LLCCores = 0; - }; static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { @@ -74,7 +73,7 @@ static CoreCounter GetCoreInfo() { read += currentProcessorInfo->Size; } - //Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC + // Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); return cores; From e5e9b3b1e3607685a00f123f0b965c4a1426bc00 Mon Sep 17 00:00:00 2001 From: spampana95 Date: Thu, 4 Apr 2024 13:37:18 -0700 Subject: [PATCH 5/5] Update HardwareCoreEnumerator.cpp to remove lintrunner issues --- 
winml/lib/Api/HardwareCoreEnumerator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index 040ecbaf5bae6..267245063520a 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -64,7 +64,7 @@ static CoreCounter GetCoreInfo() { cores.PhysicalCores++; break; case RelationCache: - //Cache level masks count Logicial processors + // Cache level masks count logical processors if (currentProcessorInfo->Cache.Level == 2) { dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask; } else if (currentProcessorInfo->Cache.Level == 3) {