diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 983cc6089bb4c..0b2db3f33f4b9 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#if defined(_M_X64) && !defined(_M_ARM64EC) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) #include "core/platform/windows/hardware_core_enumerator.h" #endif #include @@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const { } // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) static constexpr std::array kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" #endif int WindowsEnv::DefaultNumCores() { @@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work // evenly to all CPU cores, the slowest CPU core will drag the performance down. 
So, instead, we reduce the total number diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc index 121c59808ae59..bf3b53afbd7d3 100644 --- a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc @@ -15,7 +15,7 @@ struct LogicalProcessorInformation { struct CoreCounter { uint32_t PhysicalCores = 0; - uint32_t SocDieCores = 0; + uint32_t LLCCores = 0; }; static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { @@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) { return c; } -static CoreCounter GetNumberOPhysicalAndEngineeringCores() { +static CoreCounter GetCoreInfo() { auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll); CoreCounter cores; @@ -73,17 +73,18 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { read += currentProcessorInfo->Size; } + // Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC + cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); - cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); return cores; } uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of physical cores = # of P cores + # of E Cores + # of Soc Cores. // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. 
- auto cores = GetNumberOPhysicalAndEngineeringCores(); - // We want to use the number of physical cores, but exclude soc cores - return cores.PhysicalCores - cores.SocDieCores; + auto cores = GetCoreInfo(); + + return cores.LLCCores; } } // namespace onnxruntime diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 1056c4ed84510..bcc90a6ce7235 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1533,8 +1533,7 @@ def generate_build_tree( ldflags = ["/profile", "/DYNAMICBASE"] # Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled. if not args.enable_address_sanitizer: - # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs - cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"] + cflags += ["/Qspectre"] if config == "Release": cflags += ["/O2", "/Ob2", "/DNDEBUG"] elif config == "RelWithDebInfo": diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index d04e276347170..267245063520a 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -14,7 +14,7 @@ struct LogicalProcessorInformation { struct CoreCounter { uint32_t PhysicalCores = 0; - uint32_t Num2CacheCores = 0; + uint32_t LLCCores = 0; }; static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { @@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) { return c; } -static CoreCounter GetNumberOPhysicalAndEngineeringCores() { +static CoreCounter GetCoreInfo() { auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll); CoreCounter cores; @@ -64,6 +64,7 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { cores.PhysicalCores++; break; case RelationCache: + // Cache level masks count logical processors if (currentProcessorInfo->Cache.Level == 2) { dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask; }
else if (currentProcessorInfo->Cache.Level == 3) { @@ -75,14 +76,15 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() { read += currentProcessorInfo->Size; } - cores.Num2CacheCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); + cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); + return cores; } uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of physical cores = # of P cores + # of E Cores + # of Soc Cores. // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. - auto cores = GetNumberOPhysicalAndEngineeringCores(); + auto cores = GetCoreInfo(); #if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" @@ -97,9 +99,8 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { auto isHybrid = (regs_leaf7[3] & (1 << 15)); if (isIntel && isHybrid) { - // We want to use the number of physical cores, but exclude soc cores - // On Intel Hybrid processors, numSocCores == cores.Num2CacheCores - return cores.PhysicalCores - cores.Num2CacheCores; + // We want to use the number of physical cores, but exclude cores without an LLC + return cores.LLCCores; } #endif