From b238bb416c6c90dbfaea7f9263f308397e033559 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 1 Feb 2024 15:02:10 -0800 Subject: [PATCH 1/4] Perf improve for Intel SoC --- onnxruntime/core/platform/windows/env.cc | 43 ++++++++- .../windows/hardware_core_enumerator.cc | 89 +++++++++++++++++++ .../windows/hardware_core_enumerator.h | 12 +++ onnxruntime/core/util/thread_utils.cc | 19 ++-- tools/ci_build/build.py | 3 +- 5 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 onnxruntime/core/platform/windows/hardware_core_enumerator.cc create mode 100644 onnxruntime/core/platform/windows/hardware_core_enumerator.h diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 1a0713db43db8..dd6340078bbac 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,6 +32,9 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" +#if defined(_M_X64) && !defined(_M_ARM64EC) +#include "core/platform/windows/hardware_core_enumerator.h" +#endif #include #include @@ -248,12 +251,50 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const { Sleep(static_cast(micros) / 1000); } +// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +static constexpr std::array kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" +#endif int WindowsEnv::DefaultNumCores() { return std::max(1, static_cast(std::thread::hardware_concurrency() / 2)); } int WindowsEnv::GetNumPhysicalCpuCores() const { - return cores_.empty() ? DefaultNumCores() : static_cast(cores_.size()); +// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) + // The following code was added per a request from Intel. It is to reduce the total number of threads + // by one on Intel MTL CPUs. This is a special perf optimzation for a special kind of CPU. It is based on assumptions + // that: + // 1. All Intel CPUs should have 3 levels of cache. (However it is not true) + // 2. If a CPU core is only associated with two levels of cache, it should be a low performance CPU core and should + // not be used. + // The code below is very hacky, but we expect it should not cause any crash. The worst is it might return 1 that + // a thread pool will not be created, which is just a perf issue that does not impact usability. + // TODO: detect if CPUID instruction is available per instructions at https://wiki.osdev.org/CPUID#Checking_CPUID_availability + int regs[4]; + __cpuid(regs, 0); + bool bIsIntel = + (kVendorID_Intel[0] == regs[1]) && + (kVendorID_Intel[1] == regs[2]) && + (kVendorID_Intel[2] == regs[3]); + if (bIsIntel && regs[0] >= 7) { + // Query Structured Extended Feature Flags Enumeration Leaf + __cpuid(regs, 0x7); + // The bit 15 of EDX indicates if the processor is identified as a hybrid part. + bool ishybrid = regs[3] & (1 << 15); + if (ishybrid) { + // NOTE: even if ishybrid is true, it doesn't mean the processor must have P-cores/E-cores. + // On Intel CPUs we assume the HardwareCoreEnumerator::DefaultIntraOpNumThreads function would never fail. + // NOTE: due to resource restrictions, we cannot test this branch in our CI build pipelines. + return std::max(static_cast(1), HardwareCoreEnumerator::DefaultIntraOpNumThreads()); + } else { + return cores_.empty() ? DefaultNumCores() : static_cast(cores_.size()); + } + } else +#endif + { + return cores_.empty() ? DefaultNumCores() : static_cast(cores_.size()); + } } std::vector WindowsEnv::GetDefaultThreadAffinities() const { diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc new file mode 100644 index 0000000000000..121c59808ae59 --- /dev/null +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "hardware_core_enumerator.h" +#include +#include +#include + +namespace onnxruntime { + +struct LogicalProcessorInformation { + std::unique_ptr Buffer; + size_t Length; +}; + +struct CoreCounter { + uint32_t PhysicalCores = 0; + uint32_t SocDieCores = 0; +}; + +static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) { + DWORD length = 0; + DWORD rc = GetLogicalProcessorInformationEx(relationship, nullptr, &length); + + assert(rc == FALSE); + + auto processorInformationBytes = std::make_unique(length); + + rc = GetLogicalProcessorInformationEx( + relationship, reinterpret_cast(processorInformationBytes.get()), &length); + + assert(rc == TRUE); + + return {std::move(processorInformationBytes), length}; +} + +uint32_t CountSetBits(DWORD input) { + uint32_t c; + for (c = 0; input; c++) { + input &= input - 1; + } + return c; +} + +static CoreCounter GetNumberOPhysicalAndEngineeringCores() { + auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll); + + CoreCounter cores; + DWORD dwLevel2GroupMask = 0; + DWORD dwLevel3GroupMask = 0; + size_t read = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX currentProcessorInfo = NULL; + + while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length) { + currentProcessorInfo = + reinterpret_cast(logicalProcessorInformation.Buffer.get() + read); + if ((read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) { + break; + } + + switch (currentProcessorInfo->Relationship) { + case RelationProcessorCore: + cores.PhysicalCores++; + break; + case RelationCache: + if (currentProcessorInfo->Cache.Level == 2) { + dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask; + } else if (currentProcessorInfo->Cache.Level == 3) { + dwLevel3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask; + } + break; + } + + read += currentProcessorInfo->Size; + } + + cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask); + return cores; +} + +uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { + // # of physical cores = # of P cores + # of E Cores + # of Soc Cores. + // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. + auto cores = GetNumberOPhysicalAndEngineeringCores(); + // We want to use the number of physical cores, but exclude soc cores + return cores.PhysicalCores - cores.SocDieCores; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.h b/onnxruntime/core/platform/windows/hardware_core_enumerator.h new file mode 100644 index 0000000000000..93b50f452afcd --- /dev/null +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include + +namespace onnxruntime { +struct HardwareCoreEnumerator { + HardwareCoreEnumerator() = delete; + static uint32_t DefaultIntraOpNumThreads(); +}; +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/util/thread_utils.cc b/onnxruntime/core/util/thread_utils.cc index a5a165e150cf1..2a6c14ff1b058 100644 --- a/onnxruntime/core/util/thread_utils.cc +++ b/onnxruntime/core/util/thread_utils.cc @@ -93,22 +93,31 @@ static std::unique_ptr CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) { ThreadOptions to; if (options.thread_pool_size <= 0) { // default - auto default_affinities = Env::Default().GetDefaultThreadAffinities(); - if (default_affinities.size() <= 1) { - return nullptr; - } - options.thread_pool_size = static_cast(default_affinities.size()); if (options.auto_set_affinity) { #ifdef _WIN32 // Only set thread affinity on Server with auto affinity. // On client best to let OS scheduler handle. // On big (P-Core) / little (E-Core) CPU designs affinity overrides QoS and has high power usage if (IsWindowsServer()) { + auto default_affinities = Env::Default().GetDefaultThreadAffinities(); + if (default_affinities.size() <= 1) { + return nullptr; + } + options.thread_pool_size = static_cast(default_affinities.size()); to.affinities = std::move(default_affinities); + } else { + options.thread_pool_size = Env::Default().GetNumPhysicalCpuCores(); } #else + auto default_affinities = Env::Default().GetDefaultThreadAffinities(); + if (default_affinities.size() <= 1) { + return nullptr; + } + options.thread_pool_size = static_cast(default_affinities.size()); to.affinities = std::move(default_affinities); #endif + } else { + options.thread_pool_size = Env::Default().GetNumPhysicalCpuCores(); } } if (options.thread_pool_size <= 1) { diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 96567c8767a82..7817f8c2aa50b 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1526,7 +1526,8 @@ def generate_build_tree( ldflags = ["/profile", "/DYNAMICBASE"] # Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled. if not args.enable_address_sanitizer: - cflags += ["/Qspectre"] + # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs + cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"] if config == "Release": cflags += ["/O2", "/Ob2", "/DNDEBUG"] elif config == "RelWithDebInfo": From 8dbc16cc450ba52761cd181a4433ce4d16310b86 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 14 Feb 2024 13:52:51 -0800 Subject: [PATCH 2/4] update --- onnxruntime/core/platform/windows/env.cc | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index dd6340078bbac..9c91bcb89875f 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -261,15 +261,19 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) - // The following code was added per a request from Intel. It is to reduce the total number of threads - // by one on Intel MTL CPUs. This is a special perf optimzation for a special kind of CPU. It is based on assumptions - // that: - // 1. All Intel CPUs should have 3 levels of cache. (However it is not true) +//&&defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) + // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has + // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work + // evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number + // of threads to exclude the slowest cores out. + // The following code is based on assumptions that: + // 1. All Intel hybrid CPUs should have 3 levels of cache. // 2. If a CPU core is only associated with two levels of cache, it should be a low performance CPU core and should // not be used. - // The code below is very hacky, but we expect it should not cause any crash. The worst is it might return 1 that - // a thread pool will not be created, which is just a perf issue that does not impact usability. + // Since we don't know what the next Intel hybrid CPU would be like, later on we may need to rework the following code. + // However, no matter what the code should not cause any crash. The worst is it might return 1 that + // thread pools will not be created, which is just a perf issue and does not impact usability. // TODO: detect if CPUID instruction is available per instructions at https://wiki.osdev.org/CPUID#Checking_CPUID_availability int regs[4]; __cpuid(regs, 0); @@ -283,7 +287,7 @@ int WindowsEnv::GetNumPhysicalCpuCores() const { // The bit 15 of EDX indicates if the processor is identified as a hybrid part. bool ishybrid = regs[3] & (1 << 15); if (ishybrid) { - // NOTE: even if ishybrid is true, it doesn't mean the processor must have P-cores/E-cores. + // NOTE: even if ishybrid is true, it doesn't mean the processor must have P-cores and E-cores. // On Intel CPUs we assume the HardwareCoreEnumerator::DefaultIntraOpNumThreads function would never fail. // NOTE: due to resource restrictions, we cannot test this branch in our CI build pipelines. return std::max(static_cast(1), HardwareCoreEnumerator::DefaultIntraOpNumThreads()); From 7e6a945d0980cdaace48128196328ce4a398ef05 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 14 Feb 2024 14:01:45 -0800 Subject: [PATCH 3/4] update --- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 7817f8c2aa50b..244bebd81474d 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1526,7 +1526,7 @@ def generate_build_tree( ldflags = ["/profile", "/DYNAMICBASE"] # Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled. if not args.enable_address_sanitizer: - # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs + # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"] if config == "Release": cflags += ["/O2", "/Ob2", "/DNDEBUG"] From 86f82d2539e813b3e6a488f2ddd4d9294cd8e795 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 14 Feb 2024 14:12:26 -0800 Subject: [PATCH 4/4] update --- onnxruntime/core/platform/windows/env.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 9c91bcb89875f..0eb34cbfbc9eb 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -32,7 +32,7 @@ limitations under the License. #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" -#if defined(_M_X64) && !defined(_M_ARM64EC) +#if defined(_M_X64) && !defined(_M_ARM64EC) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) #include "core/platform/windows/hardware_core_enumerator.h" #endif #include @@ -261,8 +261,7 @@ int WindowsEnv::DefaultNumCores() { int WindowsEnv::GetNumPhysicalCpuCores() const { // EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option. -//&&defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) -#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) +#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH) // The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has // a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work // evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number