From f5293d253c2ccc7a96c9ec791b91e0035359f45f Mon Sep 17 00:00:00 2001 From: A-Satti <104173224+A-Satti@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:56:50 -0800 Subject: [PATCH] Update Intel Thread Counts (#22894) ### Description The default thread count methodology used by onnxruntime did not account for new and upcoming Intel microarchitectures, leading to a suboptimal thread count. Optimizing the thread count for new Intel microarchitectures yields gains on the majority of models across datatypes, with speedups of up to ~1.5x. ### Motivation and Context Applications should run on Intel with the most performant thread configuration for the majority of models. With new microarchitectures, adjusting the thread count methodology is required to take advantage of their differences. --- .../windows/hardware_core_enumerator.cc | 35 ++++++++++++++++++- tools/ci_build/build.py | 4 --- winml/lib/Api/HardwareCoreEnumerator.cpp | 25 +++++++++---- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc index bf3b53afbd7d3..7464ab4c57d01 100644 --- a/onnxruntime/core/platform/windows/hardware_core_enumerator.cc +++ b/onnxruntime/core/platform/windows/hardware_core_enumerator.cc @@ -1,7 +1,8 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "hardware_core_enumerator.h" +#include "core/platform/windows/env.h" #include #include #include @@ -83,6 +84,38 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of physical cores = # of P cores + # of E Cores + # of Soc Cores. // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. 
auto cores = GetCoreInfo(); +#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) + const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" + bool isIntelSpecifiedPlatform = false; + const int kVendorID_IntelSpecifiedPlatformIDs[3] = { + // ExtendedModel, ExtendedFamily, Family Code, and Model Number + 0xa06a, // MTL + 0xc065, // ARL-H + 0xb065 // ARL-U + }; + + int regs_leaf0[4]; + int regs_leaf1[4]; + __cpuid(regs_leaf0, 0); + __cpuid(regs_leaf1, 0x1); + + auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) && (kVendorID_Intel[2] == regs_leaf0[3]); + + for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) { + if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) { + isIntelSpecifiedPlatform = true; + } + } + + if (isIntel) { + if (isIntelSpecifiedPlatform) { + // We want to exclude cores without an LLC + return cores.LLCCores; + } else { + return cores.PhysicalCores; + } + } +#endif return cores.LLCCores; } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index b50d11a0aff6d..fdc85e2cafa31 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1559,10 +1559,6 @@ def generate_build_tree( # The "/profile" flag implies "/DEBUG:FULL /DEBUGTYPE:cv,fixup /OPT:REF /OPT:NOICF /INCREMENTAL:NO /FIXED:NO". We set it for satisfying a Microsoft internal compliance requirement. External users # do not need to have it. ldflags = ["/profile", "/DYNAMICBASE"] - # Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled. 
- if not args.enable_address_sanitizer: - # Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs - cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"] if config == "Release": cflags += ["/O2", "/Ob2", "/DNDEBUG"] elif config == "RelWithDebInfo": diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index 1763290718a8f..f1272fc1b8626 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -1,8 +1,8 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. + +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "lib/Api/pch/pch.h" - #include "HardwareCoreEnumerator.h" namespace WINMLP { @@ -88,22 +88,33 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { #if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" + bool isIntelSpecifiedPlatform = false; + const int kVendorID_IntelSpecifiedPlatformIDs[3] = { + // ExtendedModel,ExtendedFamily,Family Code, and Model Number + 0xa06a, // MTL + 0xc065, // ARL-H + 0xb065 // ARL-U + }; + int regs_leaf0[4]; - int regs_leaf7[4]; + int regs_leaf1[4]; __cpuid(regs_leaf0, 0); - __cpuid(regs_leaf7, 0x7); + __cpuid(regs_leaf1, 0x1); auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) && (kVendorID_Intel[2] == regs_leaf0[3]); - auto isHybrid = (regs_leaf7[3] & (1 << 15)); + for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) { + if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) { + isIntelSpecifiedPlatform = true; + } + } - if (isIntel && isHybrid) { + if (isIntel && isIntelSpecifiedPlatform) { // We want to use the number of physical cores, but exclude cores without an LLC return cores.LLCCores; } #endif - return cores.PhysicalCores; }