Skip to content

Commit

Permalink
Update Intel Thread Counts (#22894)
Browse files Browse the repository at this point in the history
### Description
The default thread count methodology in onnxruntime did not account for
new and upcoming Intel microarchitectures, leading to a suboptimal thread
count. Optimizing the thread count for these microarchitectures
yields gains on the majority of models across datatypes, with speedups
of up to ~1.5x.


### Motivation and Context
Applications should run on Intel with the most performant thread
configuration for the majority of models. With new microarchitectures,
adjusting the thread count methodology is required to take advantage of
their differences.
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
  • Loading branch information
A-Satti authored Dec 6, 2024
1 parent bd5a759 commit f5293d2
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 12 deletions.
35 changes: 34 additions & 1 deletion onnxruntime/core/platform/windows/hardware_core_enumerator.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "hardware_core_enumerator.h"
#include "core/platform/windows/env.h"
#include <memory>
#include <Windows.h>
#include <assert.h>
Expand Down Expand Up @@ -83,6 +84,38 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
auto cores = GetCoreInfo();
#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
bool isIntelSpecifiedPlatform = false;
const int kVendorID_IntelSpecifiedPlatformIDs[3] = {
// ExtendedModel, ExtendedFamily, Family Code, and Model Number
0xa06a, // MTL
0xc065, // ARL-H
0xb065 // ARL-U
};

int regs_leaf0[4];
int regs_leaf1[4];
__cpuid(regs_leaf0, 0);
__cpuid(regs_leaf1, 0x1);

auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) && (kVendorID_Intel[2] == regs_leaf0[3]);

for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) {
if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) {
isIntelSpecifiedPlatform = true;
}
}

if (isIntel) {
if (isIntelSpecifiedPlatform) {
// We want to exclude cores without an LLC
return cores.LLCCores;
} else {
return cores.PhysicalCores;
}
}
#endif

return cores.LLCCores;
}
Expand Down
4 changes: 0 additions & 4 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1559,10 +1559,6 @@ def generate_build_tree(
# The "/profile" flag implies "/DEBUG:FULL /DEBUGTYPE:cv,fixup /OPT:REF /OPT:NOICF /INCREMENTAL:NO /FIXED:NO". We set it for satisfying a Microsoft internal compliance requirement. External users
# do not need to have it.
ldflags = ["/profile", "/DYNAMICBASE"]
# Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled.
if not args.enable_address_sanitizer:
# Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs
cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"]
if config == "Release":
cflags += ["/O2", "/Ob2", "/DNDEBUG"]
elif config == "RelWithDebInfo":
Expand Down
25 changes: 18 additions & 7 deletions winml/lib/Api/HardwareCoreEnumerator.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "lib/Api/pch/pch.h"

#include "HardwareCoreEnumerator.h"

namespace WINMLP {
Expand Down Expand Up @@ -88,22 +88,33 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {

#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
bool isIntelSpecifiedPlatform = false;
const int kVendorID_IntelSpecifiedPlatformIDs[3] = {
// ExtendedModel,ExtendedFamily,Family Code, and Model Number
0xa06a, // MTL
0xc065, // ARL-H
0xb065 // ARL-U
};

int regs_leaf0[4];
int regs_leaf7[4];
int regs_leaf1[4];
__cpuid(regs_leaf0, 0);
__cpuid(regs_leaf7, 0x7);
__cpuid(regs_leaf1, 0x1);

auto isIntel = (kVendorID_Intel[0] == regs_leaf0[1]) && (kVendorID_Intel[1] == regs_leaf0[2]) &&
(kVendorID_Intel[2] == regs_leaf0[3]);

auto isHybrid = (regs_leaf7[3] & (1 << 15));
for (int intelSpecifiedPlatform : kVendorID_IntelSpecifiedPlatformIDs) {
if ((regs_leaf1[0] >> 4) == intelSpecifiedPlatform) {
isIntelSpecifiedPlatform = true;
}
}

if (isIntel && isHybrid) {
if (isIntel && isIntelSpecifiedPlatform) {
// We want to use the number of physical cores, but exclude cores without an LLC
return cores.LLCCores;
}
#endif

return cores.PhysicalCores;
}

Expand Down

0 comments on commit f5293d2

Please sign in to comment.