Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLC Core count calculations updated #19921

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions onnxruntime/core/platform/windows/env.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ limitations under the License.
#include "core/common/span_utils.h"
#include "core/platform/env.h"
#include "core/platform/scoped_resource.h"
#if defined(_M_X64) && !defined(_M_ARM64EC) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
#if defined(_M_X64) && !defined(_M_ARM64EC)
#include "core/platform/windows/hardware_core_enumerator.h"
#endif
#include <unsupported/Eigen/CXX11/ThreadPool>
Expand Down Expand Up @@ -252,7 +252,7 @@ void WindowsEnv::SleepForMicroseconds(int64_t micros) const {
}

// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option.
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID)
static constexpr std::array<int, 3> kVendorID_Intel = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
#endif
int WindowsEnv::DefaultNumCores() {
Expand All @@ -261,7 +261,7 @@ int WindowsEnv::DefaultNumCores() {

int WindowsEnv::GetNumPhysicalCpuCores() const {
// EIGEN_NO_CPUID is not defined in any C/C++ source code. It is a compile option.
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID) && defined(ONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH)
#if defined(_M_X64) && !defined(_M_ARM64EC) && !defined(EIGEN_NO_CPUID)
// The following code is a temporary fix for a perf problem on Intel's Meteor Lake CPUs. The Intel compute platform has
// a hybrid architecture that some CPU cores runs significant slower than the others. If we distribute our compute work
// evenly to all CPU cores, the slowest CPU core will drag the performance down. So, instead, we reduce the total number
Expand Down
13 changes: 7 additions & 6 deletions onnxruntime/core/platform/windows/hardware_core_enumerator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ struct LogicalProcessorInformation {

struct CoreCounter {
uint32_t PhysicalCores = 0;
uint32_t SocDieCores = 0;
uint32_t LLCCores = 0;
};

static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
Expand All @@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) {
return c;
}

static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
static CoreCounter GetCoreInfo() {
auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);

CoreCounter cores;
Expand Down Expand Up @@ -73,17 +73,18 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {

read += currentProcessorInfo->Size;
}
// Cores with L2 and LLC cache levels = # Physical Cores - # logical cores without LLC
cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);

cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
return cores;
}

uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
auto cores = GetNumberOPhysicalAndEngineeringCores();
// We want to use the number of physical cores, but exclude soc cores
return cores.PhysicalCores - cores.SocDieCores;
auto cores = GetCoreInfo();

return cores.LLCCores;
}

} // namespace onnxruntime
3 changes: 1 addition & 2 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1533,8 +1533,7 @@ def generate_build_tree(
ldflags = ["/profile", "/DYNAMICBASE"]
# Address Sanitizer libs do not have a Qspectre version. So they two cannot be both enabled.
if not args.enable_address_sanitizer:
# Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs
cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This flag is default ON. It is enabled in every ONNX Runtime's official package. It was added because if in any case the code path doesn't work, at least we have a fallback. I can tell the customer to build the code without this flag.
Is there any benefit if we remove it?

cflags += ["/Qspectre"]
if config == "Release":
cflags += ["/O2", "/Ob2", "/DNDEBUG"]
elif config == "RelWithDebInfo":
Expand Down
15 changes: 8 additions & 7 deletions winml/lib/Api/HardwareCoreEnumerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ struct LogicalProcessorInformation {

smk2007 marked this conversation as resolved.
Show resolved Hide resolved
struct CoreCounter {
uint32_t PhysicalCores = 0;
uint32_t Num2CacheCores = 0;
uint32_t LLCCores = 0;
};

static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
Expand Down Expand Up @@ -42,7 +42,7 @@ uint32_t CountSetBits(DWORD input) {
return c;
}

static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
static CoreCounter GetCoreInfo() {
auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);

CoreCounter cores;
Expand All @@ -64,6 +64,7 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
cores.PhysicalCores++;
break;
case RelationCache:
// Cache level masks count Logicial processors
if (currentProcessorInfo->Cache.Level == 2) {
dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
} else if (currentProcessorInfo->Cache.Level == 3) {
Expand All @@ -75,14 +76,15 @@ static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
read += currentProcessorInfo->Size;
}

cores.Num2CacheCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
cores.LLCCores = cores.PhysicalCores - CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);

return cores;
}

uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
auto cores = GetNumberOPhysicalAndEngineeringCores();
auto cores = GetCoreInfo();

#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__)
const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI"
Expand All @@ -97,9 +99,8 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
auto isHybrid = (regs_leaf7[3] & (1 << 15));

if (isIntel && isHybrid) {
// We want to use the number of physical cores, but exclude soc cores
// On Intel Hybrid processors, numSocCores == cores.Num2CacheCores
return cores.PhysicalCores - cores.Num2CacheCores;
// We want to use the number of physical cores, but exclude cores without an LLC
return cores.LLCCores;
}
#endif

Expand Down