
[BesTLA] New thread pool and hybrid dispatcher #118

Merged: 45 commits, Mar 8, 2024
Commits (changes shown are from 39 of the 45 commits)
851af6a  update (yuchengliu1, Jan 10, 2024)
c2390c5  update (yuchengliu1, Jan 10, 2024)
ba4f107  update for spinlock (yuchengliu1, Jan 15, 2024)
8abef2f  update base thread pool (yuchengliu1, Feb 4, 2024)
5cdd61d  bond core (yuchengliu1, Feb 4, 2024)
b572c67  using singleton instead of extern (yuchengliu1, Feb 20, 2024)
1201235  add sync (yuchengliu1, Feb 21, 2024)
79b7cbc  integrate to core (yuchengliu1, Feb 21, 2024)
86feb68  fix bugs (zhewang1-intc, Feb 23, 2024)
2ed0175  add thread config for hybrid cpu (luoyu-intel, Feb 27, 2024)
1bade92  split benchmark and UT (luoyu-intel, Feb 28, 2024)
b35b558  link pthread for Linux (luoyu-intel, Feb 28, 2024)
1e6e764  add SchedulerDispatcher (zhewang1-intc, Feb 28, 2024)
50cf1ae  opt benchmark code (luoyu-intel, Feb 28, 2024)
e0bc203  complete compbf16 (luoyu-intel, Feb 28, 2024)
e2dc613  add core order for client (yuchengliu1, Feb 28, 2024)
ef1a7e1  remove benchmark from UT (luoyu-intel, Feb 29, 2024)
868af0b  remove dbg msg (luoyu-intel, Feb 29, 2024)
60c3925  optimize log print for each benchmark case (luoyu-intel, Feb 29, 2024)
ee7bf2d  integrade to ns (zhewang1-intc, Feb 29, 2024)
be86b94  modify cmake options for OMP (luoyu-intel, Feb 29, 2024)
aca2a23  fix memory leak (zhewang1-intc, Feb 29, 2024)
432cb89  UT fix (yuchengliu1, Feb 29, 2024)
4427d73  fix bugs (zhewang1-intc, Feb 29, 2024)
0e12ac9  fix bugs (yuchengliu1, Feb 29, 2024)
942c608  update for MHA (yuchengliu1, Feb 29, 2024)
8a07eac  change cmake variable name (luoyu-intel, Mar 1, 2024)
2564c8f  update PE value (yuchengliu1, Mar 4, 2024)
5a1750d  update for MTL (yuchengliu1, Mar 4, 2024)
f1e96be  support dynamic PE (yuchengliu1, Mar 5, 2024)
0746b1e  dynamic PE by ISA (yuchengliu1, Mar 5, 2024)
d0e5e30  use omp from bestla (luoyu-intel, Mar 6, 2024)
1722f1c  remove threading code in ne_layers (luoyu-intel, Mar 6, 2024)
6db2a36  remove thread join (luoyu-intel, Mar 6, 2024)
fee0748  add std thread pool in ne_layers (luoyu-intel, Mar 6, 2024)
1c126d6  use OMP as default thread pool (luoyu-intel, Mar 6, 2024)
3cb5f02  use one threadpool in neuralspeed. use BTLA options directly. add som… (luoyu-intel, Mar 6, 2024)
6bf8c76  clang-format (luoyu-intel, Mar 6, 2024)
d32a5f2  rerun with clang-format 14.0.0 (luoyu-intel, Mar 6, 2024)
c6aa107  update clang-tidy (luoyu-intel, Mar 6, 2024)
8055d6f  disable OMP for clangtidy (luoyu-intel, Mar 6, 2024)
49dedc1  remove dbg code (luoyu-intel, Mar 6, 2024)
c0af2c9  remove pointer (yuchengliu1, Mar 6, 2024)
35ae39c  fix bug of adjust PE ratio. add priority for Windows process. (luoyu-intel, Mar 7, 2024)
289d655  remove dbg code (luoyu-intel, Mar 7, 2024)
CMakeLists.txt (6 additions & 5 deletions)
@@ -60,9 +60,9 @@ option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI"
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
option(NS_USE_OMP "neural_speed: use OpenMP thread pool." ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)

@@ -135,12 +135,13 @@ if (NS_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()

if (NS_BTLA_UT)
set(BTLA_UT_ALL ON)
if(NS_USE_OMP)
include(FindOpenMP)
# compile BesTLA's OMPTheading class, then it can be used in ne_layers
set(BTLA_ENABLE_OPENMP ON CACHE BOOL "BesTLA enable compiling OpenMP threading")
add_compile_definitions(NS_USE_OMP)
endif()
include(FindOpenMP)

set(BTLA_USE_OPENMP ON CACHE BOOL "BesTLA use OpenMP")
add_subdirectory(bestla)

add_subdirectory(neural_speed)
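
Note: the new NS_USE_OMP option (ON by default) pulls in FindOpenMP, turns on BTLA_ENABLE_OPENMP so BesTLA compiles its OpenMP threading class, and adds an NS_USE_OMP compile definition that downstream code can branch on. A minimal sketch of that pattern, not the PR's actual selection code; the OMPThreading/StdThreading class names are assumptions based on the "OMPTheading class" comment above:

// Sketch only: assumed class names from bestla_parallel.h, not code from this diff.
#include "bestla_parallel.h"
#ifdef NS_USE_OMP
using ns_threading_t = bestla::parallel::OMPThreading;  // OpenMP-backed thread pool
#else
using ns_threading_t = bestla::parallel::StdThreading;  // std::thread-backed pool
#endif
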
CMakePresets.json (33 additions & 5 deletions)
@@ -49,23 +49,51 @@
"value": "x64",
"strategy": "external"
},
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NS_PROFILING": "ON",
"NS_USE_OMP": "ON",
"BTLA_UT_DEBUG": "ON"
}
},
{
"name": "x64-release",
"displayName": "x64 Release",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"BTLA_UT_DEBUG": "OFF"
}
},
{
"name": "x64-release-thread",
"displayName": "x64 Release without OpenMP",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-release",
"cacheVariables": {
"NS_USE_OMP": "OFF"
}
},
{
"name": "x64-bestla-UT",
"displayName": "x64 BesTLA unit test",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-debug",
"inherits": "x64-release",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"NS_BTLA_UT": "ON"
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"BTLA_UT_ALL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"BTLA_UT_OPENMP": "ON"
}
},
{
"name": "x64-ut-thread",
"displayName": "x64 BesTLA UT without OpenMP",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-bestla-UT",
"cacheVariables": {
"BTLA_UT_OPENMP": "OFF"
}
}
]
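
Note: each preset is selectable at configure time with "cmake --preset <name>". For example, "cmake --preset x64-release-thread" configures a release build with NS_USE_OMP=OFF (BesTLA's own std::thread pool), while "cmake --preset x64-bestla-UT" configures a RelWithDebInfo build with the BesTLA unit tests, benchmark, and OpenMP enabled.
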
bestla/CMakeLists.txt (24 additions & 8 deletions)
@@ -4,7 +4,7 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_USE_OPENMP "Enable OpenMP thread pool" OFF)
option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
@@ -19,7 +19,7 @@ option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
option(BTLA_UT_NOASAN "Disable sanitize" OFF)
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(BTLA_UT_OPENMP "Use OpenMP" ON)
option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)

add_library(${PROJECT_NAME} INTERFACE)
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
@@ -30,10 +30,10 @@ target_include_directories(
)


if(BTLA_USE_OPENMP)
message(STATUS "BesTLA using OpenMP")
if(BTLA_ENABLE_OPENMP)
message(STATUS "BesTLA enable OpenMP ThreadPool")
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
endif(BTLA_USE_OPENMP)
endif(BTLA_ENABLE_OPENMP)

if(WIN32)
target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
@@ -64,12 +64,15 @@ endif()

function(add_ut_flag UT_OPTION)
if(${${UT_OPTION}})
target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
# target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
add_compile_definitions(${UT_OPTION})
message(${UT_OPTION})
endif()
endfunction()

if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
@@ -96,8 +99,21 @@ if(UT_BUILD)
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
add_ut_flag(BTLA_UT_KERNEL_JIT)
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
add_ut_flag(BTLA_UT_BENCHMARK)

target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif(UT_BUILD)

if(BTLA_UT_BENCHMARK)
file(GLOB srcs ${PROJECT_NAME}/ut/bestla_benchmark.cpp) #compile everything even run parts of UTs
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_benchmark ${srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX)
endif()
if(NOT WIN32)
target_link_options(${PROJECT_NAME}_benchmark PRIVATE -lpthread)
endif()
target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
endif(BTLA_UT_BENCHMARK)
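
Note: the benchmark is now a separate bestla_benchmark target gated by BTLA_UT_BENCHMARK, so bestla_ut no longer carries the long-running benchmark cases. BTLA_UT_OPENMP also defaults to OFF; when it is ON, the benchmark links OpenMP::OpenMP_CXX, and pthread is linked explicitly on non-Windows builds.
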
bestla/bestla/bestla.h (1 addition & 0 deletions)
@@ -31,6 +31,7 @@ enum class BTLA_ISA : uint8_t {
AMX_INT8,
AVX512_FP16,
AVX512_BF16,
ISA_COUNT,
};
enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
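
Note: ISA_COUNT is a sentinel rather than a real ISA; it lets per-ISA tables be sized straight from the enum, which is how the PE table in bestla_device.h below uses it. A minimal illustration (values are made up):

// Sketch: one entry per ISA, sized by the new sentinel.
float pe[int(BTLA_ISA::ISA_COUNT)];
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) pe[i] = 1.0f;  // default: no P/E skew
pe[int(BTLA_ISA::AVX2)] = 2.2f;  // per-ISA override, illustrative value only
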
bestla/bestla/bestla_device.h (81 additions & 12 deletions)
@@ -215,6 +215,7 @@ class CpuDevice {
public:
inline int getThreads() { return numthreads; }
inline int getCores() { return numcores; }
inline uint32_t getL3CacheSize() { return L3Cache; }
inline uint32_t getL2CacheSize() { return L2Cache; }
inline uint32_t getL1CacheSize() { return L1Cache; }
inline uint32_t getL2CacheSize_E() { return E_L2Cache; }
@@ -228,7 +229,7 @@ class CpuDevice {
inline bool AMX_BF16() { return mHasAMX_BF16; }
inline bool AVX512_BF16() { return mHasAVX512_BF16; }
inline bool AVX512_FP16() { return mHasAVX512_FP16; }
inline float getPE() { return (P_core.size() * P_power) / (E_core.size() * E_power); }
inline float* const getPE() { return PE; }
inline size_t getPcoreNum() { return P_core.size(); }
inline size_t getEcoreNum() { return E_core.size(); }
inline size_t getSMTcoreNum() { return SMT_core.size(); }
@@ -328,12 +329,40 @@
}
}
numcores = P_core.size() + E_core.size();
numthreads = P_core.size() * 2 + E_core.size();
numthreads = P_core.size() + E_core.size() + SMT_core.size();

{
// set PE
uint32_t tmp[4];
_cpu.getCpuid(1, tmp);
if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]);
const int famliy = (tmp[0] >> 8) & ((1u << 4) - 1); // cpu.extractBit(a[0], 8, 11);
const int extendedModel = (tmp[0] >> 16) & ((1u << 4) - 1); // cpu.extractBit(a[0], 16, 24);
{
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f;
// CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
if (famliy == 6) switch (extendedModel) {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
break;
}
}
}
} else {
L1Cache = _cpu.getDataCacheSize(0);
L2Cache = _cpu.getDataCacheSize(1);
numthreads = numcores;
}
L3Cache = _cpu.getDataCacheSize(2);
#if FIXED_CACHE
L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
@@ -357,7 +386,7 @@ class CpuDevice {
Xbyak::util::Cpu cpu;
uint32_t tmp[4];
cpu.getCpuid(0x1A, tmp);
int core_type = (tmp[0] >> 24) & ((1u << 7) - 1); // cpu.extractBit(a[0], 24, 31);
int core_type = (tmp[0] >> 24) & ((1u << 8) - 1); // cpu.extractBit(a[0], 24, 31);
switch (core_type) {
case 32:
// printf("Atom\n");
@@ -407,7 +436,7 @@
}
static void core_bond(int core) {
#ifdef _WIN32
SetThreadAffinityMask(GetCurrentThread(), 1 << core);
SetThreadAffinityMask(GetCurrentThread(), 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
@@ -420,7 +449,7 @@ class CpuDevice {
static void core_bond(std::thread& thread, int core) {
#ifdef _WIN32
HANDLE handle = thread.native_handle();
SetThreadAffinityMask(handle, 1 << core);
SetThreadAffinityMask(handle, 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
@@ -434,29 +463,69 @@ class CpuDevice {
bool isHybrid() { return mHybrid; }

protected:
uint32_t L2Cache, L1Cache;
uint32_t L2Cache, L1Cache, L3Cache;
bool mHybrid = false;
bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
mHasAVX512_FP16;
int numcores;
int numthreads;
std::vector<int> P_core, E_core, SMT_core;
uint32_t E_L2Cache, E_L1Cache;
float P_power = 4.8, E_power = 2.3;
float PE[int(BTLA_ISA::ISA_COUNT)];
};

#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();

class CpuBase {
class CpuRuntime {
public:
CpuBase() {
CpuRuntime() = default;
static CpuRuntime& getInstance(int thread) {
static std::map<int, CpuRuntime> instances;
if (instances.count(thread) == 0) instances[thread] = CpuRuntime(thread);
return instances[thread];
}

inline float getPE(const BTLA_ISA isa) {
// printf("GET:%d\t%f\n",int(isa), *cur_PE);
return PE[int(isa)] * P_core_num / E_core_num;
}

inline void adjustPE(const BTLA_ISA isa, const float PE_) {
// printf("Adjust:%d,%f\n",int(isa),PE_);
PE[int(isa)] *= PE_;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
int P_core_num = 0, E_core_num = 0;
bool mHybrid = false;

private:
CpuRuntime(int thread) {
GetCPUDevice();
mL2Cache = _cd->getL2CacheSize();
mL1Cache = _cd->getL1CacheSize();
mNumThreads = _cd->getThreads();
maxThreads = _cd->getThreads();
mHybrid = false;
if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
mL1Cache_P = mL1Cache / 2;
mL2Cache_P = mL2Cache / 2;
P_core_num = _cd->getPcoreNum();
E_core_num = _cd->getEcoreNum();
} else {
mL1Cache_P = mL1Cache;
mL2Cache_P = mL2Cache;
P_core_num = _cd->getPcoreNum();
E_core_num = thread - P_core_num;
}
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
}
}
size_t mL2Cache, mL1Cache;
int mNumThreads;
float PE[int(BTLA_ISA::ISA_COUNT)];
int maxThreads;
};
} // namespace device
} // namespace bestla
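
Note: CpuRuntime caches one configuration per requested thread count and exposes getPE(isa), which scales the per-ISA P/E ratio by the number of P-cores and E-cores in use; adjustPE(isa, factor) lets callers refine that ratio from measured timings. A minimal sketch, not this PR's SchedulerDispatcher, of how a hybrid dispatcher might split work with it (split_rows is a hypothetical helper; BTLA_ISA is assumed visible from bestla.h):

// Sketch only: hypothetical helper, not code from this diff.
#include "bestla_device.h"

static void split_rows(int total_rows, int threads, BTLA_ISA isa, int* p_rows, int* e_rows) {
  auto& rt = bestla::device::CpuRuntime::getInstance(threads);
  if (!rt.mHybrid) {  // homogeneous CPU, or the thread count fits on P-cores: no split
    *p_rows = total_rows;
    *e_rows = 0;
    return;
  }
  // getPE(isa) approximates aggregate P-core throughput / aggregate E-core throughput.
  float pe = rt.getPE(isa);
  *p_rows = int(total_rows * pe / (pe + 1.0f));
  *e_rows = total_rows - *p_rows;
}
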