-
Notifications
You must be signed in to change notification settings - Fork 3
/
PcieBandwidthBenchmarker.h
112 lines (95 loc) · 2.95 KB
/
PcieBandwidthBenchmarker.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/*
* PcieBandwidthBenchmarker.h
*
* Created on: Feb 8, 2021
* Author: tugrul
*/
#ifndef PCIEBANDWIDTHBENCHMARKER_H_
#define PCIEBANDWIDTHBENCHMARKER_H_
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include<chrono>
#include<cmath>
#include<thread>
#include<vector>
// Runs f(0), f(1), ..., f(N-1), each call on its own std::thread, and returns
// only after every started thread has been joined.
//
// N: number of threads / indices; N <= 0 starts nothing and returns immediately.
// f: callable invocable as f(int). It is copied once per thread, so state that
//    must be shared between the calls has to be captured by reference/pointer.
//
// If starting a thread fails (std::system_error from std::thread, or an
// exception thrown while copying f), the threads that were already started are
// joined and the exception is rethrown to the caller.
template <class F>
void parallelFor(const int N, const F f) {
    std::vector<std::thread> workers;
    if (N > 0) {
        // one allocation up front, so no std::thread is moved during growth
        workers.reserve(static_cast<std::size_t>(N));
    }

    try {
        for (int i = 0; i < N; i++) {
            workers.emplace_back(f, i);
        }
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, so the
        // threads that did start must be joined before the vector unwinds.
        for (std::thread& worker : workers) {
            if (worker.joinable()) {
                worker.join();
            }
        }
        throw;
    }

    for (std::thread& worker : workers) {
        worker.join();
    }
}
// Benchmarks how fast data can be copied through each graphics card returned by
// GraphicsCardSupplyDepot::requestGpus() and turns the measurements into
// per-card integer multipliers (see bestBandwidth).
//
// All measuring happens in the constructor; bestBandwidth() only does arithmetic
// on the stored results.
class PcieBandwidthBenchmarker
{
private:
    // Element type of the benchmark arrays: a 128 KiB block of bytes.
    struct Obj
    {
        char test[1024*128];
    };

    // One measurement per card, in the order of the requested gpu list.
    // Unit is 1/milliseconds of a full copy pass, so only ratios are meaningful.
    std::vector<float> bandwidthOptimizedMultipliers;

    // Smallest value in bandwidthOptimizedMultipliers (the slowest card).
    float minBw;
public:

    // benchmarks system to find pcie performances
    // megabytesPerCardAllowedForBenchmarking: each card is given this amount of data (in MB unit) during benchmark
    PcieBandwidthBenchmarker(int megabytesPerCardAllowedForBenchmarking=128)
    {
        GraphicsCardSupplyDepot depot;

        // 4 elements of 128 KiB = half a MB per requested MB for each test array;
        // the 2 arrays below together add up to the requested amount.
        const size_t n = megabytesPerCardAllowedForBenchmarking*4;
        const size_t pageSize=1;
        const int maxActivePagesPerGpu = 1;

        auto gpus = depot.requestGpus();

        // Times one element-by-element copy data2 -> data1 where only device
        // `selectDevice` gets a non-zero entry in the list handed to
        // VirtualMultiArray (presumably its per-device channel multiplier -
        // confirm against VirtualMultiArray). Returns 1 / elapsed milliseconds.
        auto findBandwidth = [&](int selectDevice, std::vector<ClDevice> & devList)
        {
            std::vector<int> bwList;
            bwList.reserve(devList.size());
            for(size_t i=0;i<devList.size();i++)
            {
                bwList.push_back(static_cast<int>(i)==selectDevice?4:0);
            }

            VirtualMultiArray<Obj> data1(n,devList,pageSize,maxActivePagesPerGpu,bwList);
            VirtualMultiArray<Obj> data2(n,devList,pageSize,maxActivePagesPerGpu,bwList);

            // hardware_concurrency() is allowed to return 0 when the value is
            // unknown; always use at least one worker so the copy really runs.
            const unsigned int hwThreads = std::thread::hardware_concurrency();
            const int numThr = (hwThreads==0) ? 1 : static_cast<int>(hwThreads);
            const size_t stride = static_cast<size_t>(numThr);

            // steady_clock is monotonic, so the interval cannot become negative
            // when the wall clock is adjusted during the measurement.
            const auto start = std::chrono::steady_clock::now();
            parallelFor(numThr,[&](int idx){
                // worker idx handles elements idx, idx+numThr, idx+2*numThr, ...
                for(size_t j=static_cast<size_t>(idx);j<n;j+=stride)
                {
                    data1[j]=data2[j];
                }
            });
            const auto stop = std::chrono::steady_clock::now();

            // fractional milliseconds: a run shorter than 1 ms must not become 0
            const double elapsedMs = std::chrono::duration<double,std::milli>(stop-start).count();

            // guard against division by zero on a coarse clock
            const double minMeasurableMs = 1.0e-6;
            const double safeElapsedMs = (elapsedMs>minMeasurableMs) ? elapsedMs : minMeasurableMs;
            return 1.0/safeElapsedMs;
        };

        // heating cpu/pcie (results of this pass are discarded)
        for(size_t i=0;i<gpus.size();i++)
        {
            findBandwidth(static_cast<int>(i),gpus);
        }

        // benchmark
        std::vector<float> bwMeasured;
        bwMeasured.reserve(gpus.size());
        minBw=1000000000.0f;
        for(size_t i=0;i<gpus.size();i++)
        {
            const float bw = static_cast<float>(findBandwidth(static_cast<int>(i),gpus));
            bwMeasured.push_back(bw);
            if(minBw>bw)
                minBw=bw;
        }

        bandwidthOptimizedMultipliers = bwMeasured;
    }

    // gets multipliers for each physical card that maximizes bandwidth
    // minimumMultiplierNeeded: slowest-accessed card is given this amount of channels (virtual cards)
    // other cards receive higher multipliers depending on their relative communication performance
    // returns one value per benchmarked card: floor(measured/slowest * minimumMultiplierNeeded);
    // the result is empty when no card was benchmarked
    std::vector<int> bestBandwidth(int minimumMultiplierNeeded){
        std::vector<int> result;
        result.reserve(bandwidthOptimizedMultipliers.size());
        for(const float bw : bandwidthOptimizedMultipliers)
        {
            // Divide directly instead of multiplying by a rounded reciprocal:
            // for the slowest card the ratio is then exactly 1, so floor()
            // cannot drop its multiplier below minimumMultiplierNeeded.
            const double relative = static_cast<double>(bw) / static_cast<double>(minBw);
            result.push_back(static_cast<int>(std::floor(relative*minimumMultiplierNeeded)));
        }
        return result;
    }
};
#endif /* PCIEBANDWIDTHBENCHMARKER_H_ */