-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbenchmark.cpp
106 lines (88 loc) · 2.62 KB
/
benchmark.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <device_matrix.h>
// Makes every name in namespace std visible unqualified in this file.
using namespace std;
// Shorthand for the float instantiation of device_matrix; every matrix in this
// benchmark is declared through this alias.
typedef device_matrix<float> mat;
struct Timer {
Timer();
void tic();
float toc();
cudaEvent_t start, stop;
};
/**
 * Overwrites every element of a device matrix with a pseudo-random value.
 *
 * The values are produced on the host as rand() / (T) RAND_MAX, so the
 * sequence is controlled by whatever seed the caller passed to srand(). They
 * are then transferred to the matrix in one host-to-device copy.
 *
 * @tparam T  element type of the matrix. The division by (T) RAND_MAX only
 *            yields fractional values for floating-point T.
 * @param  m  matrix to fill; m.getData() is used as the destination of a
 *            cudaMemcpyHostToDevice copy of m.size() elements.
 */
template <typename T>
void randomInit(device_matrix<T>& m) {
  // Read the element count once instead of on every loop iteration.
  const size_t n = m.size();

  // Nothing to generate or copy for an empty matrix; also keeps &h_data[0]
  // below well-defined.
  if (n == 0)
    return;

  // std::vector owns the staging buffer, so it is released on every exit path.
  std::vector<T> h_data(n);
  for (size_t i = 0; i < n; ++i)
    h_data[i] = rand() / (T) RAND_MAX;

  // Wrapped in CCE like the other CUDA runtime calls in this file so a failed
  // copy is not silently ignored.
  CCE(cudaMemcpy(m.getData(), &h_data[0], n * sizeof(T), cudaMemcpyHostToDevice));
}
// Seeds the host RNG and runs matrixMul() for a series of matrix sizes.
void benchmark();
// Times the product of an (m x n) matrix and an (n x l) matrix and prints a
// GFlops summary.
void matrixMul(int m, int n, int l);
// NOTE(review): declared here, but no definition or call is visible in this
// file -- confirm whether it is still needed.
void showGFlops(double flops, float time);
// Program entry point: runs the whole benchmark suite once and exits with
// status 0. Command-line arguments are accepted but not used.
int main (int argc, char* argv[]) {
  (void) argc;
  (void) argv;

  benchmark();
  return 0;
}
// Runs the matrix-multiplication benchmark on seven problem sizes.
//
// The host RNG is seeded with a fixed value so the generated matrices are the
// same on every run. The base problem is (32 x 48) * (48 x 16); each following
// problem doubles all three dimensions, ending at (2048 x 3072) * (3072 x 1024).
void benchmark() {
  srand(2013);

  for (int scale = 1; scale <= 64; scale *= 2)
    matrixMul(32 * scale, 48 * scale, 16 * scale);
}
void matrixMul(int m, int n, int l) {
mat A(m, n), B(n, l), C;
randomInit(A);
randomInit(B);
int nIter = 128;
Timer timer1, timer2;
// ===== Method 1 : sgemm(A,B,C) =====
timer1.tic();
for (int i=0; i<nIter; ++i)
gemm(A, B, C);
float avgTime1 = timer1.toc() / nIter;
// ===== Method 2 : C = A*B =====
timer2.tic();
for (int i=0; i<nIter; ++i)
mat C(A * B);
float avgTime2 = timer2.toc() / nIter;
// ===== Calculate GFlops =====
double flops = 2.0 * (double) A.getRows() * (double) A.getCols() * (double) B.getCols();
double gigaFlops1 = (flops * 1.0e-9f) / (avgTime1 / 1000.0f);
double gigaFlops2 = (flops * 1.0e-9f) / (avgTime2 / 1000.0f);
// ===== Benchmark Summary =====
printf(" Matrix Multiplication \n"
"+----------------------------------------+\n"
"| matrix rows cols |\n"
"+----------------------------------------+\n"
"| A %4lu %4lu |\n"
"| |\n"
"| B %4lu %4lu |\n"
"| |\n"
"| AxB %4lu %4lu |\n"
"+----------------------------------------+\n"
"| Performance sgemm(A,B,C) C = A*B |\n"
"| GFlops %5.1f %5.1f |\n"
"+----------------------------------------+\n\n",
A.getRows(), A.getCols(),
B.getRows(), B.getCols(),
A.getRows(), B.getCols(),
gigaFlops1, gigaFlops2);
}
// Creates the two CUDA events used to bracket a measured region. Each call is
// passed through the CCE macro (defined outside this file; presumably an
// error check on the returned cudaError_t -- confirm).
Timer::Timer() {
CCE(cudaEventCreate(&start));
CCE(cudaEventCreate(&stop));
}
void Timer::tic() {
CCE(cudaEventRecord(start, NULL));
}
// Marks the end of the measured region and returns its duration.
//
// Records the stop event on the null (default) stream, blocks the host until
// that event has completed, and then returns the time between the start and
// stop events as reported by cudaEventElapsedTime.
float Timer::toc() {
  float elapsedMs = 0.0f;

  CCE(cudaEventRecord(stop, NULL));
  // The elapsed time can only be queried once the stop event has completed.
  CCE(cudaEventSynchronize(stop));
  CCE(cudaEventElapsedTime(&elapsedMs, start, stop));

  return elapsedMs;
}