forked from zjin-lcf/HeCBench
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.cpp
114 lines (95 loc) · 3 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <assert.h>
#include <omp.h>
#define BLOCK_SIZE 256
// Measures the cost of additions performed without atomics.
//
// Outer iteration `slot` (0 <= slot < n) owns result[slot] exclusively and
// adds (elem % 2) to it once for every elem in [slot*size, (slot+1)*size).
//
//   result  accumulation buffer with at least n elements; it is only ever
//           added to here, so the caller decides the starting values
//   size    number of consecutive values folded into each result element
//   n       number of result elements (outer loop trip count)
template <typename T>
void woAtomicOnGlobalMem(T* result, int size, int n)
{
  #pragma omp target teams distribute parallel for thread_limit(BLOCK_SIZE)
  for (unsigned int slot = 0; slot < n; slot++) {
    const unsigned int first = slot * size;
    const unsigned int past_last = first + size;
    // The running sum deliberately lives in result[slot] rather than a local,
    // so every addition is a read-modify-write on the result buffer.
    for (unsigned int elem = first; elem < past_last; elem++) {
      result[slot] += elem % 2;
    }
  }
}
// Measures the cost of additions performed with atomics.
//
// Every iteration `item` (0 <= item < n) adds (item % 2) to
// result[item / size], so `size` consecutive iterations update the same
// element; each update is therefore done as an OpenMP atomic update.
//
//   result  accumulation buffer with at least ceil(n / size) elements; it is
//           only ever added to here, so the caller decides the starting values
//   size    number of consecutive iterations that share one result element
//   n       total number of iterations (individual additions)
template <typename T>
void wiAtomicOnGlobalMem(T* result, int size, int n)
{
  #pragma omp target teams distribute parallel for thread_limit(BLOCK_SIZE)
  for (unsigned int item = 0; item < n; item++) {
    const unsigned int slot = item / size;
    const unsigned int parity = item % 2;
    #pragma omp atomic update
    result[slot] += parity;
  }
}
// Zero-fills the first n elements of `result` inside an OpenMP target region.
//
// Note the signature: two arguments (pointer, element count). A call that
// passes three arguments does not reach this template; it resolves to the
// C library memset(void*, int, size_t) from <string.h> instead.
//
//   result  buffer to clear; must hold at least n elements
//   n       number of elements (not bytes) to set to zero
template <typename T>
void memset(T* result, int n)
{
  #pragma omp target teams distribute parallel for
  for (unsigned int idx = 0; idx < n; ++idx) {
    result[idx] = static_cast<T>(0);
  }
}
// Times the atomic and the non-atomic summation kernels for element type T
// and checks that both produce identical results.
//
// For each size in 1..16 the t additions are folded into t/size result
// elements: once with wiAtomicOnGlobalMem (one addition per iteration, atomic)
// and once with woAtomicOnGlobalMem (size additions per iteration, no
// atomics). The average time per repetition is printed for both, followed by
// PASS or FAIL from a bytewise comparison of the two result buffers.
//
//   t       total number of additions; must be divisible by every size in
//           1..16 and t/size must be a multiple of BLOCK_SIZE (asserted)
//   repeat  number of timed repetitions of each kernel; expected to be > 0
template <typename T>
void atomicCost (int t, int repeat)
{
  for (int size = 1; size <= 16; size++) {

    printf("\n\n");
    printf("Each thread sums up %d elements\n", size);

    assert(t % size == 0);
    assert(t / size % BLOCK_SIZE == 0);

    const int num_results = t / size;
    const size_t result_size = sizeof(T) * num_results;

    T* result_wi = (T*) malloc (result_size);
    T* result_wo = (T*) malloc (result_size);
    // The buffers can reach several gigabytes, so allocation failure is a
    // realistic outcome; stop instead of mapping or comparing null pointers.
    if (result_wi == nullptr || result_wo == nullptr) {
      fprintf(stderr, "Failed to allocate %zu bytes per result buffer\n", result_size);
      free(result_wi);
      free(result_wo);
      return;
    }

    // map(alloc:) creates the device buffers without initializing them, so
    // they have to be cleared on the device before every accumulation pass.
    #pragma omp target data map(alloc: result_wi[0:num_results], result_wo[0:num_results])
    {
      auto start = std::chrono::steady_clock::now();
      for (int i = 0; i < repeat; i++)
      {
        // Two-argument call: selects the device zero-fill template defined in
        // this file. The three-argument C library memset would only touch the
        // host copy and leave the device sums accumulating across repetitions.
        memset(result_wi, num_results);
        wiAtomicOnGlobalMem<T>(result_wi, size, t);
      }
      auto end = std::chrono::steady_clock::now();
      auto elapsed_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
      printf("Average execution time of WithAtomicOnGlobalMem: %f (us)\n",
             elapsed_ns * 1e-3 / repeat);
      #pragma omp target update from (result_wi[0:num_results])

      start = std::chrono::steady_clock::now();
      for (int i = 0; i < repeat; i++)
      {
        memset(result_wo, num_results);
        woAtomicOnGlobalMem<T>(result_wo, size, num_results);
      }
      end = std::chrono::steady_clock::now();
      elapsed_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
      printf("Average execution time of WithoutAtomicOnGlobalMem: %f (us)\n",
             elapsed_ns * 1e-3 / repeat);
      #pragma omp target update from (result_wo[0:num_results])

      // Both kernels add the same small integer values, so the sums are exact
      // for every element type and a bytewise comparison is sufficient.
      int diff = memcmp(result_wi, result_wo, result_size);
      printf("%s\n", diff ? "FAIL" : "PASS");
    }

    free(result_wi);
    free(result_wo);
  }
}
// Entry point: runs the atomic-cost benchmark for double, int and float.
//
// Usage: <program> <repeat>
//   repeat  positive number of timed repetitions per kernel
//
// Returns 0 on success and 1 on a usage error.
int main(int argc, char* argv[])
{
  if (argc != 2) {
    printf("Usage: %s <repeat>\n", argv[0]);
    return 1;
  }
  const int repeat = atoi(argv[1]);
  // atoi yields 0 for non-numeric input. With repeat <= 0 no kernel would run,
  // the reported averages would divide by zero, and the PASS/FAIL check would
  // compare buffers that were never written.
  if (repeat <= 0) {
    printf("Error: <repeat> must be a positive integer\n");
    printf("Usage: %s <repeat>\n", argv[0]);
    return 1;
  }

  // Total number of additions per run; atomicCost asserts its divisibility
  // requirements for each size it tests.
  const int t = 922521600;
  assert(t % BLOCK_SIZE == 0);

  printf("\nFP64 atomic add\n");
  atomicCost<double>(t, repeat);

  printf("\nINT32 atomic add\n");
  atomicCost<int>(t, repeat);

  printf("\nFP32 atomic add\n");
  atomicCost<float>(t, repeat);

  return 0;
}