-
Notifications
You must be signed in to change notification settings - Fork 0
/
finiteDiffTest.cu
197 lines (161 loc) · 5.59 KB
/
finiteDiffTest.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#include <iostream>
#include <cuda_runtime.h>
// called only from device
__device__ void costFunction(float* F, float* T, int i, int numElements){
if (i < numElements)
{
F[i] = T[i] * T[i];
}
}
// Trying finite difference, called from host
__global__ void finiteDiff(float* Fp, float* Tp, float* Fn, float* Tn, float* dF, float e, int numElements){
int i = blockDim.x * blockIdx.x + threadIdx.x;
costFunction(Fn,Tn,i,numElements);
costFunction(Fp,Tp,i,numElements);
if (i < numElements)
{
dF[i] = (Fp[i] - Fn[i]) / (2*e); // think the 0.0f exists to cast to float
}
}
int main (void) {
// Declare Vairables host variables start with h, device vars with d
int numElements = 50000; // vector size --- determine whats the max we can do on GPU
size_t size = numElements*sizeof(float); // total vector size
// we want C <-- A*B so init three vectors on host
float *h_Fp = (float*) malloc(size);
float *h_Fn = (float*) malloc(size);
float *h_Tp = (float*) malloc(size);
float *h_Tn = (float*) malloc(size);
float *h_dF = (float*) malloc(size);
if (h_Fp == NULL || h_Fn == NULL || h_Tp == NULL || h_Tn == NULL)
{
std::cout<< "Failed to allocate on host device" << std::endl;
exit(EXIT_FAILURE);
}
// initialize Tp and Tn
float e = 0.0001;
for (int i = 0; i < numElements; i++)
{
// 0.001 -> 500
h_Tn[i] = i/100 - e;
h_Tp[i] = i/100 + e;
}
// DEVICE -- GPU
cudaError_t err = cudaSuccess; // errror code
// init vectors on device
float *d_Fp = NULL; // NULL pointer
err = cudaMalloc((void **)&d_Fp, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate device vector d_Fp, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
float *d_Fn = NULL; // NULL pointer
err = cudaMalloc((void **)&d_Fn, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate device vector d_Fn, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
float *d_Tp = NULL; // NULL pointer
err = cudaMalloc((void **)&d_Tp, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate device vector d_Tp, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
float *d_Tn = NULL; // NULL pointer
err = cudaMalloc((void **)&d_Tn, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate device vector d_Tn, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
float *d_dF = NULL; // NULL pointer
err = cudaMalloc((void **)&d_dF, size);
if (err != cudaSuccess)
{
std::cout << "Failed to allocate device vector d_dF, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
// COPY DATA FROM HOST TO DEVICE using cudaMemcpy(destination_add, source_add, size, direction)
std::cout << "Begin to copy memory from host to device." << std::endl;
err = cudaMemcpy(d_Tp, h_Tp, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
std::cout << "Failed to copy host vector Tp to device, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_Tn, h_Tn, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
std::cout << "Failed to copy host vector Tp to device, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
// FINITE DIFF ON KERNEL
int threadsPerBlock = 256; // threads per block is limited to 1024
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // dont understand this tbh
std::cout<<"CUDA kernel lauched with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
finiteDiff<<<blocksPerGrid,threadsPerBlock>>>(d_Fp,d_Tp,d_Fn,d_Tn, d_dF, e, numElements);
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cout << "Failed to lauch vecAdd kernel, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
// COPY DATA BACK TO HOST FROM DEVICE
std::cout << "Begin to copy memory from device to host." << std::endl;
err = cudaMemcpy(h_dF, d_dF, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
std::cout << "Failed to copy device vector dF to host, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
// for (int i = 0; i < 200; i++)
// {
// if (fabs(2*(h_Tn[i] + e) - h_dF[i]) > 1e-2)
// {
// std::cout<<"Result verification failed at element " << i << " tol:: " << fabs(2*(h_Tn[i] + e) - h_dF[i]) << std::endl;
// exit(EXIT_FAILURE);
// }
// }
// free vectors on device
err = cudaFree(d_Fp);
if (err != cudaSuccess)
{
std::cout << "Failed to free device vector d_Fp, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
err = cudaFree(d_Fn);
if (err != cudaSuccess)
{
std::cout << "Failed to free device vector d_Fn, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
err = cudaFree(d_Tp);
if (err != cudaSuccess)
{
std::cout << "Failed to free device vector d_Tp, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
err = cudaFree(d_Tn);
if (err != cudaSuccess)
{
std::cout << "Failed to free device vector d_Tn, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
err = cudaFree(d_dF);
if (err != cudaSuccess)
{
std::cout << "Failed to free device vector d_dF, Error code: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
// free memory
free(h_Fp);
free(h_Fn);
free(h_Tp);
free(h_Tn);
free(h_dF);
std::cout << "Done!" << std::endl;
return 0;
}