one_d_stencil.py
'''
Numba CUDA examples: a 1-D stencil (a windowed sum over a radius-1
neighbourhood) and a simple histogram.  Both kernels stage their input in
block shared memory with an apron (halo) of `radius` elements at each end.
'''
from __future__ import print_function
from timeit import default_timer as timer
import numpy as np
from numba import int32, float32, cuda, void
# ------------ Alternative configuration for a 256*256 window
# bpg = 64
# # Each block would be 32*32
# tpb = 32
# n = bpg * tpb * tpb
radius = 1          # stencil radius: one neighbour on each side
# ------------ Configuration used below: 2 blocks of 30 threads
bpg = 2             # blocks per grid
tpb = 30            # threads per block
# Total length: bpg * tpb work items plus radius elements of padding at both
# ends of each block (radius * 4 for the two blocks), i.e.
# n == bpg * (tpb + 2 * radius).
n = (bpg * tpb) + radius * 4
BIN_COUNT = 10      # number of histogram bins
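# Small sanity check on the geometry (not in the original script): each
# block's shared tile holds tpb + 2 * radius elements, which is the literal
# 32 used for cuda.shared.array below, and n covers every element the
# kernels read.
assert tpb + 2 * radius == 32
assert n == bpg * (tpb + 2 * radius)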
@cuda.jit(void(float32[:]))
def gpu_1d_stencil(A):
    # Shared-memory tile for one block.  The size must be a compile-time
    # constant: tpb + 2 * radius == 32.
    sA = cuda.shared.array(32, dtype=float32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    x = tx + bx * bw        # global index handled by this thread
    cx = tx + radius        # position of the same element in the shared tile
    # Stage this block's elements in shared memory.
    sA[cx] = A[x]
    if tx < radius:
        # Load the apron (halo) values that lie outside this block's range,
        # guarding both edges of the global array.
        sA[cx - radius] = A[x - radius] if x >= radius else 0.0
        sA[cx + tpb] = A[x + tpb] if x + tpb < A.shape[0] else 0.0
    cuda.syncthreads()
    acc = 0.0
    # Sum over the stencil window [-radius, +radius].
    for i in range(-radius, radius + 1):
        acc += sA[cx + i]
    if 0 < x < A.shape[0]:
        # A[x] already holds sA[cx], so add only the difference to turn
        # A[x] into the windowed sum.
        acc = acc - sA[cx]
        cuda.atomic.add(A, x, acc)
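# A minimal host-side reference for the same windowed sum, useful for
# sanity-checking the kernel's output.  `cpu_1d_stencil` is not part of the
# original script; it mirrors gpu_1d_stencil under the assumption that only
# indices 1 .. bpg*tpb - 1 are updated (one launched thread per element,
# index 0 skipped) and that neighbours outside the array contribute 0.
def cpu_1d_stencil(a):
    out = a.copy()
    for x in range(1, bpg * tpb):
        acc = 0.0
        for i in range(-radius, radius + 1):
            if 0 <= x + i < len(a):
                acc += a[x + i]
        out[x] = acc
    return out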
@cuda.jit(void(float32[:], int32[:]))
def gpu_histogram(A, Hist):
    # Each block stages tpb + 2 * radius == 32 elements, so the whole grid
    # covers bpg * (tpb + 2 * radius) elements.
    totalsize = bpg * (tpb + radius * 2)
    sA = cuda.shared.array(32, dtype=float32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    x = tx + bx * bw        # global index handled by this thread
    cx = tx + radius        # position of the same element in the shared tile
    # Stage this block's elements in shared memory.
    sA[cx] = A[x]
    if tx < radius:
        # Load the apron (halo) values that lie outside this block's range,
        # guarding both edges of the global array.
        sA[cx - radius] = A[x - radius] if x >= radius else 0.0
        sA[cx + tpb] = A[x + tpb] if x + tpb < A.shape[0] else 0.0
    cuda.syncthreads()
    if 0 < x < totalsize:
        # The bin index must be an integer before it can be used with
        # cuda.atomic.add.
        binIndex = int32(sA[cx]) % BIN_COUNT
        cuda.atomic.add(Hist, binIndex, 1)
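# A host-side cross-check for the histogram kernel, again not part of the
# original script.  It bins the same elements the kernel bins (indices
# 1 .. bpg*tpb - 1) using plain NumPy.
def cpu_histogram(a):
    vals = a[1:bpg * tpb].astype(np.int64) % BIN_COUNT
    return np.bincount(vals, minlength=BIN_COUNT).astype(np.int32)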
def cudatest_stencil():
    src1 = np.arange(n, dtype=np.float32)
    print(src1)
    stream = cuda.stream()  # use a stream so the transfers and launch are queued together
    ts = timer()
    # Control the number of timed iterations.
    count = 1
    for i in range(count):
        with stream.auto_synchronize():
            d_src1 = cuda.to_device(src1, stream=stream)
            gpu_1d_stencil[bpg, tpb, stream](d_src1)
            d_src1.copy_to_host(src1, stream=stream)
    te = timer()
    print('stencil, ', count, ' iteration(s): ', te - ts)
    print(src1[0:n])
    print(len(src1[0:n]))
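# Hypothetical usage of the reference above (not in the original script):
# run the kernel on a fresh copy of the input and compare against
# cpu_1d_stencil before doing any timing.
def check_stencil():
    a = np.arange(n, dtype=np.float32)
    expected = cpu_1d_stencil(a)
    d_a = cuda.to_device(a)
    gpu_1d_stencil[bpg, tpb](d_a)
    d_a.copy_to_host(a)
    assert np.allclose(a, expected)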
def cudatest_hist():
    src1 = np.random.randint(BIN_COUNT, size=n).astype(np.float32)
    histogram = np.zeros(BIN_COUNT, dtype=np.int32)
    print(src1)
    stream = cuda.stream()  # use a stream so the transfers and launch are queued together
    ts = timer()
    # Control the number of timed iterations.
    count = 1
    for i in range(count):
        with stream.auto_synchronize():
            d_src1 = cuda.to_device(src1, stream=stream)
            d_hist = cuda.to_device(histogram, stream=stream)
            gpu_histogram[bpg, tpb, stream](d_src1, d_hist)
            d_src1.copy_to_host(src1, stream=stream)
            d_hist.copy_to_host(histogram, stream=stream)
    te = timer()
    print('histogram, ', count, ' iteration(s): ', te - ts)
    print(histogram)
    # Histogram of the original host data for comparison.  Its counts can
    # differ slightly from the GPU result because the host array includes the
    # padding elements beyond bpg * tpb and the kernel skips index 0.
    values, counts = np.unique(src1.astype(np.int64), return_counts=True)
    print(np.column_stack((values, counts)))
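# Analogous hypothetical check for the histogram path (not in the original
# script): the kernel result should match the host-side reference exactly.
def check_hist():
    a = np.random.randint(BIN_COUNT, size=n).astype(np.float32)
    expected = cpu_histogram(a)
    hist = np.zeros(BIN_COUNT, dtype=np.int32)
    d_a = cuda.to_device(a)
    d_hist = cuda.to_device(hist)
    gpu_histogram[bpg, tpb](d_a, d_hist)
    d_hist.copy_to_host(hist)
    assert np.array_equal(hist, expected)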
# cudatest_stencil()
cudatest_hist()
print('done')