-
Notifications
You must be signed in to change notification settings - Fork 345
/
binary_kernels.cu
184 lines (143 loc) · 6.04 KB
/
binary_kernels.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#include <stdio.h>
#define BLOCK_SIZE 16
// CUDA tutorial: http://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory
// A is shape (m,n), B is shape (n,k) and C is shape (m,k)
// Tiled single-precision GEMM: C = A * B.
// A is shape (m,n), B is shape (n,k) and C is shape (m,k); all row-major.
// Launch with block = dim3(BLOCK_SIZE, BLOCK_SIZE) and a grid covering C,
// i.e. grid = dim3(ceil(k/BLOCK_SIZE), ceil(m/BLOCK_SIZE)).
// Fix vs. original: the tile count now rounds up and out-of-range tile
// elements are zero-padded, so shared-memory loads no longer read out of
// bounds and the ragged tail of the inner dimension is no longer dropped
// when m, n or k is not a multiple of BLOCK_SIZE.
__global__ void gemm(float* A, float* B, float* C, int m, int n, int k) {
    // Position within the tile, and the global C element this thread owns.
    int row = threadIdx.y;
    int col = threadIdx.x;
    int cRow = blockIdx.y * BLOCK_SIZE + row;
    int cCol = blockIdx.x * BLOCK_SIZE + col;
    // Shared-memory tiles of A and B for the current step of the inner loop.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    // Accumulator for this thread's C element (0.0f keeps it in float).
    float Cvalue = 0.0f;
    // Walk the inner dimension in BLOCK_SIZE chunks, rounding up so a
    // partial last tile is still processed.
    int numTiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        int aCol = t * BLOCK_SIZE + col;  // column of A loaded by this thread
        int bRow = t * BLOCK_SIZE + row;  // row of B loaded by this thread
        // Zero-pad out-of-range elements so the multiply below needs no guards.
        As[row][col] = (cRow < m && aCol < n) ? A[cRow * n + aCol] : 0.0f;
        Bs[row][col] = (bRow < n && cCol < k) ? B[bRow * k + cCol] : 0.0f;
        // Every thread must finish loading before anyone reads the tiles.
        __syncthreads();
        for (int j = 0; j < BLOCK_SIZE; ++j)
            Cvalue += As[row][j] * Bs[j][col];
        // Keep the tiles alive until all threads are done multiplying.
        __syncthreads();
    }
    // Guarded store; threads outside C's bounds accumulated on padding only.
    if (cRow < m && cCol < k)
        C[cRow * k + cCol] = Cvalue;
}
// 32 single float array -> 32 bits unsigned int
// Pack the signs of 32 consecutive floats into one 32-bit word:
// bit i of the result is 1 iff array[i] >= 0.
__device__ unsigned int concatenate(float* array)
{
    unsigned int packed = 0;
    for (int bit = 0; bit < 32; ++bit) {
        if (array[bit] >= 0)
            packed |= (1u << bit);
    }
    return packed;
}
// Row-wise packing: one thread per output word. Each group of 32 consecutive
// floats in `a` becomes one bit-word in `b`; `size` is the number of words.
__global__ void concatenate_rows_kernel(float *a, unsigned int *b, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= size)
        return;  // grid tail guard
    b[idx] = concatenate(a + idx * 32);
}
// Column-wise packing: for a row-major (m,n) matrix `a`, pack each run of 32
// elements down a column into one word of `b` (shape (m/32, n), row-major).
// One thread per column j; assumes m is a multiple of 32 (as the original
// code did — TODO confirm callers guarantee this).
// Fix vs. original: uses a fixed-size per-thread buffer instead of device-heap
// `new float[32]` / `delete[]` — the original never checked for a failed
// allocation, and per-thread heap traffic is slow.
__global__ void concatenate_cols_kernel(float *a, unsigned int *b, int m, int n)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < n) {
        float column[32];  // per-thread scratch in local memory/registers
        for (int i = 0; i < m; i += 32) {
            // Gather 32 column elements (stride n between rows).
            for (int k = 0; k < 32; k++)
                column[k] = a[j + n * (i + k)];
            // i is a multiple of 32, so n*i/32 == n*(i/32): row i/32 of b.
            b[j + n * (i / 32)] = concatenate(column);
        }
    }
}
// 32 bits unsigned int -> 32 single float array
// TODO: the array allocation should not be done here
// 32-bit word -> newly allocated array of 32 floats, each 0.0 or 1.0,
// where element i is bit i of x. The caller owns the returned buffer and
// must delete[] it.
// TODO: the array allocation should not be done here (kept so the existing
// caller interface is unchanged).
__device__ float* deconcatenate(unsigned int x)
{
    float* bits = new float[32];
    for (int i = 0; i < 32; ++i)
        bits[i] = (float)((x >> i) & 1u);
    return bits;
}
// Unpack bit-words from `a` into `size` floats in `b` (each 0.0 or 1.0).
// Fix vs. original: the original ignored thread/block indices, so EVERY
// launched thread redundantly unpacked the entire array (same-value racy
// writes) through per-iteration device-heap allocations in deconcatenate(),
// and its unguarded inner write ran past the end of `b` when `size` was not
// a multiple of 32. This version uses a grid-stride loop over words — the
// output is identical for any launch configuration (a single thread still
// covers everything), the work is partitioned when more threads exist, bits
// are extracted inline with no heap traffic, and the tail is bounds-checked.
__global__ void deconcatenate_rows_kernel(unsigned int *a, float *b, int size)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    // One iteration per 32-bit word; w indexes into `a`.
    for (int w = tid; w * 32 < size; w += stride) {
        unsigned int word = a[w];
        for (int k = 0; k < 32; k++) {
            int out = w * 32 + k;
            if (out < size)  // guard the ragged final word
                b[out] = (float)((word >> k) & 1u);
        }
    }
}
// A is shape (m,n), B is shape (n,k) and C is shape (m,k)
// XNOR GEMM on bit-packed sign matrices.
// A is shape (m,n) and B is shape (n,k) of 32-bit words (each word packs the
// signs of 32 values, see concatenate()); C is shape (m,k) floats, row-major.
// For +1/-1 data a word pair contributes __popc(a ^ b) mismatching bits, so
// the dot product over the 32*n sign bits is
//   matches - mismatches = (32*n - Cvalue) - Cvalue = -(2*Cvalue - 32*n).
// Launch with block = dim3(BLOCK_SIZE, BLOCK_SIZE) and a grid covering C.
// Fix vs. original: the tile count rounds up and out-of-range tile elements
// are zero-padded, so shared-memory loads stay in bounds and the ragged tail
// of the inner dimension is included when m, n or k is not a multiple of
// BLOCK_SIZE. Padding is neutral here because __popc(0 ^ 0) == 0, which
// keeps the -(2*Cvalue - 32*n) formula valid (n counts real words only).
__global__ void xnor_gemm(unsigned int* A, unsigned int* B, float* C, int m, int n, int k) {
    // Position within the tile, and the global C element this thread owns.
    int row = threadIdx.y;
    int col = threadIdx.x;
    int cRow = blockIdx.y * BLOCK_SIZE + row;
    int cCol = blockIdx.x * BLOCK_SIZE + col;
    // Shared-memory tiles of A and B for the current step.
    __shared__ unsigned int As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ unsigned int Bs[BLOCK_SIZE][BLOCK_SIZE];
    // Running count of mismatching sign bits for this C element.
    unsigned int Cvalue = 0;
    // Walk the inner dimension in BLOCK_SIZE chunks, rounding up.
    int numTiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        int aCol = t * BLOCK_SIZE + col;
        int bRow = t * BLOCK_SIZE + row;
        // Zero-pad out-of-range words; 0^0 contributes no popcount.
        As[row][col] = (cRow < m && aCol < n) ? A[cRow * n + aCol] : 0u;
        Bs[row][col] = (bRow < n && cCol < k) ? B[bRow * k + cCol] : 0u;
        __syncthreads();
        // THIS IS THE MOST INTERESTING PART: one XOR + popcount replaces
        // 32 multiply-adds per word pair.
        for (int j = 0; j < BLOCK_SIZE; ++j)
            Cvalue += __popc(As[row][j] ^ Bs[j][col]);
        __syncthreads();
    }
    // Guarded store of the recovered +/-1 dot product.
    if (cRow < m && cCol < k)
        C[cRow * k + cCol] = -(2 * (float)Cvalue - 32 * n);
}