-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathmatrix_mult.cl
230 lines (209 loc) · 8.99 KB
/
matrix_mult.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// Copyright (C) 2013-2016 Altera Corporation, San Jose, California, USA. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// This agreement shall be governed in all respects by the laws of the State of California and
// by the laws of the United States of America.
// This kernel computes C = A * B, where
// A is a N x K matrix
// B is a K x M matrix
// C is a N x M matrix
// All dimensions must be a multiple of BLOCK_SIZE (defined below).
//
// The ND-range is two-dimensional and corresponds to the dimensions of matrix
// C. Each work-item computes one element of the output matrix.
//
// The implemented algorithm uses blocking to take advantage of data reuse
// across multiple elements in matrix C. This is just like the standard loop
// tiling optimization often used in matrix multiplication implementations.
//
// This kernel is intended to be compiled with the following compiler flags:
// --no-interleaving default
// This flag indicates that the global memory is divided into two logical
// banks and allows the host program to assign buffers to specific memory banks.
// This allows the host to manage the load on each memory bank, usually
// to maximize the memory bandwidth usage.
//
// This flag is used for matrix multiplication because there are
// two primary memory accesses: reads from matrix A and reads from
// matrix B. To maximize memory bandwidth, the two input matrices
// are placed in different memory banks, which ensures that there is no
// contention when trying to read elements from both matrices
// simultaneously.
//
// -fp-relaxed=true
// This flag enables the order of additions in the dot product
// computation within a block to be rearranged. This enables the additions
// to be computed more efficiently in hardware, using a tree structure
// instead of a vine.
//
// As a simple example, take the addition of four values: a0 + a1 + a2 + a3.
// The default implementation (without -fp-relaxed=true) is:
// (((a0 + a1) + a2) + a3)
// which matches the standard ordering of operations. In hardware, this
// looks like:
// a0 a1
// |-+-|
// | a2
// |-+-|
// | a3
// |-+-|
// |
//
// With -fp-relaxed=true, the implementation is a balanced tree:
// ((a0 + a1) + (a2 + a3))
// In hardware, this looks like:
// a0 a1 a2 a3
// |-+-| |-+-|
// | |
// |----+----|
// |
//
// There are two values that need to be defined in the preprocessor.
// BLOCK_SIZE
// The dimension of the block used in the core computation
// is BLOCK_SIZE x BLOCK_SIZE. This is defined in the host
// include file because the host needs to know too (just to
// ensure that the matrix sizes are a multiple of the block
// size).
// SIMD_WORK_ITEMS
// This value tells the compiler how many work-items in the work-group
// are executed in a SIMD fashion. In the context of matrix multiplication, this
// value indicates how many output elements will be computed
// in a SIMD manner. BLOCK_SIZE must be a multiple of SIMD_WORK_ITEMS.
// See the Optimization Guide for details about this attribute.
//
// The combination of these values determines the number of floating-point
// operations per cycle.
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif
#if 0
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
// Widths of matrices.
int A_width, int B_width)
{
    // One BLOCK_SIZE x BLOCK_SIZE tile of each input matrix, staged in
    // on-chip local memory so each loaded element is reused BLOCK_SIZE times.
    __local float tileA[BLOCK_SIZE][BLOCK_SIZE];
    __local float tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Coordinates of this work-group's tile within the output matrix C.
    const int group_col = get_group_id(0);
    const int group_row = get_group_id(1);

    // This work-item's offset inside the tile.
    const int lx = get_local_id(0);
    const int ly = get_local_id(1);

    // First and last linear offsets of the row band of A walked by this
    // group, and the starting offset of the matching column band of B.
    const int row_base  = A_width * BLOCK_SIZE * group_row;
    const int row_limit = row_base + A_width - 1;

    float acc = 0.0f;

    // Walk matching tiles: A left-to-right along the row band, B
    // top-to-bottom down the column band. One iteration per tile pair.
    int b_off = BLOCK_SIZE * group_col;
    for (int a_off = row_base; a_off <= row_limit; a_off += BLOCK_SIZE)
    {
        // Stage one tile of each matrix. Note that tileB is stored
        // transposed (the [lx][ly] indices are swapped relative to
        // tileA): as a result, both reads in the unrolled dot product
        // below stream through consecutive local-memory elements,
        // which yields more efficient hardware. (The compiler can
        // perform this transform itself; it is shown for illustration.)
        tileA[ly][lx] = A[a_off + A_width * ly + lx];
        tileB[lx][ly] = B[b_off + B_width * ly + lx];

        // Make the fully loaded tile visible to every work-item.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Fully unrolled dot-product accumulation over this tile:
        // tileA[ly][0..BLOCK_SIZE-1] . tileB[lx][0..BLOCK_SIZE-1]
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            acc += tileA[ly][k] * tileB[lx][k];
        }

        // Ensure every work-item has consumed the tile before the next
        // iteration overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);

        b_off += BLOCK_SIZE * B_width;
    }

    // Each work-item owns exactly one element of C.
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = acc;
}
#endif
#if 1
/* Double-buffered (ping-pong) tiled GEMM: C = A * B.
 *
 * Unlike the single-buffered variant above, each local tile array has two
 * copies: while the compute loop consumes tile t from buffer t%2, the next
 * tile t+1 is prefetched into the other buffer. This overlaps global-memory
 * loads with computation and needs only one barrier per iteration.
 *
 * A is N x K, B is K x M, C is N x M; all dimensions must be a multiple of
 * BLOCK_SIZE (enforced host-side). Each work-item produces one element of C.
 */
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
// Widths of matrices.
int A_width, int B_width)
{
    // Two copies of each tile for ping-pong buffering: index [t%2] is
    // consumed while [(t+1)%2] is being filled.
    __local float A_local[2][BLOCK_SIZE][BLOCK_SIZE];
    __local float B_local[2][BLOCK_SIZE][BLOCK_SIZE];

    // Block index of this work-group within C.
    int block_x = get_group_id(0);
    int block_y = get_group_id(1);

    // Local ID index (offset within a block).
    int local_x = get_local_id(0);
    int local_y = get_local_id(1);

    // Linear offsets of the first tile of A (this group's row band) and
    // of B (this group's column band).
    int a_start = A_width * BLOCK_SIZE * block_y;
    int b_start = BLOCK_SIZE * block_x;

    float running_sum = 0.0f;

    // Prime the pipeline: stage tile 0 into buffer 0. B_local is stored
    // transposed (swapped x/y indices) so the unrolled dot product below
    // reads consecutive local-memory elements from both tiles.
    A_local[0][local_y][local_x] = A[a_start + A_width * local_y + local_x];
    B_local[0][local_x][local_y] = B[b_start + B_width * local_y + local_x];

    // Number of BLOCK_SIZE-wide tiles along the shared dimension;
    // A_width is a multiple of BLOCK_SIZE, so this division is exact.
    int num_blocks = A_width / BLOCK_SIZE;

    for (int t = 0; t < num_blocks; t++)
    {
        // This single barrier serves both double-buffering roles: it
        // publishes the tile staged in the previous iteration (read by
        // the compute loop below), and it guarantees no work-item is
        // still reading the buffer about to be overwritten.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Prefetch tile t+1 into the buffer not used this iteration.
        int tt = t + 1;
        a_start += BLOCK_SIZE;
        b_start += (BLOCK_SIZE * B_width);
        if (tt < num_blocks) {
            A_local[tt % 2][local_y][local_x] = A[a_start + A_width * local_y + local_x];
            B_local[tt % 2][local_x][local_y] = B[b_start + B_width * local_y + local_x];
        }

        // Consume tile t from the other buffer; fully unrolled.
        #pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            running_sum += A_local[t % 2][local_y][k] * B_local[t % 2][local_x][k];
        }
    }

    // Each work-item writes its single element of C.
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
#endif