tuned_variant01_op_02.c
/** First variant, only focused on passing tests. */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <immintrin.h>
#define MAT_SIDE_LENGTH 8
void printMatrix(const char *filename, const float *matrix, int rows, int cols)
{
    FILE *file = fopen(filename, "w");
    if (file == NULL)
    {
        perror("Failed to open file");
        return;
    }
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            fprintf(file, "%8.2f ", matrix[i * cols + j]);
        }
        fprintf(file, "\n");
    }
    fclose(file);
}
/** An array of 8 __m256 rows (64 float elements). */
struct Matrix_64
{
    __m256 rows[MAT_SIDE_LENGTH];
};
typedef struct Matrix_64 Matrix_64;
/** Loads 64 elements from the source array into a Matrix_64 struct. The stride
 * specifies how far apart the starts of consecutive rows are in memory, in
 * floats; the 8 elements within a row are always loaded contiguously. A stride
 * of MAT_SIDE_LENGTH means the whole 8x8 block is contiguous in memory. */
void loadMat64(Matrix_64 *mat, const float *source, size_t stride)
{
    for (int i = 0; i < MAT_SIDE_LENGTH; i++)
    {
        mat->rows[i] = _mm256_loadu_ps(source + stride * i);
    }
}
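#ifdef LOAD_EXAMPLE /* hypothetical guard, compiled out by default */
/* A minimal sketch of the stride parameter (illustrative only, not part of the
 * tested code paths): pull the top-left 8x8 block out of a row-major 16x16
 * matrix by passing the parent row stride (16) as the stride argument. */
static void exampleLoadBlock(const float *big16x16)
{
    Matrix_64 block;
    loadMat64(&block, big16x16, 16); /* consecutive rows start 16 floats apart */
    (void)block;
}
#endif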
/** Stores 64 elements from the Matrix_64 struct into a destination array. Same
 * stride parameter as loadMat64. */
void storeMat64(const Matrix_64 *mat, float *dest, size_t stride)
{
    for (int i = 0; i < MAT_SIDE_LENGTH; i++)
    {
        _mm256_storeu_ps(dest + stride * i, mat->rows[i]);
    }
}
/** In-place transpose of an 8x8 matrix.
 *
 * Example of unpacklo and unpackhi; note that the AVX variants interleave
 * within each 128-bit lane rather than across the whole register. Inputs:
 * [00, 01, 02, 03, 04, 05, 06, 07]
 * [08, 09, 10, 11, 12, 13, 14, 15]
 *
 * unpacklo: [00, 08, 01, 09, 04, 12, 05, 13]
 * unpackhi: [02, 10, 03, 11, 06, 14, 07, 15]
 *
 * _MM_SHUFFLE(1, 0, 1, 0):
 * 1st & 2nd from tmp0,
 * 1st & 2nd from tmp2,
 * 5th & 6th from tmp0,
 * 5th & 6th from tmp2.
 *
 * _MM_SHUFFLE(3, 2, 3, 2):
 * 3rd & 4th from tmp0,
 * 3rd & 4th from tmp2,
 * 7th & 8th from tmp0,
 * 7th & 8th from tmp2.
 *
 * _mm256_permute2f128_ps(shuff0, shuff4, 0x20):
 * 1st half of shuff0,
 * 1st half of shuff4.
 *
 * _mm256_permute2f128_ps(shuff0, shuff4, 0x31):
 * 2nd half of shuff0,
 * 2nd half of shuff4.
 */
void transposeMat64(Matrix_64 *m8)
{
    // Unpack and interleave adjacent rows of the matrix
    __m256 tmp0 = _mm256_unpacklo_ps(m8->rows[0], m8->rows[1]);
    __m256 tmp1 = _mm256_unpackhi_ps(m8->rows[0], m8->rows[1]);
    __m256 tmp2 = _mm256_unpacklo_ps(m8->rows[2], m8->rows[3]);
    __m256 tmp3 = _mm256_unpackhi_ps(m8->rows[2], m8->rows[3]);
    __m256 tmp4 = _mm256_unpacklo_ps(m8->rows[4], m8->rows[5]);
    __m256 tmp5 = _mm256_unpackhi_ps(m8->rows[4], m8->rows[5]);
    __m256 tmp6 = _mm256_unpacklo_ps(m8->rows[6], m8->rows[7]);
    __m256 tmp7 = _mm256_unpackhi_ps(m8->rows[6], m8->rows[7]);
    // More interleaving across pairs of row-pairs
    __m256 shuff0 = _mm256_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 shuff1 = _mm256_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 shuff2 = _mm256_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 shuff3 = _mm256_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 shuff4 = _mm256_shuffle_ps(tmp4, tmp6, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 shuff5 = _mm256_shuffle_ps(tmp4, tmp6, _MM_SHUFFLE(3, 2, 3, 2));
    __m256 shuff6 = _mm256_shuffle_ps(tmp5, tmp7, _MM_SHUFFLE(1, 0, 1, 0));
    __m256 shuff7 = _mm256_shuffle_ps(tmp5, tmp7, _MM_SHUFFLE(3, 2, 3, 2));
    // Final 128-bit lane swap and store back into the struct
    m8->rows[0] = _mm256_permute2f128_ps(shuff0, shuff4, 0x20);
    m8->rows[1] = _mm256_permute2f128_ps(shuff1, shuff5, 0x20);
    m8->rows[2] = _mm256_permute2f128_ps(shuff2, shuff6, 0x20);
    m8->rows[3] = _mm256_permute2f128_ps(shuff3, shuff7, 0x20);
    m8->rows[4] = _mm256_permute2f128_ps(shuff0, shuff4, 0x31);
    m8->rows[5] = _mm256_permute2f128_ps(shuff1, shuff5, 0x31);
    m8->rows[6] = _mm256_permute2f128_ps(shuff2, shuff6, 0x31);
    m8->rows[7] = _mm256_permute2f128_ps(shuff3, shuff7, 0x31);
}
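#ifdef TRANSPOSE_DEMO /* hypothetical guard, compiled out by default */
/* A minimal sanity-check sketch for transposeMat64 (an assumed test, not taken
 * from the original harness): fill a contiguous 8x8 block with 0..63,
 * transpose it, and verify out[r][c] == src[c][r]. */
static int demoTransposeMat64(void)
{
    float src[64], out[64];
    for (int k = 0; k < 64; k++)
        src[k] = (float)k; /* src[r][c] = 8*r + c */
    Matrix_64 block;
    loadMat64(&block, src, MAT_SIDE_LENGTH); /* contiguous block: stride 8 */
    transposeMat64(&block);
    storeMat64(&block, out, MAT_SIDE_LENGTH);
    for (int r = 0; r < MAT_SIDE_LENGTH; r++)
        for (int c = 0; c < MAT_SIDE_LENGTH; c++)
            if (out[r * MAT_SIDE_LENGTH + c] != (float)(c * MAT_SIDE_LENGTH + r))
                return 1; /* mismatch */
    return 0; /* transpose verified */
}
#endif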
/** Transposes a larger matrix by tiling it into 8x8 blocks (Matrix_64).
 * Assumes m and n are multiples of MAT_SIDE_LENGTH; remainder rows and
 * columns are not handled. */
void transposeLargeMat(int m, int n, float *src, int rs_s, int cs_s, float *dst, int rs_d, int cs_d)
{
    int numBlocksM = m / MAT_SIDE_LENGTH;
    int numBlocksN = n / MAT_SIDE_LENGTH;
    // Loop through each 8x8 block
    for (int i = 0; i < numBlocksM; i++)
    {
        for (int j = 0; j < numBlocksN; j++)
        {
            Matrix_64 block;
            /** Handles stride combinations (rs_s, cs_s, rs_d, cs_d) such as
             * (2, 16, 16, 2). The main difference from the other cases is that
             * MAT_SIDE_LENGTH is subtracted from both offsets; without that,
             * the last 8 elements of each 8x8 block come out wrong. The block
             * itself is not transposed. */
            if (rs_s > 1 && rs_d > 1 && cs_s > 1 && cs_d > 1)
            {
                loadMat64(&block, src + (i * MAT_SIDE_LENGTH) + (j * MAT_SIDE_LENGTH * cs_s) - MAT_SIDE_LENGTH, cs_s);
                storeMat64(&block, dst + (i * MAT_SIDE_LENGTH) + (j * MAT_SIDE_LENGTH * cs_s) - MAT_SIDE_LENGTH, cs_s);
            }
            /** Handles strides such as (16, 1, 16, 1): both matrices are
             * row-major. Loading and storing are identical except that i and j
             * swap roles. Same pattern as the next case. */
            else if (cs_s == 1 && cs_d == 1)
            {
                loadMat64(&block, src + (i * MAT_SIDE_LENGTH * rs_s) + (j * MAT_SIDE_LENGTH), rs_s);
                transposeMat64(&block);
                storeMat64(&block, dst + (j * MAT_SIDE_LENGTH * rs_s) + (i * MAT_SIDE_LENGTH), rs_s);
            }
            /** Handles strides such as (1, 16, 1, 16): both matrices are
             * column-major. Same pattern as the previous case with i and j
             * swapped. */
            else if (rs_s == 1 && rs_d == 1)
            {
                loadMat64(&block, src + (i * MAT_SIDE_LENGTH) + (j * MAT_SIDE_LENGTH * cs_s), cs_s);
                transposeMat64(&block);
                storeMat64(&block, dst + (j * MAT_SIDE_LENGTH) + (i * MAT_SIDE_LENGTH * cs_s), cs_s);
            }
            /** Handles strides such as (16, 1, 1, 16): row-major source,
             * column-major destination. No per-block transpose is needed
             * because the change of storage order already transposes. Same
             * pattern as the next case. */
            else if (cs_s == 1 && rs_d == 1)
            {
                loadMat64(&block, src + (i * MAT_SIDE_LENGTH * rs_s) + (j * MAT_SIDE_LENGTH), rs_s);
                storeMat64(&block, dst + (j * MAT_SIDE_LENGTH) + (i * MAT_SIDE_LENGTH * rs_s), rs_s);
            }
            /** Handles strides such as (1, 16, 16, 1): column-major source,
             * row-major destination. No per-block transpose is needed; same
             * pattern as the previous case. */
            else if (rs_s == 1 && cs_d == 1)
            {
                loadMat64(&block, src + (i * MAT_SIDE_LENGTH * cs_s) + (j * MAT_SIDE_LENGTH), cs_s);
                storeMat64(&block, dst + (j * MAT_SIDE_LENGTH) + (i * MAT_SIDE_LENGTH * cs_s), cs_s);
            }
        }
    }
}
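#ifdef TILED_DEMO /* hypothetical guard, compiled out by default */
/* Hedged usage sketch for the row-major-to-row-major path (cs_s == cs_d == 1):
 * transpose a 16x16 matrix where consecutive rows are 16 floats apart in both
 * buffers. The 16x16 shape is an assumption chosen for illustration. */
static void demoTransposeLargeMat(float *src16x16, float *dst16x16)
{
    transposeLargeMat(16, 16, src16x16, 16, 1, dst16x16, 16, 1);
}
#endif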
#ifndef FUN_NAME
#define FUN_NAME baseline_transpose
#endif
void FUN_NAME(int m, int n, float *src, int rs_s, int cs_s, float *dst, int rs_d, int cs_d)
{
    transposeLargeMat(m, n, src, rs_s, cs_s, dst, rs_d, cs_d);
}
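/* Build note (an assumption, not taken from the original project files): this
 * translation unit uses AVX intrinsics, so it likely needs something like
 *   gcc -O2 -mavx -c tuned_variant01_op_02.c
 * and the entry point can be renamed with -DFUN_NAME=<name> at compile time. */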