Skip to content

Commit

Permalink
DotProductAVX: Unroll loops
Browse files Browse the repository at this point in the history
This improves performance significantly: each loop iteration now processes two groups of four doubles instead of one.

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed May 26, 2017
1 parent 9f03941 commit 52f4ed4
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion arch/dotproductavx.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,7 +42,7 @@ namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double* u, const double* v, int n) {
int max_offset = n - 3;
int max_offset = n - 7;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
// v, and multiplying them together in parallel.
Expand All @@ -59,6 +59,11 @@ double DotProductAVX(const double* u, const double* v, int n) {
__m256d product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
offset += 4;
floats1 = _mm256_load_pd(u + offset);
floats2 = _mm256_load_pd(v + offset);
product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
offset += 4;
} while (offset < max_offset);
} else {
do {
Expand All @@ -69,6 +74,11 @@ double DotProductAVX(const double* u, const double* v, int n) {
__m256d product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
offset += 4;
floats1 = _mm256_loadu_pd(u + offset);
floats2 = _mm256_loadu_pd(v + offset);
product = _mm256_mul_pd(floats1, floats2);
sum = _mm256_add_pd(sum, product);
offset += 4;
} while (offset < max_offset);
}
}
Expand Down

0 comments on commit 52f4ed4

Please sign in to comment.