DotProductAVX: Unroll loops

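Process two 4-double AVX vectors (8 elements) per loop iteration instead of
one, in both the aligned and the unaligned load loops, and adjust the loop
bound accordingly. This improves the performance significantly.

Signed-off-by: Stefan Weil <[email protected]>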
tesseract-ocr · May 26, 2017 · 52f4ed4 · 52f4ed4
1 parent 9f03941
commit 52f4ed4
Showing 1 changed file with 11 additions and 1 deletion.
diff --git a/arch/dotproductavx.cpp b/arch/dotproductavx.cpp
@@ -42,7 +42,7 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 double DotProductAVX(const double* u, const double* v, int n) {
-  int max_offset = n - 3;
+  int max_offset = n - 7;
   int offset = 0;
   // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
   // v, and multiplying them together in parallel.
@@ -59,6 +59,11 @@ double DotProductAVX(const double* u, const double* v, int n) {
         __m256d product = _mm256_mul_pd(floats1, floats2);
         sum = _mm256_add_pd(sum, product);
         offset += 4;
+        floats1 = _mm256_load_pd(u + offset);
+        floats2 = _mm256_load_pd(v + offset);
+        product = _mm256_mul_pd(floats1, floats2);
+        sum = _mm256_add_pd(sum, product);
+        offset += 4;
       } while (offset < max_offset);
     } else {
       do {
@@ -69,6 +74,11 @@ double DotProductAVX(const double* u, const double* v, int n) {
         __m256d product = _mm256_mul_pd(floats1, floats2);
         sum = _mm256_add_pd(sum, product);
         offset += 4;
+        floats1 = _mm256_loadu_pd(u + offset);
+        floats2 = _mm256_loadu_pd(v + offset);
+        product = _mm256_mul_pd(floats1, floats2);
+        sum = _mm256_add_pd(sum, product);
+        offset += 4;
       } while (offset < max_offset);
     }
   }