From 30bf2635fd6a31e534e354608052db415ea12ba9 Mon Sep 17 00:00:00 2001
From: Ger Hobbelt <ger@hobbelt.com>
Date: Tue, 13 Jul 2021 08:59:26 +0200
Subject: [PATCH] bugfix of FMA port to FAST_FLOAT: 8 float FPs fit in a single
 256bit vector (8x32) (contrasting 4 double FPs: 4*64)

---
 src/arch/dotproductfma.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
index 32154283ae..6afaefd3eb 100644
--- a/src/arch/dotproductfma.cpp
+++ b/src/arch/dotproductfma.cpp
@@ -31,26 +31,26 @@ namespace tesseract {
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
 TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
-  const unsigned quot = n / 8;
-  const unsigned rem = n % 8;
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
   __m256 t1 = _mm256_setzero_ps();
   for (unsigned k = 0; k < quot; k++) {
     __m256 f0 = _mm256_loadu_ps(u);
     __m256 f1 = _mm256_loadu_ps(v);
     t0 = _mm256_fmadd_ps(f0, f1, t0);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
     __m256 f2 = _mm256_loadu_ps(u);
     __m256 f3 = _mm256_loadu_ps(v);
     t1 = _mm256_fmadd_ps(f2, f3, t1);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) float tmp[4];
+  alignas(32) TFloat tmp[8];
   _mm256_store_ps(tmp, t0);
-  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }