From 29e2379b56a503d390dcb96af26874e236a9c8ac Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Tue, 13 Jul 2021 09:20:39 +0200
Subject: [PATCH] extracted from 3490: implements DotProductSSE() for FAST_FLOAT

---
 src/arch/dotproductsse.cpp | 64 ++++++++++++++++++++++++++++++++++----
 src/arch/intsimdmatrix.h   |  2 +-
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index ec94f50341..9122e9d1b1 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -31,12 +31,63 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
-  TFloat total = 0.0;
-  for (int k = 0; k < n; ++k) {
-    total += u[k] * v[k];
+float DotProductSSE(const float *u, const float *v, int n) {
+  int max_offset = n - 4;
+  int offset = 0;
+  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
+  // v, and multiplying them together in parallel.
+  __m128 sum = _mm_setzero_ps();
+  if (offset <= max_offset) {
+    offset = 4;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_ps(u);
+      __m128 floats2 = _mm_load_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_load_ps(u + offset);
+        floats2 = _mm_load_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    } else {
+      // Use unaligned load.
+      sum = _mm_loadu_ps(u);
+      __m128 floats2 = _mm_loadu_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_loadu_ps(u + offset);
+        floats2 = _mm_loadu_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
   }
-  return total;
+  // Add the 4 sums in sum horizontally.
+#if 0
+  alignas(32) float tmp[4];
+  _mm_store_ps(tmp, sum);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+#else
+  __m128 zero = _mm_setzero_ps();
+  // https://www.felixcloutier.com/x86/haddps
+  sum = _mm_hadd_ps(sum, zero);
+  sum = _mm_hadd_ps(sum, zero);
+  // Extract the low result.
+  float result = _mm_cvtss_f32(sum);
+#endif
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
 }
 #else
 double DotProductSSE(const double *u, const double *v, int n) {
@@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
   if (offset <= max_offset) {
     offset = 2;
     // Aligned load is reputedly faster but requires 16 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
       // Use aligned load.
       sum = _mm_load_pd(u);
       __m128d floats2 = _mm_load_pd(v);
diff --git a/src/arch/intsimdmatrix.h b/src/arch/intsimdmatrix.h
index aa05a450ee..98a894f0ca 100644
--- a/src/arch/intsimdmatrix.h
+++ b/src/arch/intsimdmatrix.h
@@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
   static const IntSimdMatrix *intSimdMatrix;
   // Only available with NEON.
   static const IntSimdMatrix *intSimdMatrixNEON;
-  // Only available with AVX2 / SSE.
+  // Only available with AVX2 / AVX / FMA / SSE.
   static const IntSimdMatrix *intSimdMatrixAVX2;
   static const IntSimdMatrix *intSimdMatrixSSE;
 };
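
Note (editor's sketch, not part of the patch): the #else branch above folds the
four partial sums with two _mm_hadd_ps passes against a zero register. A minimal
standalone illustration of that reduction, assuming an SSE3-capable toolchain
(e.g. gcc/clang with -msse3):

#include <cstdio>
#include <pmmintrin.h> // SSE3: _mm_hadd_ps

int main() {
  // _mm_set_ps fills the highest lane first, so the lanes hold {1, 2, 3, 4}.
  __m128 sum = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 zero = _mm_setzero_ps();
  sum = _mm_hadd_ps(sum, zero); // lanes: {1+2, 3+4, 0, 0}
  sum = _mm_hadd_ps(sum, zero); // lanes: {1+2+3+4, 0, 0, 0}
  std::printf("%f\n", _mm_cvtss_f32(sum)); // prints 10.000000
  return 0;
}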
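
Note (editor's sketch, not part of the patch): a quick sanity check for the new
float path is to compare it against a plain scalar loop on a length that is not
a multiple of 4, so both the SIMD loop and the left-over tail execute. The
declaration below mirrors the one in src/arch/dotproduct.h of the tesseract tree
(with FAST_FLOAT, TFloat is float); adjust the include/linkage to your setup:

#include <cstdio>
#include <vector>

namespace tesseract {
// Normally pulled in from src/arch/dotproduct.h; repeated here so the sketch
// is self-contained.
float DotProductSSE(const float *u, const float *v, int n);
} // namespace tesseract

int main() {
  const int n = 11; // deliberately not a multiple of 4
  std::vector<float> u(n), v(n);
  for (int i = 0; i < n; ++i) {
    u[i] = 0.5f * i;
    v[i] = 1.0f - 0.25f * i;
  }
  float scalar = 0.0f;
  for (int i = 0; i < n; ++i) {
    scalar += u[i] * v[i];
  }
  const float simd = tesseract::DotProductSSE(u.data(), v.data(), n);
  std::printf("scalar=%f simd=%f\n", scalar, simd); // values should match
  return 0;
}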