From 30bf2635fd6a31e534e354608052db415ea12ba9 Mon Sep 17 00:00:00 2001 From: Ger Hobbelt Date: Tue, 13 Jul 2021 08:59:26 +0200 Subject: [PATCH] bugfix of FMA port to FAST_FLOAT: 8 float FPs fit in a single 256bit vector (8x32) (contrasting 4 double FPs: 4*64) --- src/arch/dotproductfma.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp index 32154283ae..6afaefd3eb 100644 --- a/src/arch/dotproductfma.cpp +++ b/src/arch/dotproductfma.cpp @@ -31,26 +31,26 @@ namespace tesseract { // Uses Intel FMA intrinsics to access the SIMD instruction set. #if defined(FAST_FLOAT) TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) { - const unsigned quot = n / 8; - const unsigned rem = n % 8; + const unsigned quot = n / 16; + const unsigned rem = n % 16; __m256 t0 = _mm256_setzero_ps(); __m256 t1 = _mm256_setzero_ps(); for (unsigned k = 0; k < quot; k++) { __m256 f0 = _mm256_loadu_ps(u); __m256 f1 = _mm256_loadu_ps(v); t0 = _mm256_fmadd_ps(f0, f1, t0); - u += 4; - v += 4; + u += 8; + v += 8; __m256 f2 = _mm256_loadu_ps(u); __m256 f3 = _mm256_loadu_ps(v); t1 = _mm256_fmadd_ps(f2, f3, t1); - u += 4; - v += 4; + u += 8; + v += 8; } t0 = _mm256_hadd_ps(t0, t1); - alignas(32) float tmp[4]; + alignas(32) TFloat tmp[8]; _mm256_store_ps(tmp, t0); - float result = tmp[0] + tmp[1] + tmp[2] + tmp[3]; + TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7]; for (unsigned k = 0; k < rem; k++) { result += *u++ * *v++; }