From ef4d5b2e696ded0418925032ff0977215565f88e Mon Sep 17 00:00:00 2001
From: Stefan Weil
Date: Wed, 20 Feb 2019 17:41:49 +0100
Subject: [PATCH] Optimize calculation of dot product for double vectors with AVX

This improves the performance with best models and should
also make training faster.

Signed-off-by: Stefan Weil
---
 src/arch/dotproductavx.cpp | 86 ++++++++++----------------------------
 1 file changed, 23 insertions(+), 63 deletions(-)

diff --git a/src/arch/dotproductavx.cpp b/src/arch/dotproductavx.cpp
index ac2c2f60c7..545b2f059c 100644
--- a/src/arch/dotproductavx.cpp
+++ b/src/arch/dotproductavx.cpp
@@ -29,70 +29,30 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 double DotProductAVX(const double* u, const double* v, int n) {
-  int max_offset = n - 4;
-  int offset = 0;
-  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
-  // v, and multiplying them together in parallel.
-  __m256d sum = _mm256_setzero_pd();
-  if (offset <= max_offset) {
-    offset = 4;
-    // Aligned load is reputedly faster but requires 32 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
-        (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
-      // Use aligned load.
-      __m256d floats1 = _mm256_load_pd(u);
-      __m256d floats2 = _mm256_load_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_load_pd(u + offset);
-        floats2 = _mm256_load_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    } else {
-      // Use unaligned load.
-      __m256d floats1 = _mm256_loadu_pd(u);
-      __m256d floats2 = _mm256_loadu_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_loadu_pd(u + offset);
-        floats2 = _mm256_loadu_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    }
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    f0 = _mm256_mul_pd(f0, f1);
+    t0 = _mm256_add_pd(t0, f0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    f2 = _mm256_mul_pd(f2, f3);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 4;
+    v += 4;
   }
-  // Add the 4 product sums together horizontally. Not so easy as with sse, as
-  // there is no add across the upper/lower 128 bit boundary, so permute to
-  // move the upper 128 bits to lower in another register.
-  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
-  sum = _mm256_hadd_pd(sum, sum2);
-  sum = _mm256_hadd_pd(sum, sum);
-  double result;
-  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
-  // instruction, as that introduces a 70 cycle delay. All this casting is to
-  // fool the intrinsics into thinking we are extracting the bottom int64.
-  auto cast_sum = _mm256_castpd_si256(sum);
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-  *(reinterpret_cast<int64_t*>(&result)) =
-#if defined(_WIN32) || defined(__i386__)
-      // This is a very simple workaround that is activated
-      // for all platforms that do not have _mm256_extract_epi64.
-      // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
-      ((uint64_t*)&cast_sum)[0]
-#else
-      _mm256_extract_epi64(cast_sum, 0)
-#endif
-      ;
-#pragma GCC diagnostic pop
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
   }
   return result;
 }
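
Note (not part of the patch itself): the least obvious step in the new kernel is folding the two __m256d accumulators into a scalar with a single _mm256_hadd_pd followed by an aligned store and a four-way add. The standalone sketch below, using a hypothetical helper named horizontal_sum, reproduces just that reduction so it can be checked in isolation; compile with -mavx (GCC/Clang) or /arch:AVX (MSVC).

#include <immintrin.h>
#include <cstdio>

// Reduces two 4-lane accumulators to one double the same way the patched
// DotProductAVX does. _mm256_hadd_pd pairs lanes within each 128-bit half,
// producing {t0[0]+t0[1], t1[0]+t1[1], t0[2]+t0[3], t1[2]+t1[3]}, which is
// then stored to a 32-byte aligned buffer and summed.
static double horizontal_sum(__m256d t0, __m256d t1) {
  t0 = _mm256_hadd_pd(t0, t1);
  alignas(32) double tmp[4];
  _mm256_store_pd(tmp, t0);
  return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}

int main() {
  __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);  // lanes {1, 2, 3, 4}
  __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);  // lanes {5, 6, 7, 8}
  printf("%f\n", horizontal_sum(a, b));           // expected: 36.000000
  return 0;
}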