Optimize calculation of dot product for double vectors with AVX #2257

Merged 1 commit on Feb 21, 2019
86 changes: 23 additions & 63 deletions src/arch/dotproductavx.cpp
@@ -29,70 +29,30 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 double DotProductAVX(const double* u, const double* v, int n) {
-  int max_offset = n - 4;
-  int offset = 0;
-  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
-  // v, and multiplying them together in parallel.
-  __m256d sum = _mm256_setzero_pd();
-  if (offset <= max_offset) {
-    offset = 4;
-    // Aligned load is reputedly faster but requires 32 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
-        (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
-      // Use aligned load.
-      __m256d floats1 = _mm256_load_pd(u);
-      __m256d floats2 = _mm256_load_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_load_pd(u + offset);
-        floats2 = _mm256_load_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    } else {
-      // Use unaligned load.
-      __m256d floats1 = _mm256_loadu_pd(u);
-      __m256d floats2 = _mm256_loadu_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_loadu_pd(u + offset);
-        floats2 = _mm256_loadu_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    }
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    f0 = _mm256_mul_pd(f0, f1);
+    t0 = _mm256_add_pd(t0, f0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    f2 = _mm256_mul_pd(f2, f3);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 4;
+    v += 4;
   }
-  // Add the 4 product sums together horizontally. Not so easy as with sse, as
-  // there is no add across the upper/lower 128 bit boundary, so permute to
-  // move the upper 128 bits to lower in another register.
-  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
-  sum = _mm256_hadd_pd(sum, sum2);
-  sum = _mm256_hadd_pd(sum, sum);
-  double result;
-  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
-  // instruction, as that introduces a 70 cycle delay. All this casting is to
-  // fool the intrinsics into thinking we are extracting the bottom int64.
-  auto cast_sum = _mm256_castpd_si256(sum);
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-  *(reinterpret_cast<int64_t*>(&result)) =
-#if defined(_WIN32) || defined(__i386__)
-      // This is a very simple workaround that is activated
-      // for all platforms that do not have _mm256_extract_epi64.
-      // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
-      ((uint64_t*)&cast_sum)[0]
-#else
-      _mm256_extract_epi64(cast_sum, 0)
-#endif
-      ;
-#pragma GCC diagnostic pop
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
   }
   return result;
 }
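The replacement kernel consumes eight doubles per iteration into two independent accumulators (t0 and t1), folds them together with one _mm256_hadd_pd, stores the four lanes to an aligned temporary, and sums those plus the remaining n % 8 elements in scalar code. This drops the aligned/unaligned load branch and the _mm256_extract_epi64 workaround of the old code. A minimal standalone harness along the following lines (illustrative only, not part of this PR; the file names, test values, and scalar baseline are assumptions) can be used to sanity-check DotProductAVX against a plain scalar loop:

// Minimal sanity-check harness (illustrative; not part of this PR).
// Assumed build: g++ -O2 -mavx check_dotproduct.cpp dotproductavx.cpp
#include <cstdio>
#include <vector>

namespace tesseract {
// Prototype of the function changed in this PR.
double DotProductAVX(const double* u, const double* v, int n);
}

// Plain scalar reference used only for comparison.
static double DotProductScalar(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int i = 0; i < n; ++i) total += u[i] * v[i];
  return total;
}

int main() {
  const int n = 1003;  // not a multiple of 8, so the scalar tail loop runs
  std::vector<double> u(n), v(n);
  for (int i = 0; i < n; ++i) {
    u[i] = 0.5 * i;
    v[i] = 1.0 / (i + 1);
  }
  const double avx = tesseract::DotProductAVX(u.data(), v.data(), n);
  const double ref = DotProductScalar(u.data(), v.data(), n);
  std::printf("avx = %.12f  scalar = %.12f  diff = %g\n", avx, ref, avx - ref);
  return 0;
}

Because the vectorized version sums products in a different order, expect tiny floating-point differences from the scalar result rather than bit-exact equality.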