Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extracted from 3490: implements DotProductSSE() for FAST_FLOAT #3

Merged
merged 1 commit into from
Jul 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 58 additions & 6 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,63 @@ namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0.0;
for (int k = 0; k < n; ++k) {
total += u[k] * v[k];
// Single-precision variant: returns the dot product of the first n elements
// of u and v. Full groups of 4 floats are multiplied and accumulated in a
// 128-bit register; any remaining 0-3 elements are added with scalar code.
float DotProductSSE(const float *u, const float *v, int n) {
// Largest offset at which a full group of 4 floats still fits within n.
int max_offset = n - 4;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
// v, and multiplying them together in parallel.
__m128 sum = _mm_setzero_ps();
// Only entered when n >= 4, i.e. at least one full group of 4 exists.
if (offset <= max_offset) {
// The first group is handled outside the loops below, so they start at 4.
offset = 4;
// Aligned load is reputedly faster but requires 16 byte aligned input.
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
(reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// Use aligned load.
// The first product initializes sum directly instead of being added to zero.
sum = _mm_load_ps(u);
__m128 floats2 = _mm_load_ps(v);
// Multiply.
sum = _mm_mul_ps(sum, floats2);
// Multiply-accumulate each remaining full group of 4.
while (offset <= max_offset) {
__m128 floats1 = _mm_load_ps(u + offset);
floats2 = _mm_load_ps(v + offset);
floats1 = _mm_mul_ps(floats1, floats2);
sum = _mm_add_ps(sum, floats1);
offset += 4;
}
} else {
// Use unaligned load.
// Same sequence as the aligned branch, with _mm_loadu_ps for every load.
sum = _mm_loadu_ps(u);
__m128 floats2 = _mm_loadu_ps(v);
// Multiply.
sum = _mm_mul_ps(sum, floats2);
while (offset <= max_offset) {
__m128 floats1 = _mm_loadu_ps(u + offset);
floats2 = _mm_loadu_ps(v + offset);
floats1 = _mm_mul_ps(floats1, floats2);
sum = _mm_add_ps(sum, floats1);
offset += 4;
}
}
}
// NOTE(review): `total` is not declared anywhere in this function. This line
// looks like a leftover from the scalar implementation that this code
// replaces; as written it would also bypass the horizontal sum and the
// left-over loop below. Confirm against the actual source file.
return total;
// Add the 4 sums in sum horizontally.
#if 0
// Disabled alternative: spill the register to memory and add the 4 lanes.
alignas(32) float tmp[4];
_mm_store_ps(tmp, sum);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#else
__m128 zero = _mm_setzero_ps();
// https://www.felixcloutier.com/x86/haddps
// First hadd leaves (s0 + s1, s2 + s3) in the two low lanes; the second
// leaves s0 + s1 + s2 + s3 in the lowest lane.
// NOTE(review): _mm_hadd_ps is an SSE3 intrinsic - confirm the build flags
// for this file enable it.
sum = _mm_hadd_ps(sum, zero);
sum = _mm_hadd_ps(sum, zero);
// Extract the low result.
float result = _mm_cvtss_f32(sum);
#endif
// Add on any left-over products.
// For n < 4 offset is still 0 here, so this loop computes the whole result.
while (offset < n) {
result += u[offset] * v[offset];
++offset;
}
return result;
}
#else
double DotProductSSE(const double *u, const double *v, int n) {
Expand All @@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
if (offset <= max_offset) {
offset = 2;
// Aligned load is reputedly faster but requires 16 byte aligned input.
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
(reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// Use aligned load.
sum = _mm_load_pd(u);
__m128d floats2 = _mm_load_pd(v);
Expand Down
2 changes: 1 addition & 1 deletion src/arch/intsimdmatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
static const IntSimdMatrix *intSimdMatrix;
// Only available with NEON.
static const IntSimdMatrix *intSimdMatrixNEON;
// Only available with AVX2 / SSE.
// Only available with AVX2 / AVX / FMA / SSE.
static const IntSimdMatrix *intSimdMatrixAVX2;
static const IntSimdMatrix *intSimdMatrixSSE;
};
Expand Down