Tfloat patch 3: implements DotProductSSE() for FAST_FLOAT #3493

Closed

Changes from all commits
17 changes: 17 additions & 0 deletions Makefile.am
@@ -146,39 +146,45 @@ noinst_LTLIBRARIES += libtesseract_native.la
libtesseract_native_la_CXXFLAGS = -O3 -ffast-math
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
+libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
endif
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp

if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
+libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
endif

if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
+libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif

if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
+libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
endif

if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
+libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
+libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
@@ -1230,6 +1236,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += dawg_test
endif # ENABLE_TRAINING
check_PROGRAMS += denorm_test
+check_PROGRAMS += dotproduct_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += equationdetect_test
endif # !DISABLED_LEGACY_ENGINE
@@ -1356,6 +1363,16 @@ denorm_test_SOURCES = unittest/denorm_test.cc
denorm_test_CPPFLAGS = $(unittest_CPPFLAGS)
denorm_test_LDADD = $(TESS_LIBS)

+dotproduct_test_SOURCES = unittest/dotproduct_test.cc
+dotproduct_test_CPPFLAGS = $(unittest_CPPFLAGS)
+if HAVE_AVX2
+dotproduct_test_CPPFLAGS += -DHAVE_AVX2
+endif
+if HAVE_SSE4_1
+dotproduct_test_CPPFLAGS += -DHAVE_SSE4_1
+endif
+dotproduct_test_LDADD = $(TESS_LIBS)

if !DISABLED_LEGACY_ENGINE
equationdetect_test_SOURCES = unittest/equationdetect_test.cc
equationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)
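The new dotproduct_test target builds unittest/dotproduct_test.cc with the HAVE_AVX2 / HAVE_SSE4_1 defines mirroring the build configuration. A minimal sketch of what such a test can verify, not the verbatim test from the repository, compares a SIMD kernel against DotProductNative on a length that exercises the remainder loop:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

#include "dotproduct.h"  // DotProductNative, DotProductSSE, ...

using namespace tesseract;

int main() {
  const int n = 103;  // deliberately not a multiple of any SIMD width
  std::vector<TFloat> u(n), v(n);
  for (int i = 0; i < n; ++i) {
    u[i] = static_cast<TFloat>((i % 7) - 3);
    v[i] = static_cast<TFloat>((i % 5) - 2);
  }
  const TFloat expected = DotProductNative(u.data(), v.data(), n);
#if defined(HAVE_SSE4_1)
  const TFloat sse = DotProductSSE(u.data(), v.data(), n);
  if (std::fabs(sse - expected) > 1e-4) {
    std::printf("SSE mismatch: %g vs %g\n", double(sse), double(expected));
    return 1;
  }
#endif
  std::printf("dot product ok: %g\n", double(expected));
  return 0;
}
```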
2 changes: 1 addition & 1 deletion src/arch/dotproduct.cpp
@@ -19,7 +19,7 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
-double DotProductNative(const double *u, const double *v, int n) {
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
  double total = 0.0;
  for (int k = 0; k < n; ++k) {
    total += u[k] * v[k];
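One detail worth noting: the unchanged body keeps a double accumulator even when TFloat is float, so long sums lose less precision than a pure-float loop would. A self-contained sketch of the same pattern:

```cpp
// Accumulating in double and narrowing only on return bounds the rounding
// error of the float variant. A sketch of the idea, not the project's
// exact code.
template <typename T>
T dot_double_acc(const T *u, const T *v, int n) {
  double total = 0.0;
  for (int k = 0; k < n; ++k) {
    total += static_cast<double>(u[k]) * v[k];
  }
  return static_cast<T>(total);
}
```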
14 changes: 10 additions & 4 deletions src/arch/dotproduct.h
@@ -17,19 +17,25 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_

#include "tfloat.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
-double DotProductNative(const double *u, const double *v, int n);
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// Uses Intel AVX intrinsics to access the SIMD instruction set.
-double DotProductAVX(const double *u, const double *v, int n);
+TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);

// Use Intel FMA.
-double DotProductFMA(const double *u, const double *v, int n);
+TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double *u, const double *v, int n);
+TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);

} // namespace tesseract.

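These declarations pull the TFloat alias from the new tfloat.h header. A minimal version consistent with this diff (the real src/ccutil/tfloat.h may differ in guards and licensing) looks like this:

```cpp
// Sketch of src/ccutil/tfloat.h as assumed by the signatures above.
#ifndef TESSERACT_CCUTIL_TFLOAT_H_
#define TESSERACT_CCUTIL_TFLOAT_H_

namespace tesseract {

#if defined(FAST_FLOAT)
using TFloat = float;   // smaller and faster LSTM arithmetic
#else
using TFloat = double;  // the historical default
#endif

}  // namespace tesseract

#endif  // TESSERACT_CCUTIL_TFLOAT_H_
```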
74 changes: 74 additions & 0 deletions src/arch/dotproductavx.cpp
@@ -29,6 +29,79 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
+#if defined(FAST_FLOAT)
+float DotProductAVX(const float *u, const float *v, int n) {
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256 t0 = _mm256_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256 f0 = _mm256_loadu_ps(u);
+    __m256 f1 = _mm256_loadu_ps(v);
+    f0 = _mm256_mul_ps(f0, f1);
+    t0 = _mm256_add_ps(t0, f0);
+    u += 8;
+    v += 8;
+  }
+  alignas(32) float tmp[8];
+  _mm256_store_ps(tmp, t0);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+float DotProductAVX1(const float *u, const float *v, int n) {
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
+  __m256 t0 = _mm256_setzero_ps();
+  __m256 t1 = _mm256_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256 f0 = _mm256_loadu_ps(u);
+    __m256 f1 = _mm256_loadu_ps(v);
+    __m256 f2 = _mm256_loadu_ps(u + 8);
+    __m256 f3 = _mm256_loadu_ps(v + 8);
+    f0 = _mm256_mul_ps(f0, f1);
+    f2 = _mm256_mul_ps(f2, f3);
+    t0 = _mm256_add_ps(t0, f0);
+    t1 = _mm256_add_ps(t1, f2);
+    u += 16;
+    v += 16;
+  }
+  t0 = _mm256_hadd_ps(t0, t1);
+  alignas(32) float tmp[8];
+  _mm256_store_ps(tmp, t0);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#else
+double DotProductAVX1(const double *u, const double *v, int n) {
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned quot = n / 8; quot > 0; quot--) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    __m256d f2 = _mm256_loadu_pd(u + 4);
+    __m256d f3 = _mm256_loadu_pd(v + 4);
+    f0 = _mm256_mul_pd(f0, f1);
+    f2 = _mm256_mul_pd(f2, f3);
+    t0 = _mm256_add_pd(t0, f0);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 8;
+    v += 8;
+  }
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned rem = n % 8; rem > 0; rem--) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+
double DotProductAVX(const double *u, const double *v, int n) {
  const unsigned quot = n / 8;
  const unsigned rem = n % 8;
@@ -57,6 +130,7 @@ double DotProductAVX(const double *u, const double *v, int n) {
  }
  return result;
}
+#endif

} // namespace tesseract.

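Only DotProductAVX1 gains a definition in this patch; the AVX2 to AVX4 declarations in dotproduct.h appear to anticipate further variants. The AVX1 paths fold two accumulators with a single horizontal add: _mm256_hadd_ps(a, b) adds adjacent pairs within each 128-bit lane, interleaving results from a and b, so summing all eight output lanes equals the sum of all sixteen input lanes. A scalar model of that shuffle, for reference:

```cpp
#include <array>

// Scalar model of _mm256_hadd_ps(a, b). Lanes 0-1 and 4-5 come from a,
// lanes 2-3 and 6-7 from b; pairs are added within each 128-bit half.
std::array<float, 8> hadd_ps_model(const std::array<float, 8> &a,
                                   const std::array<float, 8> &b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3],
          a[4] + a[5], a[6] + a[7], b[4] + b[5], b[6] + b[7]};
}
```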
29 changes: 29 additions & 0 deletions src/arch/dotproductfma.cpp
@@ -29,6 +29,34 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
+#if defined(FAST_FLOAT)
+TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
+  __m256 t0 = _mm256_setzero_ps();
+  __m256 t1 = _mm256_setzero_ps();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256 f0 = _mm256_loadu_ps(u);
+    __m256 f1 = _mm256_loadu_ps(v);
+    t0 = _mm256_fmadd_ps(f0, f1, t0);
+    u += 8;
+    v += 8;
+    __m256 f2 = _mm256_loadu_ps(u);
+    __m256 f3 = _mm256_loadu_ps(v);
+    t1 = _mm256_fmadd_ps(f2, f3, t1);
+    u += 8;
+    v += 8;
+  }
+  t0 = _mm256_hadd_ps(t0, t1);
+  alignas(32) float tmp[8];
+  _mm256_store_ps(tmp, t0);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
+  }
+  return result;
+}
+#else
double DotProductFMA(const double *u, const double *v, int n) {
  const unsigned quot = n / 8;
  const unsigned rem = n % 8;
@@ -55,6 +83,7 @@ double DotProductFMA(const double *u, const double *v, int n) {
  }
  return result;
}
+#endif

} // namespace tesseract.

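Note the float kernel above is shown here with corrected pointer increments and an 8-element spill buffer: _mm256_loadu_ps consumes 8 floats per load and _mm256_store_ps writes 8 floats, so advancing by 4 and storing into a float[4] would double-count elements and overflow the buffer. The fused multiply-add itself replaces the separate mul and add of the AVX kernel; each _mm256_fmadd_ps lane computes acc + x * y with a single rounding. A one-lane scalar model using the standard library:

```cpp
#include <cmath>

// One lane of _mm256_fmadd_ps(x, y, acc): multiply and accumulate with a
// single rounding, which std::fmaf also guarantees.
float fmadd_lane_model(float acc, float x, float y) {
  return std::fmaf(x, y, acc);
}
```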
63 changes: 62 additions & 1 deletion src/arch/dotproductsse.cpp
@@ -30,7 +30,67 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double *u, const double *v, int n) {
+#if defined(FAST_FLOAT)
+TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
+  int max_offset = n - 4;
+  int offset = 0;
+  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
+  // v, and multiplying them together in parallel.
+  __m128 sum = _mm_setzero_ps();
+  if (offset <= max_offset) {
+    offset = 4;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_ps(u);
+      __m128 floats2 = _mm_load_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_load_ps(u + offset);
+        floats2 = _mm_load_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
+    else {
+      // Use unaligned load.
+      sum = _mm_loadu_ps(u);
+      __m128 floats2 = _mm_loadu_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_loadu_ps(u + offset);
+        floats2 = _mm_loadu_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
+  }
+  // Add the 4 sums in sum horizontally.
+#if 0
+  alignas(32) TFloat tmp[4];
+  _mm_store_ps(tmp, sum);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+#else
+  __m128 zero = _mm_setzero_ps();
+  // https://www.felixcloutier.com/x86/haddps
+  sum = _mm_hadd_ps(sum, zero);
+  sum = _mm_hadd_ps(sum, zero);
+  // Extract the low result.
+  float result = _mm_cvtss_f32(sum);
+#endif
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
+}
+#else
+TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
  int max_offset = n - 2;
  int offset = 0;
  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
@@ -78,6 +138,7 @@ double DotProductSSE(const double *u, const double *v, int n) {
  }
  return result;
}
+#endif

} // namespace tesseract.

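The preferred reduction above (the #else branch) uses two hadd steps against a zero register instead of a round trip through memory. With sum = {s0, s1, s2, s3}, the first _mm_hadd_ps(sum, zero) yields {s0+s1, s2+s3, 0, 0}, the second yields {s0+s1+s2+s3, 0, 0, 0}, and _mm_cvtss_f32 reads the total from lane 0. A scalar model:

```cpp
// Scalar model of the two _mm_hadd_ps steps that reduce 4 partial sums.
float hadd_reduce_model(const float s[4]) {
  float first[2] = {s[0] + s[1], s[2] + s[3]};  // after hadd(sum, zero)
  return first[0] + first[1];                   // lane 0 after second hadd
}
```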
2 changes: 1 addition & 1 deletion src/arch/intsimdmatrix.cpp
@@ -76,7 +76,7 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t>
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
-                                    const std::vector<double> &scales, const int8_t *u, double *v) {
+                                    const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
  int num_out = w.dim1();
  int num_in = w.dim2() - 1;
  // Base implementation.
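For context, the base MatrixDotVector that this signature change touches computes one scaled int8 dot product per output row, with the bias weight standing in for the imagined trailing 1 in u. A hedged sketch with plain arrays in place of GENERIC_2D_ARRAY; the exact bias normalisation in the real code may differ:

```cpp
#include <cstdint>

#include "tfloat.h"

namespace tesseract {
// Sketch only: w holds num_out rows of (num_in + 1) int8 weights, the last
// entry of each row being the bias that multiplies the imagined trailing 1.
static void MatrixDotVectorSketch(int num_out, int num_in, const int8_t *w,
                                  const TFloat *scales, const int8_t *u,
                                  TFloat *v) {
  for (int i = 0; i < num_out; ++i) {
    const int8_t *wi = w + i * (num_in + 1);
    int total = 0;
    for (int j = 0; j < num_in; ++j) {
      total += wi[j] * u[j];  // int8 x int8 products accumulate exactly in int
    }
    v[i] = static_cast<TFloat>(total + wi[num_in]) * scales[i];  // bias, then scale
  }
}
}  // namespace tesseract
```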
18 changes: 10 additions & 8 deletions src/arch/intsimdmatrix.h
@@ -23,6 +23,8 @@
#include <cstdint>
#include <vector>

#include "tfloat.h"

namespace tesseract {

template <class T>
@@ -78,8 +80,8 @@ struct TESS_API IntSimdMatrix {
  // u is imagined to have an extra element at the end with value 1, to
  // implement the bias, but it doesn't actually have it.
  // Computes the base C++ implementation.
-  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
-                              const int8_t *u, double *v);
+  static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales,
+                              const int8_t *u, TFloat *v);

  // Rounds the input up to a multiple of the given factor.
  static int Roundup(int input, int factor) {
@@ -95,8 +97,8 @@ struct TESS_API IntSimdMatrix {
  // RoundInputs above.
  // The input will be over-read to the extent of the padding. There are no
  // alignment requirements.
-  using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
-                                           double *);
+  using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *,
+                                           TFloat *);
  MatrixDotVectorFunction matrixDotVectorFunction;

  // Number of 32 bit outputs held in each register.
@@ -112,10 +114,10 @@ struct TESS_API IntSimdMatrix {

  static const IntSimdMatrix *intSimdMatrix;
  // Only available with NEON.
-  static const IntSimdMatrix intSimdMatrixNEON;
-  // Only available with AVX2 / SSE.
-  static const IntSimdMatrix intSimdMatrixAVX2;
-  static const IntSimdMatrix intSimdMatrixSSE;
+  static const IntSimdMatrix *intSimdMatrixNEON;
+  // Only available with AVX2 / AVX / FMA / SSE.
+  static const IntSimdMatrix *intSimdMatrixAVX2;
+  static const IntSimdMatrix *intSimdMatrixSSE;
};

} // namespace tesseract
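Turning the per-ISA instances into pointers lets a build that lacks a given SIMD flavour leave the corresponding symbol null instead of linking an unusable object. A sketch of how runtime detection can then wire up the dispatch; the helper and its flags are assumptions here, not the project's actual simddetect interface:

```cpp
#include "intsimdmatrix.h"

namespace tesseract {
// Hypothetical init helper: pick the best available implementation at
// startup. The have_* flags would come from CPUID-style detection.
static void ChooseIntSimdMatrix(bool have_avx2, bool have_sse, bool have_neon) {
  const IntSimdMatrix *chosen = nullptr;
  if (have_avx2) {
    chosen = IntSimdMatrix::intSimdMatrixAVX2;
  } else if (have_sse) {
    chosen = IntSimdMatrix::intSimdMatrixSSE;
  } else if (have_neon) {
    chosen = IntSimdMatrix::intSimdMatrixNEON;
  }
  // chosen may stay null, in which case callers fall back to the base
  // C++ MatrixDotVector. The real assignment site lives in simddetect.cpp.
  IntSimdMatrix::intSimdMatrix = chosen;
}
}  // namespace tesseract
```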