just a couple of 'shadowed local variables' compiler warning fixes #9

Closed · wants to merge 18 commits
23 changes: 23 additions & 0 deletions Makefile.am
@@ -147,41 +147,53 @@ libtesseract_native_la_CXXFLAGS = -O3 -ffast-math
if OPENMP_SIMD
libtesseract_native_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD
endif
if HAVE_AVX
libtesseract_native_la_CXXFLAGS += -mavx
endif
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
endif
libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp

if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
endif

if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
endif

if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
endif

if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
if OPENMP_SIMD
libtesseract_sse_la_CXXFLAGS += -fopenmp-simd -DOPENMP_SIMD
endif
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
@@ -1233,6 +1245,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += dawg_test
endif # ENABLE_TRAINING
check_PROGRAMS += denorm_test
check_PROGRAMS += dotproduct_test
if !DISABLED_LEGACY_ENGINE
check_PROGRAMS += equationdetect_test
endif # !DISABLED_LEGACY_ENGINE
@@ -1359,6 +1372,16 @@ denorm_test_SOURCES = unittest/denorm_test.cc
denorm_test_CPPFLAGS = $(unittest_CPPFLAGS)
denorm_test_LDADD = $(TESS_LIBS)

dotproduct_test_SOURCES = unittest/dotproduct_test.cc
dotproduct_test_CPPFLAGS = $(unittest_CPPFLAGS)
if HAVE_AVX2
dotproduct_test_CPPFLAGS += -DHAVE_AVX2
endif
if HAVE_SSE4_1
dotproduct_test_CPPFLAGS += -DHAVE_SSE4_1
endif
dotproduct_test_LDADD = $(TESS_LIBS)

if !DISABLED_LEGACY_ENGINE
equationdetect_test_SOURCES = unittest/equationdetect_test.cc
equationdetect_test_CPPFLAGS = $(unittest_CPPFLAGS)
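A note on why each SIMD kernel gets its own convenience library above: a translation unit compiled with `-mavx` or `-mfma` may emit those instructions anywhere in it, so only the kernels chosen at runtime are isolated behind the ISA flags, and everything else stays portable. A hedged sketch of the selection logic (tesseract's real dispatcher lives in src/arch/simddetect.cpp; the names and structure here are assumed for illustration):

```cpp
#include "dotproduct.h"

namespace tesseract {

using DotProductFn = TFloat (*)(const TFloat *, const TFloat *, int);

// Hypothetical runtime dispatch: pick the best kernel the CPU supports.
static DotProductFn PickDotProduct() {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  if (__builtin_cpu_supports("fma")) return DotProductFMA;     // built with -mfma
  if (__builtin_cpu_supports("avx")) return DotProductAVX;     // built with -mavx
  if (__builtin_cpu_supports("sse4.1")) return DotProductSSE;  // built with -msse4.1
#endif
  return DotProductNative;  // portable fallback, no special flags
}

}  // namespace tesseract
```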
11 changes: 6 additions & 5 deletions configure.ac
@@ -297,13 +297,14 @@ OPENCL_CPPFLAGS=''
OPENCL_LDFLAGS=''
case "${host_os}" in
*darwin* | *-macos10*)
echo "checking for OpenCL framework"
MY_CHECK_FRAMEWORK([OpenCL])
if test $my_cv_framework_OpenCL = yes; then
have_opencl_lib=true
MY_CHECK_FRAMEWORK([Accelerate])
if test $my_cv_framework_Accelerate = yes; then
AM_CPPFLAGS="-DHAVE_FRAMEWORK_ACCELERATE $AM_CPPFLAGS"
LDFLAGS="$LDFLAGS -framework Accelerate"
fi
MY_CHECK_FRAMEWORK([OpenCL])
if test "$enable_opencl" = "yes"; then
if !($have_opencl_lib); then
if test $my_cv_framework_OpenCL = no; then
AC_MSG_ERROR([Required OpenCL library not found!])
fi
AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS"
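The reordered block above probes Apple's Accelerate framework before OpenCL and defines HAVE_FRAMEWORK_ACCELERATE. The DotProductAccelerate declared in dotproduct.h is presumably backed by it; its implementation is not part of this diff, but a minimal sketch could route through Accelerate's CBLAS (hypothetical body, assuming TFloat is float under FAST_FLOAT and double otherwise):

```cpp
#include <Accelerate/Accelerate.h>

#include "tfloat.h"

namespace tesseract {

// Sketch only: delegate the dot product to Accelerate's BLAS routines.
TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n) {
#if defined(FAST_FLOAT)
  return cblas_sdot(n, u, 1, v, 1);  // single-precision BLAS dot product
#else
  return cblas_ddot(n, u, 1, v, 1);  // double-precision BLAS dot product
#endif
}

}  // namespace tesseract
```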
1 change: 0 additions & 1 deletion src/api/hocrrenderer.cpp
@@ -319,7 +319,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
hocr_str << "\n </span>";
++scnt;
} else if (lstm_choice_mode == 2) {
-tesseract::ChoiceIterator ci(*res_it);
hocr_str << "\n <span class='ocrx_cinfo'"
<< " id='"
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt << "'>";
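For context, the declaration removed above shadowed a ChoiceIterator already defined in an enclosing scope, which is exactly the pattern the PR title refers to and that `-Wshadow` flags. A minimal illustration of the warning class (not tesseract code):

```cpp
// Compile with -Wshadow to reproduce the diagnostic this PR silences.
int Sum(const int *a, int n) {
  int total = 0;
  for (int i = 0; i < n; ++i) {
    int total = a[i];  // warning: declaration of 'total' shadows a previous local
    (void)total;       // the outer 'total' is unreachable inside this scope
  }
  return total;  // still 0: the inner assignments never touched it
}
```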
6 changes: 3 additions & 3 deletions src/arch/dotproduct.cpp
@@ -19,12 +19,12 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
-double DotProductNative(const double *u, const double *v, int n) {
-double total = 0.0;
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
+TFloat total = 0;
#if defined(OPENMP_SIMD)
#pragma omp simd reduction(+:total)
#endif
-for (int k = 0; k < n; ++k) {
+for (int k = 0; k < n; k++) {
total += u[k] * v[k];
}
return total;
15 changes: 11 additions & 4 deletions src/arch/dotproduct.h
@@ -17,20 +17,27 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_

#include "tfloat.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
-double DotProductNative(const double *u, const double *v, int n);
+TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// Uses Intel AVX intrinsics to access the SIMD instruction set.
-double DotProductAVX(const double *u, const double *v, int n);
+TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX1(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX2(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX3(const TFloat *u, const TFloat *v, int n);
+TFloat DotProductAVX4(const TFloat *u, const TFloat *v, int n);

// Use Intel FMA.
-double DotProductFMA(const double *u, const double *v, int n);
+TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
-double DotProductSSE(const double *u, const double *v, int n);
+TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);

+TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n);
} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
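Every signature here switches between single and double precision through the TFloat alias pulled in by the new `#include "tfloat.h"`. In essence that header amounts to the following (a sketch; the real src/ccutil/tfloat.h may differ in detail):

```cpp
// tfloat.h, in essence: one alias controls the precision of every kernel.
#if defined(FAST_FLOAT)
using TFloat = float;   // faster, half the memory bandwidth
#else
using TFloat = double;  // historical default precision
#endif
```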
74 changes: 74 additions & 0 deletions src/arch/dotproductavx.cpp
@@ -29,6 +29,79 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductAVX(const float *u, const float *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
__m256 t0 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
f0 = _mm256_mul_ps(f0, f1);
t0 = _mm256_add_ps(t0, f0);
u += 8;
v += 8;
}
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
float DotProductAVX1(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
__m256 t1 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
__m256 f2 = _mm256_loadu_ps(u + 8);
__m256 f3 = _mm256_loadu_ps(v + 8);
f0 = _mm256_mul_ps(f0, f1);
f2 = _mm256_mul_ps(f2, f3);
t0 = _mm256_add_ps(t0, f0);
t1 = _mm256_add_ps(t1, f2);
u += 16;
v += 16;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else
double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
__m256d t1 = _mm256_setzero_pd();
for (unsigned quot = n / 8; quot > 0; quot--) {
__m256d f0 = _mm256_loadu_pd(u);
__m256d f1 = _mm256_loadu_pd(v);
__m256d f2 = _mm256_loadu_pd(u + 4);
__m256d f3 = _mm256_loadu_pd(v + 4);
f0 = _mm256_mul_pd(f0, f1);
f2 = _mm256_mul_pd(f2, f3);
t0 = _mm256_add_pd(t0, f0);
t1 = _mm256_add_pd(t1, f2);
u += 8;
v += 8;
}
t0 = _mm256_hadd_pd(t0, t1);
alignas(32) double tmp[4];
_mm256_store_pd(tmp, t0);
double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
for (unsigned rem = n % 8; rem > 0; rem--) {
result += *u++ * *v++;
}
return result;
}

double DotProductAVX(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
@@ -57,6 +130,7 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

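One subtlety in the DotProductAVX1 variants above: `_mm256_hadd_pd` does not sum all four lanes of one register; it adds adjacent pairs within each 128-bit half, interleaving its two inputs. A scalar model of the final reduction (illustration only, not library code):

```cpp
#include <array>

// Models _mm256_hadd_pd(t0, t1) followed by the scalar sum of its lanes:
// hadd produces { t0[0]+t0[1], t1[0]+t1[1], t0[2]+t0[3], t1[2]+t1[3] },
// so summing those four elements folds both accumulators into one total.
double ReduceTwoAccumulators(const std::array<double, 4> &t0,
                             const std::array<double, 4> &t1) {
  std::array<double, 4> h = {t0[0] + t0[1], t1[0] + t1[1],
                             t0[2] + t0[3], t1[2] + t1[3]};
  return h[0] + h[1] + h[2] + h[3];
}
```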
29 changes: 29 additions & 0 deletions src/arch/dotproductfma.cpp
@@ -29,6 +29,34 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
__m256 t1 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
t0 = _mm256_fmadd_ps(f0, f1, t0);
u += 8;
v += 8;
__m256 f2 = _mm256_loadu_ps(u);
__m256 f3 = _mm256_loadu_ps(v);
t1 = _mm256_fmadd_ps(f2, f3, t1);
u += 8;
v += 8;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) TFloat tmp[8];
_mm256_store_ps(tmp, t0);
TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else
double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
@@ -55,6 +83,7 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

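The payoff of `_mm256_fmadd_ps(f0, f1, t0)` over the separate multiply and add in the AVX kernels is that each lane computes the product and the accumulation with a single rounding step. The scalar equivalent of one lane, for illustration:

```cpp
#include <cmath>

// One lane of _mm256_fmadd_ps: a * b + c fused with a single rounding,
// slightly faster and slightly more accurate than mul followed by add.
float FmaddLane(float a, float b, float c) {
  return std::fma(a, b, c);
}
```

The two independent accumulators t0 and t1 in the loop also hide FMA latency: each FMA only depends on the previous value of its own accumulator, so consecutive iterations can overlap in the pipeline.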
64 changes: 63 additions & 1 deletion src/arch/dotproductsse.cpp
@@ -30,6 +30,66 @@ namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductSSE(const float *u, const float *v, int n) {
int max_offset = n - 4;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
// v, and multiplying them together in parallel.
__m128 sum = _mm_setzero_ps();
if (offset <= max_offset) {
offset = 4;
// Aligned load is reputedly faster but requires 16 byte aligned input.
if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
(reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// Use aligned load.
sum = _mm_load_ps(u);
__m128 floats2 = _mm_load_ps(v);
// Multiply.
sum = _mm_mul_ps(sum, floats2);
while (offset <= max_offset) {
__m128 floats1 = _mm_load_ps(u + offset);
floats2 = _mm_load_ps(v + offset);
floats1 = _mm_mul_ps(floats1, floats2);
sum = _mm_add_ps(sum, floats1);
offset += 4;
}
} else {
// Use unaligned load.
sum = _mm_loadu_ps(u);
__m128 floats2 = _mm_loadu_ps(v);
// Multiply.
sum = _mm_mul_ps(sum, floats2);
while (offset <= max_offset) {
__m128 floats1 = _mm_loadu_ps(u + offset);
floats2 = _mm_loadu_ps(v + offset);
floats1 = _mm_mul_ps(floats1, floats2);
sum = _mm_add_ps(sum, floats1);
offset += 4;
}
}
}
// Add the 4 sums in sum horizontally.
#if 0
alignas(32) float tmp[4];
_mm_store_ps(tmp, sum);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#else
__m128 zero = _mm_setzero_ps();
// https://www.felixcloutier.com/x86/haddps
sum = _mm_hadd_ps(sum, zero);
sum = _mm_hadd_ps(sum, zero);
// Extract the low result.
float result = _mm_cvtss_f32(sum);
#endif
// Add on any left-over products.
while (offset < n) {
result += u[offset] * v[offset];
++offset;
}
return result;
}
#else
double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
@@ -39,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
if (offset <= max_offset) {
offset = 2;
// Aligned load is reputedly faster but requires 16 byte aligned input.
-if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+    (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// Use aligned load.
sum = _mm_load_pd(u);
__m128d floats2 = _mm_load_pd(v);
@@ -78,6 +139,7 @@ double DotProductSSE(const double *u, const double *v, int n) {
}
return result;
}
#endif

} // namespace tesseract.

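The dotproduct_test registered in Makefile.am above presumably cross-checks these kernels against the native loop; its source is not part of this diff. A minimal harness in that spirit (hypothetical, assuming the non-FAST_FLOAT build where TFloat is double):

```cpp
#include <cassert>
#include <cmath>
#include <random>
#include <vector>

#include "dotproduct.h"  // DotProductNative, DotProductSSE

int main() {
  std::mt19937 rng(42);
  std::uniform_real_distribution<double> dist(-1.0, 1.0);
  // An odd length exercises both the vector loop and the scalar tail.
  std::vector<double> u(2003), v(2003);
  for (size_t i = 0; i < u.size(); ++i) {
    u[i] = dist(rng);
    v[i] = dist(rng);
  }
  int n = static_cast<int>(u.size());
  double ref = tesseract::DotProductNative(u.data(), v.data(), n);
  double sse = tesseract::DotProductSSE(u.data(), v.data(), n);
  // SIMD reassociates the sum, so compare with a tolerance, not equality.
  assert(std::fabs(ref - sse) < 1e-9 * n);
  return 0;
}
```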
2 changes: 1 addition & 1 deletion src/arch/intsimdmatrix.cpp
@@ -76,7 +76,7 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t>
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
-const std::vector<double> &scales, const int8_t *u, double *v) {
+const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// Base implementation.
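The comment above describes the bias trick: each weight row carries one extra element that is conceptually multiplied by a trailing 1 in u, so the bias rides along in the same dot product. A scalar sketch of what one output element computes (illustrative only, types simplified):

```cpp
#include <cstdint>

// One row of MatrixDotVector in scalar form: num_in products plus the last
// weight, which multiplies the implicit trailing 1 in u (the bias term).
void MatrixRowDotVector(const int8_t *w_row, int num_in, const int8_t *u,
                        double scale, double *v_out) {
  int total = 0;
  for (int i = 0; i < num_in; ++i) {
    total += w_row[i] * u[i];
  }
  total += w_row[num_in];  // times the implicit 1: the bias element
  *v_out = total * scale;  // rescale the integer sum back to floating point
}
```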