Added intsimdmatrix as a generic integer matrixdotvector function with AVX2 and SSE specializations
tesseract-ocr · Sep 8, 2017 · fc6a390 · fc6a390 · Shreeshrii · Sep 10, 2017
1 parent ad74e8a
commit fc6a390
Show file tree

Hide file tree

Showing 21 changed files with 1,549 additions and 41 deletions.
diff --git a/api/Makefile.am b/api/Makefile.am
@@ -30,6 +30,7 @@ libtesseract_api_la_LIBADD = \
     ../dict/libtesseract_dict.la \
     ../arch/libtesseract_arch.la \
     ../arch/libtesseract_avx.la \
+    ../arch/libtesseract_avx2.la \
     ../arch/libtesseract_sse.la \
     ../lstm/libtesseract_lstm.la \
     ../ccstruct/libtesseract_ccstruct.la \
@@ -60,6 +61,7 @@ libtesseract_la_LIBADD = \
     ../dict/libtesseract_dict.la \
     ../arch/libtesseract_arch.la \
     ../arch/libtesseract_avx.la \
+    ../arch/libtesseract_avx2.la \
     ../arch/libtesseract_sse.la \
     ../lstm/libtesseract_lstm.la \
     ../ccstruct/libtesseract_ccstruct.la \

diff --git a/arch/Makefile.am b/arch/Makefile.am
@@ -1,4 +1,4 @@
-AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
+AM_CPPFLAGS += -I$(top_srcdir)/ccstruct -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
 AUTOMAKE_OPTIONS = subdir-objects
 SUBDIRS =
 AM_CXXFLAGS =
@@ -8,31 +8,37 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
 AM_CPPFLAGS += -DTESS_EXPORTS
 endif
 
-include_HEADERS = dotproductavx.h dotproductsse.h simddetect.h
+include_HEADERS = dotproductavx.h dotproductsse.h intsimdmatrix.h intsimdmatrixavx2.h intsimdmatrixsse.h simddetect.h
 
 noinst_HEADERS =
 
 if !USING_MULTIPLELIBS
-noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
+noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
 noinst_LTLIBRARIES += libtesseract_arch.la
 else
-lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
+lib_LTLIBRARIES = libtesseract_avx.la libtesseract_avx2.la libtesseract_sse.la
 lib_LTLIBRARIES += libtesseract_arch.la
 libtesseract_arch_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
+libtesseract_avx2_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 endif
 
 if AVX_OPT
 libtesseract_avx_la_CXXFLAGS = -mavx
 endif
+if AVX2_OPT
+libtesseract_avx2_la_CXXFLAGS = -mavx2
+endif
 if SSE41_OPT
 libtesseract_sse_la_CXXFLAGS = -msse4.1
 endif
 
-libtesseract_arch_la_SOURCES = simddetect.cpp
+libtesseract_arch_la_SOURCES = intsimdmatrix.cpp simddetect.cpp
 
 libtesseract_avx_la_SOURCES = dotproductavx.cpp
 
-libtesseract_sse_la_SOURCES = dotproductsse.cpp
+libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp
+
+libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp
 
diff --git a/arch/intsimdmatrix.cpp b/arch/intsimdmatrix.cpp
@@ -0,0 +1,133 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrix.cpp
+// Description: Base class for 8-bit int SIMD matrix multipliers.
+// Author:      Ray Smith
+// Created:     Tue Aug 15 08:01:32 PST 2017
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "intsimdmatrix.h"
+#include "intsimdmatrixavx2.h"
+#include "intsimdmatrixsse.h"
+#include "simddetect.h"
+
+namespace tesseract {
+
+// Factory: creates the fastest IntSimdMatrix (sub)class that the current
+// architecture supports. AVX2 is preferred, then SSE, and finally the plain
+// C++ base class. The result is allocated with new.
+/* static */
+IntSimdMatrix* IntSimdMatrix::GetFastestMultiplier() {
+  if (SIMDDetect::IsAVX2Available()) return new IntSimdMatrixAVX2();
+  if (SIMDDetect::IsSSEAvailable()) return new IntSimdMatrixSSE();
+  // No usable SIMD support was detected: use the default c++ implementation.
+  return new IntSimdMatrix();
+}
+
+// Fills shaped_w_ with a copy of the weight matrix w, re-ordered so that the
+// weights can be read sequentially while multiplying. Does nothing if there
+// are no partial_funcs_.
+// The last column of w holds the biases; they are stored after the weights
+// of each register set. Positions that fall outside w (padding of inputs or
+// outputs) are stored as 0.
+void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w) {
+  if (partial_funcs_.empty()) return;
+  const int num_out = w.dim1();
+  const int num_in = w.dim2() - 1;
+  // Padded sizes of the reshaped matrix, not counting the bias column.
+  const int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
+  const int rounded_num_out = RoundOutputs(num_out);
+  // One extra entry per output holds the bias.
+  shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0);
+  int dest = 0;      // Next position to write in shaped_w_.
+  int out_base = 0;  // First output of the register set being written.
+  // Every register count (each a power of 2) has its own layout, so work
+  // down from the maximum to a single register.
+  for (int registers = max_output_registers_; registers >= 1;
+       registers /= 2) {
+    // Number of outputs produced by one register set of this size.
+    const int set_size = registers * num_outputs_per_register_;
+    // Keep using this many registers for as long as a whole set still fits.
+    for (; out_base + set_size <= rounded_num_out; out_base += set_size) {
+      // The inputs are walked once per register set, one group at a time.
+      for (int in_base = 0; in_base < num_in;
+           in_base += num_inputs_per_group_) {
+        // For each output in the set, store its weights for this input group.
+        for (int j = 0; j < set_size; ++j) {
+          const int row = out_base + j;
+          for (int i = 0; i < num_inputs_per_group_; ++i) {
+            const int col = in_base + i;
+            const bool inside = row < num_out && col < num_in;
+            const int8_t weight = inside ? w(row, col) : static_cast<int8_t>(0);
+            shaped_w_[dest++] = weight;
+          }
+        }
+      }
+      // The biases of the register set follow its weights.
+      for (int j = 0; j < set_size; ++j) {
+        const int row = out_base + j;
+        const int8_t bias =
+            row < num_out ? w(row, num_in) : static_cast<int8_t>(0);
+        shaped_w_[dest++] = bias;
+      }
+    }
+  }
+}
+
+// Computes the matrix.vector product v = Wu.
+// u has W.dim2() - 1 elements and v receives W.dim1() results.
+// The bias is implemented as if u had one more element at the end with the
+// value 1, but u does not actually contain that element.
+// With no partial_funcs_ the plain C++ loop over w is used; otherwise the
+// partial_funcs_ are run over the reshaped weights in shaped_w_.
+void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
+                                    const GenericVector<double>& scales,
+                                    const int8_t* u, double* v) const {
+  const int num_out = w.dim1();
+  const int num_in = w.dim2() - 1;
+  if (partial_funcs_.empty()) {
+    // Base implementation.
+    for (int row = 0; row < num_out; ++row) {
+      const int8_t* weights = w[row];
+      int dot = 0;
+      for (int col = 0; col < num_in; ++col) dot += weights[col] * u[col];
+      // Correct for the integer values, add the bias, then scale.
+      v[row] = (static_cast<double>(dot) / MAX_INT8 + weights[num_in]) *
+               scales[row];
+    }
+    return;
+  }
+  const int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
+  const int rounded_num_out = RoundOutputs(num_out);
+  const int8_t* w_pos = shaped_w_.data();
+  const double* scales_pos = &scales[0];
+  // A call to a partial function yields group_size outputs, apart from the
+  // final call, which may yield fewer.
+  int group_size = num_outputs_per_register_ * max_output_registers_;
+  int done = 0;  // Number of (rounded) outputs already handled.
+  for (PartialFunc fn : partial_funcs_) {
+    // Amount of reshaped weight data that a single call to fn consumes.
+    const int w_step = (rounded_num_in + 1) * group_size;
+    // Use this group size while a whole group still fits in the rounded
+    // output count, then move on to the next function with half the size.
+    while (done + group_size <= rounded_num_out) {
+      (*fn)(w_pos, scales_pos, u, rounded_num_in, num_out - done, v);
+      w_pos += w_step;
+      scales_pos += group_size;
+      v += group_size;
+      done += group_size;
+    }
+    group_size /= 2;
+  }
+}
+
+}  // namespace tesseract
diff --git a/arch/intsimdmatrix.h b/arch/intsimdmatrix.h
@@ -0,0 +1,135 @@
+///////////////////////////////////////////////////////////////////////
+// File:        intsimdmatrix.h
+// Description: Base class for 8-bit int SIMD matrix multipliers.
+// Author:      Ray Smith
+// Created:     Tue Aug 15 07:37:20 PST 2017
+//
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
+#define TESSERACT_ARCH_INTSIMDMATRIX_H_
+
+#include <stdint.h>
+#include <vector>
+#include "genericvector.h"
+#include "matrix.h"
+
+namespace tesseract {
+
+// Base class for a SIMD function to multiply a matrix by a vector, with sources
+// of 8-bit signed integer, and result in a double, after appropriate scaling.
+// Assumes a specific method of multiplication that can be applied to any size
+// and number of SIMD registers as follows:
+// int32_t results are computed with num_outputs_per_register_ in each of
+// max_output_registers_ result registers, repeatedly until it would make too
+// many results, then the number of registers is halved, and so on down to a
+// single result register. The last calculation only outputs the required number
+// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
+//  num_outputs_per_register_ = 4, and max_output_registers_ = 8,
+// Step 1: 8x4=32 results are computed,
+// Step 2: 8x4=32 again, total 64,
+// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
+// Step 4: 1x3, total 75.
+// Each step above is computed using a PartialFunc, which runs over the input
+// vector once. The input is read one registerful of num_inputs_per_register_
+// at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
+// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
+// Since it is slow (on Intel at least) to horizontally add in a register,
+// provision is made to process num_inputs_per_group_ inputs at a time, with
+// the group being replicated num_input_groups_ times and multiplied by a
+// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
+// This is most convenient if num_inputs_per_group_ is 4, and the product
+// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
+// results in the process, but it doesn't have to be implemented that way.
+// The weights are re-ordered by Init() to be used sequentially by the above
+// algorithm, followed by the biases, so they can be added at the end.
+// The base class computes the base C++ implementation.
+// NOTE that, although the subclasses execute on different SIMD hardware, no
+// virtual methods other than the destructor are needed, as the constructor
+// sets up everything that is required to allow the base class implementation
+// to do all the work. The destructor is virtual because GetFastestMultiplier
+// hands out subclass objects through a base-class pointer, and deleting such
+// an object through that pointer is undefined behavior otherwise.
+class IntSimdMatrix {
+ public:
+  // Constructor should set the data members to indicate the sizes.
+  // NOTE: Base constructor public only for test purposes.
+  IntSimdMatrix()
+      : num_outputs_per_register_(1),
+        max_output_registers_(1),
+        num_inputs_per_register_(1),
+        num_inputs_per_group_(1),
+        num_input_groups_(1) {}
+
+  // Virtual so that an object created by GetFastestMultiplier can be safely
+  // deleted through an IntSimdMatrix pointer.
+  virtual ~IntSimdMatrix() = default;
+
+  // Factory makes and returns an IntSimdMatrix (sub)class of the best
+  // available type for the current architecture.
+  static IntSimdMatrix* GetFastestMultiplier();
+
+  // Computes a reshaped copy of the weight matrix w. If there are no
+  // partial_funcs_, it does nothing.
+  void Init(const GENERIC_2D_ARRAY<int8_t>& w);
+
+  // Rounds the size up to a multiple of the input register size (in int8_t).
+  int RoundInputs(int size) const {
+    return Roundup(size, num_inputs_per_register_);
+  }
+  // Rounds the size up to a multiple of the output register size (in int32_t).
+  int RoundOutputs(int size) const {
+    return Roundup(size, num_outputs_per_register_);
+  }
+
+  // Computes matrix.vector v = Wu.
+  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
+  // u is imagined to have an extra element at the end with value 1, to
+  // implement the bias, but it doesn't actually have it.
+  // Computes the base C++ implementation, if there are no partial_funcs_.
+  // NOTE: The size of the input vector (u) must be padded using
+  // RoundInputs above.
+  // The input will be over-read to the extent of the padding. There are no
+  // alignment requirements.
+  void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
+                       const GenericVector<double>& scales, const int8_t* u,
+                       double* v) const;
+
+ protected:
+  // Function to compute part of a matrix.vector multiplication. The weights
+  // are in a very specific order (see above) in w, which is multiplied by
+  // u of length num_in, to produce output v after scaling the integer results
+  // by the corresponding member of scales.
+  // The amount of w and scales consumed is fixed and not available to the
+  // caller. The number of outputs written to v will be at most num_out.
+  using PartialFunc = void (*)(const int8_t* w, const double* scales,
+                               const int8_t* u, int num_in, int num_out,
+                               double* v);
+
+  // Rounds the input up to a multiple of the given factor.
+  // The factor must be non-zero, since it is used as a divisor.
+  static int Roundup(int input, int factor) {
+    return (input + factor - 1) / factor * factor;
+  }
+
+  // Number of 32 bit outputs held in each register.
+  int num_outputs_per_register_;
+  // Maximum number of registers that we will use to hold outputs.
+  int max_output_registers_;
+  // Number of 8 bit inputs in the inputs register.
+  int num_inputs_per_register_;
+  // Number of inputs in each weight group.
+  int num_inputs_per_group_;
+  // Number of groups of inputs to be broadcast.
+  int num_input_groups_;
+  // The weights matrix reorganized in whatever way suits this instance.
+  std::vector<int8_t> shaped_w_;
+  // A series of functions to compute a partial result.
+  std::vector<PartialFunc> partial_funcs_;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_ARCH_INTSIMDMATRIX_H_