-
Notifications
You must be signed in to change notification settings - Fork 9.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added intsimdmatrix as a generic integer matrixdotvector function with AVX2 and SSE specializations
- Loading branch information
Ray Smith
committed
Sep 8, 2017
1 parent
ad74e8a
commit fc6a390
Showing
21 changed files
with
1,549 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/////////////////////////////////////////////////////////////////////// | ||
// File: intsimdmatrix.cpp | ||
// Description: Base class for 8-bit int SIMD matrix multipliers. | ||
// Author: Ray Smith | ||
// Created: Tue Aug 15 08:01:32 PST 2017 | ||
// | ||
// (C) Copyright 2017, Google Inc. | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
/////////////////////////////////////////////////////////////////////// | ||
|
||
#include "intsimdmatrix.h" | ||
#include "intsimdmatrixavx2.h" | ||
#include "intsimdmatrixsse.h" | ||
#include "simddetect.h" | ||
|
||
namespace tesseract { | ||
|
||
// Factory makes and returns an IntSimdMatrix (sub)class of the best | ||
// available type for the current architecture. | ||
/* static */ | ||
IntSimdMatrix* IntSimdMatrix::GetFastestMultiplier() { | ||
IntSimdMatrix* multiplier = nullptr; | ||
if (SIMDDetect::IsAVX2Available()) { | ||
multiplier = new IntSimdMatrixAVX2(); | ||
} else if (SIMDDetect::IsSSEAvailable()) { | ||
multiplier = new IntSimdMatrixSSE(); | ||
} else { | ||
// Default c++ implementation. | ||
multiplier = new IntSimdMatrix(); | ||
} | ||
return multiplier; | ||
} | ||
|
||
// Computes a reshaped copy of the weight matrix w. If there are no | ||
// partial_funcs_, it does nothing. | ||
void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t>& w) { | ||
if (partial_funcs_.empty()) return; | ||
int num_out = w.dim1(); | ||
int num_in = w.dim2() - 1; | ||
// The rounded-up sizes of the reshaped weight matrix, excluding biases. | ||
int rounded_num_in = Roundup(num_in, num_inputs_per_group_); | ||
int rounded_num_out = RoundOutputs(num_out); | ||
// Add the bias and compute the required size. | ||
shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0); | ||
int shaped_index = 0; | ||
int output = 0; | ||
// Each number of registers needs a different format! Iterates over the | ||
// different numbers of registers (each a power of 2). | ||
for (int num_registers = max_output_registers_; num_registers >= 1; | ||
num_registers /= 2) { | ||
// The number of outputs that we will generate with this many registers. | ||
int num_outputs_per_register_set = | ||
num_registers * num_outputs_per_register_; | ||
// Use the max number of registers until we have to go fewer. | ||
while (output + num_outputs_per_register_set <= rounded_num_out) { | ||
// Accumulating outputs in registers saves iterating over the inputs, so | ||
// we only have to do it once per output register set. | ||
for (int input = 0; input < num_in; input += num_inputs_per_group_) { | ||
// Iterate over the number of outputs in a register set. | ||
for (int j = 0; j < num_outputs_per_register_set; ++j) { | ||
// Inner-most loop corresponds to the number of inputs in an input | ||
// group. | ||
for (int i = 0; i < num_inputs_per_group_; ++i) { | ||
int8_t weight = 0; | ||
if (output + j < num_out && input + i < num_in) | ||
weight = w(output + j, input + i); | ||
shaped_w_[shaped_index++] = weight; | ||
} | ||
} | ||
} | ||
// Append the bias weights for the register set. | ||
for (int j = 0; j < num_outputs_per_register_set; ++j) { | ||
int8_t weight = 0; | ||
if (output + j < num_out) weight = w(output + j, num_in); | ||
shaped_w_[shaped_index++] = weight; | ||
} | ||
output += num_outputs_per_register_set; | ||
} | ||
} | ||
} | ||
|
||
// Computes matrix.vector v = Wu. | ||
// u is of size W.dim2() - 1 and the output v is of size W.dim1(). | ||
// u is imagined to have an extra element at the end with value 1, to | ||
// implement the bias, but it doesn't actually have it. | ||
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w, | ||
const GenericVector<double>& scales, | ||
const int8_t* u, double* v) const { | ||
int num_out = w.dim1(); | ||
int num_in = w.dim2() - 1; | ||
if (partial_funcs_.empty()) { | ||
// Base implementation. | ||
for (int i = 0; i < num_out; ++i) { | ||
const int8_t* wi = w[i]; | ||
int total = 0; | ||
for (int j = 0; j < num_in; ++j) total += wi[j] * u[j]; | ||
// Add in the bias and correct for integer values. | ||
v[i] = (static_cast<double>(total) / MAX_INT8 + wi[num_in]) * scales[i]; | ||
} | ||
} else { | ||
const int8_t* w_data = shaped_w_.data(); | ||
const double* scales_data = &scales[0]; | ||
// Each call to a partial_func_ produces group_size outputs, except the | ||
// last one, which can produce less. | ||
int group_size = num_outputs_per_register_ * max_output_registers_; | ||
int rounded_num_in = Roundup(num_in, num_inputs_per_group_); | ||
int rounded_num_out = RoundOutputs(num_out); | ||
int output = 0; | ||
for (auto fn : partial_funcs_) { | ||
// The amount of w_data consumed by each call to fn. | ||
int w_step = (rounded_num_in + 1) * group_size; | ||
// Run with this group size, until it would produce too much output, then | ||
// switch to a smaller size. | ||
for (; output + group_size <= rounded_num_out; output += group_size) { | ||
(*fn)(w_data, scales_data, u, rounded_num_in, num_out - output, v); | ||
w_data += w_step; | ||
scales_data += group_size; | ||
v += group_size; | ||
} | ||
group_size /= 2; | ||
} | ||
} | ||
} | ||
|
||
} // namespace tesseract |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/////////////////////////////////////////////////////////////////////// | ||
// File: intsimdmatrix.h | ||
// Description: Base class for 8-bit int SIMD matrix multipliers. | ||
// Author: Ray Smith | ||
// Created: Tue Aug 15 07:37:20 PST 2017 | ||
// | ||
// (C) Copyright 2017, Google Inc. | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
/////////////////////////////////////////////////////////////////////// | ||
|
||
#ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_ | ||
#define TESSERACT_ARCH_INTSIMDMATRIX_H_ | ||
|
||
#include <stdint.h> | ||
#include <vector> | ||
#include "genericvector.h" | ||
#include "matrix.h" | ||
|
||
namespace tesseract { | ||
|
||
// Base class for a SIMD function to multiply a matrix by a vector, with sources | ||
// of 8-bit signed integer, and result in a double, after appropriate scaling. | ||
// Assumes a specific method of multiplication that can be applied to any size | ||
// and number of SIMD registers as follows: | ||
// int32_t results are computed with num_outputs_per_register_ in each of | ||
// max_output_registers_ result registers, repeatedly until it would make too | ||
// many results, then the number of registers is halved, and so-on down to a | ||
// single result register. The last calculation only outputs the required number | ||
// of results instead of writing beyond the bounds. Eg: matrix has 75 outputs, | ||
// num_outputs_per_register_ = 4, and max_output_registers_ = 8, | ||
// Step 1: 8x4=32 results are computed, | ||
// Step 2: 8x4=32 again, total 64, | ||
// Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72, | ||
// Step 4: 1x3, total 75. | ||
// Each step above is computed using a PartialFunc, which runs over the input | ||
// vector once. The input is read one registerful of num_inputs_per_register_ | ||
// at a time (presumably 4x num_outputs_per_register_ since they are int8_t) | ||
// so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_. | ||
// Since it is slow (on Intel at least) to horizontally add in a register, | ||
// provision is made to process num_inputs_per_group_ inputs at a time, with | ||
// the group being replicated num_input_groups_ times and multiplied by a | ||
// num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix. | ||
// This is most convenient if num_inputs_per_group_ is 4, and the product | ||
// sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent | ||
// results in the process, but it doesn't have to be implemented that way. | ||
// The weights are re-ordered by Init() to be used sequentially by the above | ||
// algorithm, followed by the biases, so they can be added at the end. | ||
// The base class computes the base C++ implementation. | ||
// NOTE that, although the subclasses execute on different SIMD hardware, no | ||
// virtual methods are needed, as the constructor sets up everything that | ||
// is required to allow the base class implementation to do all the work. | ||
class IntSimdMatrix { | ||
public: | ||
// Constructor should set the data members to indicate the sizes. | ||
// NOTE: Base constructor public only for test purposes. | ||
IntSimdMatrix() | ||
: num_outputs_per_register_(1), | ||
max_output_registers_(1), | ||
num_inputs_per_register_(1), | ||
num_inputs_per_group_(1), | ||
num_input_groups_(1) {} | ||
|
||
// Factory makes and returns an IntSimdMatrix (sub)class of the best | ||
// available type for the current architecture. | ||
static IntSimdMatrix* GetFastestMultiplier(); | ||
|
||
// Computes a reshaped copy of the weight matrix w. If there are no | ||
// partial_funcs_, it does nothing. | ||
void Init(const GENERIC_2D_ARRAY<int8_t>& w); | ||
|
||
// Rounds the size up to a multiple of the input register size (in int8_t). | ||
int RoundInputs(int size) const { | ||
return Roundup(size, num_inputs_per_register_); | ||
} | ||
// Rounds the size up to a multiple of the output register size (in int32_t). | ||
int RoundOutputs(int size) const { | ||
return Roundup(size, num_outputs_per_register_); | ||
} | ||
|
||
// Computes matrix.vector v = Wu. | ||
// u is of size W.dim2() - 1 and the output v is of size W.dim1(). | ||
// u is imagined to have an extra element at the end with value 1, to | ||
// implement the bias, but it doesn't actually have it. | ||
// Computes the base C++ implementation, if there are no partial_funcs_. | ||
// NOTE: The size of the input vector (u) must be padded using | ||
// RoundInputs above. | ||
// The input will be over-read to the extent of the padding. There are no | ||
// alignment requirements. | ||
void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w, | ||
const GenericVector<double>& scales, const int8_t* u, | ||
double* v) const; | ||
|
||
protected: | ||
// Function to compute part of a matrix.vector multiplication. The weights | ||
// are in a very specific order (see above) in w, which is multiplied by | ||
// u of length num_in, to produce output v after scaling the integer results | ||
// by the corresponding member of scales. | ||
// The amount of w and scales consumed is fixed and not available to the | ||
// caller. The number of outputs written to v will be at most num_out. | ||
typedef void (*PartialFunc)(const int8_t* w, const double* scales, | ||
const int8_t* u, int num_in, int num_out, | ||
double* v); | ||
|
||
// Rounds the input up to a multiple of the given factor. | ||
static int Roundup(int input, int factor) { | ||
return (input + factor - 1) / factor * factor; | ||
} | ||
|
||
// Number of 32 bit outputs held in each register. | ||
int num_outputs_per_register_; | ||
// Maximum number of registers that we will use to hold outputs. | ||
int max_output_registers_; | ||
// Number of 8 bit inputs in the inputs register. | ||
int num_inputs_per_register_; | ||
// Number of inputs in each weight group. | ||
int num_inputs_per_group_; | ||
// Number of groups of inputs to be broadcast. | ||
int num_input_groups_; | ||
// The weights matrix reorganized in whatever way suits this instance. | ||
std::vector<int8_t> shaped_w_; | ||
// A series of functions to compute a partial result. | ||
std::vector<PartialFunc> partial_funcs_; | ||
}; | ||
|
||
} // namespace tesseract | ||
|
||
#endif // TESSERACT_ARCH_INTSIMDMATRIX_H_ |
Oops, something went wrong.
fc6a390
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove unittest/gunit.h and replace where it is used by unittest/include_gunit.h.
Update: Created PR #1116 for the above.
fc6a390
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Following three shell scripts are marked as temporary ...