Optimize performance by using inline function DotProduct
This improves performance for the "best" models because it
avoids function calls.

Signed-off-by: Stefan Weil <[email protected]>
stweil committed Nov 29, 2018
1 parent e161501 commit 3c047f0
Showing 2 changed files with 24 additions and 22 deletions.
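The commit body's claim is that the dot product is called once per output row of the weight matrix, so any per-call overhead is paid dim1() times per forward step; a file-local static inline function lets the compiler fold that call into the row loop. A minimal standalone sketch of the pattern (hypothetical names, not code from this commit):

#include <cstdio>

// Hypothetical sketch: per-call overhead of an out-of-line dot product
// multiplies by the number of matrix rows; a static inline helper can
// be folded into the row loop by the compiler.
static inline double Dot(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}

static void MatVec(const double* w, const double* u, double* v,
                   int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    v[i] = Dot(w + i * cols, u, cols);  // no call per row once inlined
}

int main() {
  const double w[4] = {1, 2, 3, 4};  // 2x2 row-major matrix
  const double u[2] = {10, 20};
  double v[2];
  MatVec(w, u, v, 2, 2);
  printf("%g %g\n", v[0], v[1]);  // prints: 50 110
  return 0;
}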
44 changes: 24 additions & 20 deletions src/lstm/weightmatrix.cpp
@@ -38,6 +38,29 @@ const int kAdamCorrectionIterations = 200000;
 // Epsilon in Adam to prevent division by zero.
 const double kAdamEpsilon = 1e-8;
 
+// Computes and returns the dot product of the two n-vectors u and v.
+static inline double DotProduct(const double* u, const double* v, int n) {
+  // Note: because the order of addition is different among the 3 DotProduct
+  // functions, the results can (and do) vary slightly (although they agree
+  // to within about 4e-15). This produces different results when running
+  // training, despite all random inputs being precisely equal.
+  // To get consistent results, use just one of these DotProduct functions.
+  // On a test multi-layer network, serial is 57% slower than sse, and avx
+  // is about 8% faster than sse. This suggests that the time is memory
+  // bandwidth constrained and could benefit from holding the reused vector
+  // in AVX registers.
+
+  if (SIMDDetect::IsAVXAvailable())
+    return DotProductAVX(u, v, n);
+
+  if (SIMDDetect::IsSSEAvailable())
+    return DotProductSSE(u, v, n);
+
+  double total = 0.0;
+  for (int k = 0; k < n; ++k) total += u[k] * v[k];
+  return total;
+}
+
 // Computes matrix.vector v = Wu.
 // u is of size W.dim2() - add_bias_fwd and the output v is of size
 // W.dim1() - skip_bias_back.
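The note inside the new DotProduct above says the three variants sum in different orders and agree only to about 4e-15. A minimal standalone demo (not part of this commit) that double addition is not associative, which is why the summation order matters:

#include <cstdio>

int main() {
  double a = 1e16, b = -1e16, c = 1.0;
  // Same operands, different grouping: c is lost to rounding when it
  // is added to the large value first.
  printf("(a + b) + c = %g\n", (a + b) + c);  // prints 1
  printf("a + (b + c) = %g\n", a + (b + c));  // prints 0
  return 0;
}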
@@ -54,7 +77,7 @@ static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
   int extent = w.dim2() - add_bias_fwd;
   for (int i = 0; i < num_results; ++i) {
     const double* wi = w[i];
-    double total = WeightMatrix::DotProduct(wi, u, extent);
+    double total = DotProduct(wi, u, extent);
     if (add_bias_fwd) total += wi[extent];  // The bias value.
     v[i] = total;
   }
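For context on the bias line in this hunk (existing behavior, not changed by this commit): when add_bias_fwd is set, each weight row carries one extra trailing element that acts as a bias, as if u had an implicit final entry of 1. A small standalone illustration:

#include <cstdio>

int main() {
  // One weight row: two ordinary weights plus a trailing bias.
  const double wi[3] = {2.0, 3.0, 0.5};
  const double u[2] = {1.0, 4.0};
  int extent = 2;  // w.dim2() - add_bias_fwd
  double total = 0.0;
  for (int k = 0; k < extent; ++k) total += wi[k] * u[k];
  total += wi[extent];  // the bias value, weighting an implicit 1
  printf("%g\n", total);  // 2*1 + 3*4 + 0.5 = 14.5
  return 0;
}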
@@ -389,25 +412,6 @@ void WeightMatrix::Debug2D(const char* msg) {
   histogram.print();
 }
 
-// Computes and returns the dot product of the two n-vectors u and v.
-/* static */
-double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
-  // Note: because the order of addition is different among the 3 DotProduct
-  // functions, the results can (and do) vary slightly (although they agree
-  // to within about 4e-15). This produces different results when running
-  // training, despite all random inputs being precisely equal.
-  // To get consistent results, use just one of these DotProduct functions.
-  // On a test multi-layer network, serial is 57% slower than sse, and avx
-  // is about 8% faster than sse. This suggests that the time is memory
-  // bandwidth constrained and could benefit from holding the reused vector
-  // in AVX registers.
-  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
-  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
-  double total = 0.0;
-  for (int k = 0; k < n; ++k) total += u[k] * v[k];
-  return total;
-}
-
 // Utility function converts an array of float to the corresponding array
 // of double.
 /* static */
2 changes: 0 additions & 2 deletions src/lstm/weightmatrix.h
@@ -152,8 +152,6 @@ class WeightMatrix {
 
   void Debug2D(const char* msg);
 
-  // Computes and returns the dot product of the two n-vectors u and v.
-  static double DotProduct(const double* u, const double* v, int n);
   // Utility function converts an array of float to the corresponding array
   // of double.
   static void FloatToDouble(const GENERIC_2D_ARRAY<float>& wf,
