
Use EigenBlasGemm to improve convolution computing performance in the ARMv7 environment. #3549

Merged: 5 commits on Aug 21, 2017
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
Contributor:

I suggest changing the default value of USE_EIGEN_FOR_BLAS to ON and running the unit tests once to see whether there are any problems. Once it is confirmed that everything passes, change the default back to OFF.

Contributor (Author):

I have tested both ON and OFF locally (x86, armv7, armv8). The CI only compiles, so even after changing the default it would not actually run the unit tests.

Contributor:

If USE_EIGEN_FOR_BLAS is set, TeamCity will also build the version that uses Eigen to compute GEMM, so the unit tests can be run there. Offline I only ran the single mobilenet model, so I am not sure the testing is comprehensive.
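
(For context on the flag under discussion: it is set at configure time, e.g. cmake -DUSE_EIGEN_FOR_BLAS=ON, and cmake/configure.cmake below turns it into the PADDLE_USE_EIGEN_FOR_BLAS compile definition that GemmFunctor.cpp branches on.)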


# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
4 changes: 4 additions & 0 deletions cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)

if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)

if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
4 changes: 4 additions & 0 deletions paddle/function/CMakeLists.txt
@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
list(APPEND h_files Function.h)
list(APPEND cpp_files Function.cpp)
list(APPEND cpp_files BufferArg.cpp)
list(APPEND cpp_files GemmFunctor.cpp)
if(USE_EIGEN_FOR_BLAS)
list(APPEND cpp_files EigenGemm.cpp)
endif(USE_EIGEN_FOR_BLAS)

if(WITH_GPU)
file(GLOB cu_files . *OpGpu.cu)
1 change: 0 additions & 1 deletion paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */

#include "DepthwiseConvOp.h"
#include "ConvOp.h"
#include "GemmFunctor.h"

namespace paddle {

1 change: 0 additions & 1 deletion paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/BaseMatrix.h"

namespace paddle {
91 changes: 91 additions & 0 deletions paddle/function/EigenGemm.cpp
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"

namespace paddle {

template <class T>
struct EigenBlasGemm {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
Eigen::Aligned>
Matrix;

static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
Eigen::array<int, 2> sizeA;
if (transA) {
sizeA[0] = K;
sizeA[1] = M;
CHECK_EQ(M, lda);
} else {
sizeA[0] = M;
sizeA[1] = K;
CHECK_EQ(K, lda);
}
Eigen::array<int, 2> sizeB;
if (transB) {
sizeB[0] = N;
sizeB[1] = K;
CHECK_EQ(K, ldb);
} else {
sizeB[0] = K;
sizeB[1] = N;
CHECK_EQ(N, ldb);
}
Eigen::array<int, 2> sizeC;
sizeC[0] = M;
sizeC[1] = N;
CHECK_EQ(N, ldc);

const Matrix a(const_cast<T*>(A), sizeA);
const Matrix b(const_cast<T*>(B), sizeB);
Matrix c(C, sizeC);

typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;

Eigen::DefaultDevice device;
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
}
}
};

#ifdef PADDLE_TYPE_DOUBLE
template class EigenBlasGemm<double>;
#else
template class EigenBlasGemm<float>;
#endif

} // namespace paddle
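
For readers new to Eigen's contraction API, here is a minimal usage sketch of the functor above (hypothetical sizes and values, not part of the PR). The CHECK_EQs above require densely packed row-major storage, so with transA == transB == false we must pass lda == K, ldb == N, ldc == N. In the real build the declaration below would come from a shared header (not shown in this diff); the example assumes a float build (PADDLE_TYPE_DOUBLE not defined) and links against EigenGemm.cpp for the explicit float instantiation.

#include <cstdio>

namespace paddle {
// Declaration mirroring the definition in EigenGemm.cpp above; in the real
// build this would come from the project's shared header instead.
template <class T>
struct EigenBlasGemm {
  static void compute(const bool transA, const bool transB,
                      const int M, const int N, const int K,
                      const T alpha, const T* A, const int lda,
                      const T* B, const int ldb,
                      const T beta, T* C, const int ldc);
};
}  // namespace paddle

int main() {
  // C[2x3] = 1.0 * A[2x4] * B[4x3] + 0.0 * C, all row-major and packed.
  float A[2 * 4] = {1, 2, 3, 4, 5, 6, 7, 8};
  float B[4 * 3] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
  float C[2 * 3] = {0};
  paddle::EigenBlasGemm<float>::compute(
      false, false, /*M=*/2, /*N=*/3, /*K=*/4, 1.0f,
      A, /*lda=*/4, B, /*ldb=*/3, 0.0f, C, /*ldc=*/3);
  for (int i = 0; i < 6; ++i) printf("%.0f ", C[i]);  // prints: 5 6 7 13 14 15
  printf("\n");
  return 0;
}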
82 changes: 39 additions & 43 deletions paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ class GemmConvFunction : public ConvFunctionBase {
}

Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
colData,
N,
beta,
outputData + g * outputOffset,
N);
BlasGemm<Device, real>::compute(false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
colData,
N,
beta,
outputData + g * outputOffset,
N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ class GemmConvGradInputFunction : public ConvFunctionBase {
}

Col2ImFunctor<kCFO, Device, real> col2im;
GemmFunctor<Device, real> gemm;

size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@
colData = inputGrad + g * inputOffset;
scale = 1.0f;
}
gemm(CblasTrans,
CblasNoTrans,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
M,
outputGrad + g * outputOffset,
N,
scale,
colData,
N);
BlasGemm<Device, real>::compute(true,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
M,
outputGrad + g * outputOffset,
N,
scale,
colData,
N);
if (needIm2col) {
col2im(inputGrad + g * inputOffset,
imShape,
@@ -299,7 +296,6 @@ class GemmConvGradFilterFunction : public ConvFunctionBase {
}

Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@
int M = outputChannels / groups_;
int K = outputHeight * outputWidth;
int N = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans,
CblasTrans,
M,
N,
K,
1.0f,
outputGrad + g * outputOffset,
K,
colData,
K,
i == 0 ? beta : 1.0f,
filterGrad + g * filterOffset,
N);
BlasGemm<Device, real>::compute(false,
true,
M,
N,
K,
1.0f,
outputGrad + g * outputOffset,
K,
colData,
K,
i == 0 ? beta : 1.0f,
filterGrad + g * filterOffset,
N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth;
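
All three call sites above share one im2col + GEMM shape convention, which is easy to mis-read in diff form. A small sanity-check sketch for the forward case (hypothetical layer sizes, not from the PR):

#include <cassert>

int main() {
  // A 3x3 convolution: 3 input channels, 8 output channels, 32x32 output, groups = 1.
  const int inputChannels = 3, outputChannels = 8;
  const int filterHeight = 3, filterWidth = 3;
  const int outputHeight = 32, outputWidth = 32;
  const int groups = 1;

  // Same formulas as in GemmConvFunction above:
  const int M = outputChannels / groups;                              // 8 filters
  const int N = outputHeight * outputWidth;                           // 1024 output pixels
  const int K = inputChannels / groups * filterHeight * filterWidth;  // 27 weights per filter

  // output[M x N] = filter[M x K] * colData[K x N]; with transA = transB = false,
  // the EigenBlasGemm checks lda == K, ldb == N, ldc == N all hold for packed data.
  assert(M == 8 && N == 1024 && K == 27);
  return 0;
}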
90 changes: 90 additions & 0 deletions paddle/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "GemmFunctor.h"
#include "paddle/math/MathFunctions.h"

namespace paddle {

template <class T>
struct BlasGemm<DEVICE_TYPE_CPU, T> {
static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
#ifdef PADDLE_USE_EIGEN_FOR_BLAS
EigenBlasGemm<T>::compute(
transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#else
gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans,
M,
N,
K,
alpha,
A,
lda,
B,
ldb,
beta,
C,
ldc);
#endif
}
};

template <class T>
struct BlasGemm<DEVICE_TYPE_GPU, T> {
static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
hl_matrix_mul((T*)A,
transA == false ? HPPL_OP_N : HPPL_OP_T,
(T*)B,
transB == false ? HPPL_OP_N : HPPL_OP_T,
C,
M,
N,
K,
alpha,
beta,
lda,
ldb,
ldc);
}
};

template class BlasGemm<DEVICE_TYPE_CPU, real>;
template class BlasGemm<DEVICE_TYPE_GPU, real>;

} // namespace paddle
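
This diff never shows GemmFunctor.h itself, but the two specializations above and the call sites in GemmConvOp.cpp imply a primary template roughly like the following sketch (inferred from usage, not the actual header; the DeviceType enum here is a stand-in for Paddle's real device enum):

namespace paddle {

enum DeviceType { DEVICE_TYPE_CPU, DEVICE_TYPE_GPU };  // stand-in for Paddle's enum

// C = alpha * op(A) * op(B) + beta * C, row-major, with BLAS-style leading
// dimensions; op(X) is X or X^T depending on the corresponding trans flag.
template <DeviceType Device, class T>
struct BlasGemm {
  static void compute(const bool transA, const bool transB,
                      const int M, const int N, const int K,
                      const T alpha, const T* A, const int lda,
                      const T* B, const int ldb,
                      const T beta, T* C, const int ldc);
};

}  // namespace paddle

Taking plain bool trans flags at this boundary, rather than CBLAS_TRANSPOSE, means callers such as GemmConvOp.cpp no longer depend on the CBLAS enums; each backend (CBLAS gemm, EigenBlasGemm, hl_matrix_mul) translates the flags into its own convention.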