
Commit a683a56

Merge pull request #3549 from hedaoyuan/convolution
Use EigenBlasGemm to improve convolution computing performance in the ARMv7 environment.
hedaoyuan authored Aug 21, 2017
2 parents 0d9846f + 430e0e4 commit a683a56
Showing 9 changed files with 263 additions and 110 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
+option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)

# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
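The new option defaults to OFF. A typical way to turn the Eigen GEMM path on, for instance in an ARMv7 cross-build (the case this PR targets), might look like the following; this is an illustrative invocation and the toolchain-file path is hypothetical:

cmake .. -DCMAKE_TOOLCHAIN_FILE=/path/to/armv7-toolchain.cmake \
         -DUSE_EIGEN_FOR_BLAS=ON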
4 changes: 4 additions & 0 deletions cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)

+if(USE_EIGEN_FOR_BLAS)
+  add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
+
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
4 changes: 4 additions & 0 deletions paddle/function/CMakeLists.txt
@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
list(APPEND h_files Function.h)
list(APPEND cpp_files Function.cpp)
list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)

if(WITH_GPU)
file(GLOB cu_files . *OpGpu.cu)
1 change: 0 additions & 1 deletion paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */

#include "DepthwiseConvOp.h"
#include "ConvOp.h"
#include "GemmFunctor.h"

namespace paddle {

1 change: 0 additions & 1 deletion paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/BaseMatrix.h"

namespace paddle {
91 changes: 91 additions & 0 deletions paddle/function/EigenGemm.cpp
@@ -0,0 +1,91 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"

namespace paddle {

template <class T>
struct EigenBlasGemm {
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
                           Eigen::Aligned>
      Matrix;

  static void compute(const bool transA,
                      const bool transB,
                      const int M,
                      const int N,
                      const int K,
                      const T alpha,
                      const T* A,
                      const int lda,
                      const T* B,
                      const int ldb,
                      const T beta,
                      T* C,
                      const int ldc) {
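    // Note: Eigen::TensorMap assumes densely packed storage, so each leading
    // dimension must equal the mapped matrix's row width (checked below).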
    Eigen::array<int, 2> sizeA;
    if (transA) {
      sizeA[0] = K;
      sizeA[1] = M;
      CHECK_EQ(M, lda);
    } else {
      sizeA[0] = M;
      sizeA[1] = K;
      CHECK_EQ(K, lda);
    }
    Eigen::array<int, 2> sizeB;
    if (transB) {
      sizeB[0] = N;
      sizeB[1] = K;
      CHECK_EQ(K, ldb);
    } else {
      sizeB[0] = K;
      sizeB[1] = N;
      CHECK_EQ(N, ldb);
    }
    Eigen::array<int, 2> sizeC;
    sizeC[0] = M;
    sizeC[1] = N;
    CHECK_EQ(N, ldc);

    const Matrix a(const_cast<T*>(A), sizeA);
    const Matrix b(const_cast<T*>(B), sizeB);
    Matrix c(C, sizeC);

    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
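    // Contract over the shared K axis: K is axis 0 of `a` if transA (A stored
    // K x M), else axis 1; K is axis 1 of `b` if transB (B stored N x K),
    // else axis 0.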
    dims[0] = DimPair(1, 0);
    dims[0].first = transA ? 0 : 1;
    dims[0].second = transB ? 1 : 0;

    Eigen::DefaultDevice device;
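    // Fast paths for the common (alpha, beta) combinations assign or
    // accumulate the contraction directly, avoiding an extra scale/add pass.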
    if (alpha == T(1) && beta == T(0)) {
      c.device(device) = a.contract(b, dims);
    } else if (alpha == T(1) && beta == T(1)) {
      c.device(device) += a.contract(b, dims);
    } else {
      c.device(device) = alpha * a.contract(b, dims) + beta * c;
    }
  }
};

#ifdef PADDLE_TYPE_DOUBLE
template class EigenBlasGemm<double>;
#else
template class EigenBlasGemm<float>;
#endif

} // namespace paddle
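Aside: a minimal sketch of what this kernel computes, with hypothetical values; it assumes the EigenBlasGemm declaration is visible at the call site (e.g. via GemmFunctor.h) and a single-precision build:

// C(2x2) = A(2x3) * B(3x2); all buffers densely packed, row-major.
float A[6] = {1, 2, 3, 4, 5, 6};     // lda = 3
float B[6] = {7, 8, 9, 10, 11, 12};  // ldb = 2
float C[4] = {0, 0, 0, 0};           // ldc = 2
paddle::EigenBlasGemm<float>::compute(
    false, false, /*M=*/2, /*N=*/2, /*K=*/3,
    /*alpha=*/1.0f, A, /*lda=*/3, B, /*ldb=*/2, /*beta=*/0.0f, C, /*ldc=*/2);
// C now holds {58, 64, 139, 154}.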
82 changes: 39 additions & 43 deletions paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ class GemmConvFunction : public ConvFunctionBase {
}

Im2ColFunctor<kCFO, Device, real> im2col;
-GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
-gemm(CblasNoTrans,
-     CblasNoTrans,
-     M,
-     N,
-     K,
-     1.0f,
-     filterData + g * filterOffset,
-     K,
-     colData,
-     N,
-     beta,
-     outputData + g * outputOffset,
-     N);
+BlasGemm<Device, real>::compute(false,
+                                false,
+                                M,
+                                N,
+                                K,
+                                1.0f,
+                                filterData + g * filterOffset,
+                                K,
+                                colData,
+                                N,
+                                beta,
+                                outputData + g * outputOffset,
+                                N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ class GemmConvGradInputFunction : public ConvFunctionBase {
}

Col2ImFunctor<kCFO, Device, real> col2im;
-GemmFunctor<Device, real> gemm;
-
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@
colData = inputGrad + g * inputOffset;
scale = 1.0f;
}
-gemm(CblasTrans,
-     CblasNoTrans,
-     M,
-     N,
-     K,
-     1.0f,
-     filterData + g * filterOffset,
-     M,
-     outputGrad + g * outputOffset,
-     N,
-     scale,
-     colData,
-     N);
+BlasGemm<Device, real>::compute(true,
+                                false,
+                                M,
+                                N,
+                                K,
+                                1.0f,
+                                filterData + g * filterOffset,
+                                M,
+                                outputGrad + g * outputOffset,
+                                N,
+                                scale,
+                                colData,
+                                N);
if (needIm2col) {
col2im(inputGrad + g * inputOffset,
imShape,
@@ -299,7 +296,6 @@ class GemmConvGradFilterFunction : public ConvFunctionBase {
}

Im2ColFunctor<kCFO, Device, real> im2col;
-GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@
int M = outputChannels / groups_;
int K = outputHeight * outputWidth;
int N = inputChannels / groups_ * filterHeight * filterWidth;
-gemm(CblasNoTrans,
-     CblasTrans,
-     M,
-     N,
-     K,
-     1.0f,
-     outputGrad + g * outputOffset,
-     K,
-     colData,
-     K,
-     i == 0 ? beta : 1.0f,
-     filterGrad + g * filterOffset,
-     N);
+BlasGemm<Device, real>::compute(false,
+                                true,
+                                M,
+                                N,
+                                K,
+                                1.0f,
+                                outputGrad + g * outputOffset,
+                                K,
+                                colData,
+                                K,
+                                i == 0 ? beta : 1.0f,
+                                filterGrad + g * filterOffset,
+                                N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth;
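The refactor at all three call sites is mechanical: the CblasNoTrans/CblasTrans arguments become false/true booleans and the gemm functor object becomes a static BlasGemm call, while the im2col GEMM shapes are unchanged. For reference, a sketch of the forward-pass shape bookkeeping with illustrative (hypothetical) sizes:

// Per-group forward GEMM shapes; example numbers, not from this commit.
int outputChannels = 64, inputChannels = 32, groups = 1;
int filterHeight = 3, filterWidth = 3;
int outputHeight = 28, outputWidth = 28;

int M = outputChannels / groups;                              // 64
int N = outputHeight * outputWidth;                           // 784
int K = inputChannels / groups * filterHeight * filterWidth;  // 288
// filter block  : M x K (64 x 288)
// im2col buffer : K x N (288 x 784)
// output block  : M x N (64 x 784) = filter * im2col buffer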
90 changes: 90 additions & 0 deletions paddle/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "GemmFunctor.h"
#include "paddle/math/MathFunctions.h"

namespace paddle {

template <class T>
struct BlasGemm<DEVICE_TYPE_CPU, T> {
  static void compute(const bool transA,
                      const bool transB,
                      const int M,
                      const int N,
                      const int K,
                      const T alpha,
                      const T* A,
                      const int lda,
                      const T* B,
                      const int ldb,
                      const T beta,
                      T* C,
                      const int ldc) {
#ifdef PADDLE_USE_EIGEN_FOR_BLAS
    EigenBlasGemm<T>::compute(
        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#else
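    // Fallback: gemm<T> is the CBLAS-style wrapper from
    // paddle/math/MathFunctions.h (included above).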
    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
            transB == false ? CblasNoTrans : CblasTrans,
            M,
            N,
            K,
            alpha,
            A,
            lda,
            B,
            ldb,
            beta,
            C,
            ldc);
#endif
  }
};

template <class T>
struct BlasGemm<DEVICE_TYPE_GPU, T> {
  static void compute(const bool transA,
                      const bool transB,
                      const int M,
                      const int N,
                      const int K,
                      const T alpha,
                      const T* A,
                      const int lda,
                      const T* B,
                      const int ldb,
                      const T beta,
                      T* C,
                      const int ldc) {
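    // The hl_* GPU interface takes non-const pointers and HPPL_OP_N/HPPL_OP_T
    // transpose flags, hence the casts and flag translation below.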
    hl_matrix_mul((T*)A,
                  transA == false ? HPPL_OP_N : HPPL_OP_T,
                  (T*)B,
                  transB == false ? HPPL_OP_N : HPPL_OP_T,
                  C,
                  M,
                  N,
                  K,
                  alpha,
                  beta,
                  lda,
                  ldb,
                  ldc);
  }
};

template class BlasGemm<DEVICE_TYPE_CPU, real>;
template class BlasGemm<DEVICE_TYPE_GPU, real>;

} // namespace paddle
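A call site is then backend-agnostic. A minimal sketch computing y = W^T * x, with hypothetical values; it assumes the BlasGemm primary template is declared in GemmFunctor.h (that header's diff is not shown here) and a single-precision build (real == float):

#include "GemmFunctor.h"

// W stored row-major as 3x2 (K x M) and used transposed; x is 3x1; y is 2x1.
float W[6] = {1, 4, 2, 5, 3, 6};
float x[3] = {1, 1, 1};
float y[2] = {0, 0};
paddle::BlasGemm<paddle::DEVICE_TYPE_CPU, float>::compute(
    /*transA=*/true, /*transB=*/false, /*M=*/2, /*N=*/1, /*K=*/3,
    /*alpha=*/1.0f, W, /*lda=*/2, x, /*ldb=*/1, /*beta=*/0.0f, y, /*ldc=*/1);
// y now holds {6, 15}, i.e. the column sums of W.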
(Diff for 1 more changed file not loaded.)
