
Commit

Batching improvements for GEMM/TRSM operators and full MKL usage docs. (apache#8846)

* Batching improvements for GEMM/TRSM operators and full MKL usage docs.

* Changed the GEMM operator to use cuBLAS's gemmStridedBatched implementation when the CUDA version is 8 or higher, and to fall back to manual batching otherwise (a standalone sketch of the two code paths follows this list).
* Changed the TRSM operator to stop using cuBLAS's batched TRSM, which is slower for large matrices, and to batch manually instead.
* Added instructions for using a full MKL installation instead of just MKL2017.
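
The gist of the GEMM change, as a minimal standalone sketch rather than the operator macros from this commit: on CUDA 8 and newer, a batch of equally sized matrices laid out at a fixed stride can be multiplied with a single cublasSgemmStridedBatched call, while older toolkits fall back to one plain cublasSgemm call per matrix. The matrix sizes, the row-major handling done in the real operator, and error checking are all simplified assumptions here.

```cpp
// Minimal sketch of the two batched-GEMM paths (not the MXNet operator code).
// Computes C[i] = alpha * A[i] * B[i] + beta * C[i] for a batch of column-major
// matrices stored contiguously, i.e. matrix i starts at base + i * stride.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <vector>

int main() {
  const int m = 4, n = 4, k = 4, batch = 8;
  const long long strideA = static_cast<long long>(m) * k;
  const long long strideB = static_cast<long long>(k) * n;
  const long long strideC = static_cast<long long>(m) * n;
  std::vector<float> hA(strideA * batch, 1.0f);
  std::vector<float> hB(strideB * batch, 1.0f);
  std::vector<float> hC(strideC * batch, 0.0f);

  float *dA, *dB, *dC;
  cudaMalloc(reinterpret_cast<void**>(&dA), sizeof(float) * hA.size());
  cudaMalloc(reinterpret_cast<void**>(&dB), sizeof(float) * hB.size());
  cudaMalloc(reinterpret_cast<void**>(&dC), sizeof(float) * hC.size());
  cudaMemcpy(dA, hA.data(), sizeof(float) * hA.size(), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), sizeof(float) * hB.size(), cudaMemcpyHostToDevice);
  cudaMemcpy(dC, hC.data(), sizeof(float) * hC.size(), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.0f, beta = 0.0f;

#if CUDA_VERSION >= 8000
  // One call covers the whole batch; the strides tell cuBLAS where matrix i lives.
  cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                            &alpha, dA, m, strideA, dB, k, strideB,
                            &beta, dC, m, strideC, batch);
#else
  // Manual batching: one ordinary GEMM per matrix in the batch.
  for (int i = 0; i < batch; ++i) {
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alpha, dA + i * strideA, m, dB + i * strideB, k,
                &beta, dC + i * strideC, m);
  }
#endif

  cudaMemcpy(hC.data(), dC, sizeof(float) * hC.size(), cudaMemcpyDeviceToHost);
  printf("C[0](0,0) = %f\n", hC[0]);  // 4.0 for these all-ones 4x4 inputs

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  return 0;
}
```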

meissnereric authored and Nan Zhu committed Jan 16, 2018
1 parent a4288fa commit a9c0b45
Showing 3 changed files with 70 additions and 102 deletions.
19 changes: 19 additions & 0 deletions MKL_README.md
@@ -1,3 +1,22 @@
# Full MKL Installation

## Build/Install MXNet with a full MKL installation:
Installing the full MKL library and building MXNet against it enables MKL support for all operators under the linalg namespace. A minimal link-check sketch follows the installation steps below.

1. Download and install the latest full MKL version following the instructions on the [Intel website](https://software.intel.com/en-us/articles/intel-mkl-111-install-guide).

2. Set USE_BLAS=mkl in make/config.mk

2.1 Set ADD_LDFLAGS=-L<path/to/mkl/lib/folder> (ex. ADD_LDFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib)

2.2 Set ADD_CFLAGS=-I<path/to/mkl/include/folder> (ex. ADD_CFLAGS=-I/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include)

3. Run 'make -j $(nproc)'

4. Navigate into the python directory

5. Run 'sudo python setup.py install'
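
As an aside (not part of the patch), here is a minimal C++ sketch for checking that the full MKL installation exposes LAPACK routines, which the linalg operators rely on and which the MKL2017/MKL_ML subset does not provide. The file name, sample matrix, and build line are illustrative assumptions; the exact link flags can differ per system (Intel's MKL Link Line Advisor generates them).

```cpp
// Minimal check that the full MKL installation provides LAPACK (here a Cholesky
// factorization via LAPACKE_dpotrf), which MKL2017/MKL_ML alone does not.
// Illustrative build line (adjust paths for your install):
//   g++ mkl_check.cpp -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -lmkl_rt -o mkl_check
#include <mkl_lapacke.h>
#include <cstdio>

int main() {
  // 2x2 symmetric positive-definite matrix, row-major.
  double a[4] = {4.0, 2.0,
                 2.0, 3.0};
  lapack_int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', 2, a, 2);
  if (info == 0) {
    printf("LAPACKE_dpotrf succeeded: L(0,0) = %f\n", a[0]);  // expect 2.0
  } else {
    printf("LAPACKE_dpotrf failed, info = %d\n", static_cast<int>(info));
  }
  return 0;
}
```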

# MKL2017 PLUGIN

MKL2017 is an Intel-released library to accelerate Deep Neural Network (DNN) applications on Intel architecture.
8 changes: 0 additions & 8 deletions make/config.mk
@@ -110,21 +110,13 @@ USE_LAPACK = 1
# path to lapack library in case of a non-standard installation
USE_LAPACK_PATH =

# by default, disable lapack when using MKL
# switch on when there is a full installation of MKL available (not just MKL2017/MKL_ML)
ifeq ($(USE_BLAS), mkl)
USE_LAPACK = 0
endif

# add path to intel library, you may need it for MKL, if you did not add the path
# to environment variable
USE_INTEL_PATH = NONE

# If use MKL only for BLAS, choose static link automatically to allow python wrapper
ifeq ($(USE_MKL2017), 0)
ifeq ($(USE_BLAS), mkl)
USE_STATIC_MKL = 1
endif
else
USE_STATIC_MKL = NONE
endif
145 changes: 51 additions & 94 deletions src/operator/linalg_impl.h
@@ -69,14 +69,14 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
A.dptr_, A.stride_, B.dptr_, B.stride_, beta, C.dptr_, C.stride_); \
}

#define LINALG_CPU_BATCH_GEMM(DType) \
#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
template<> inline \
void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<cpu> *s) { \
void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<xpu> *s) { \
linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
for (index_t i = 0; i < A.size(0); ++i) { \
linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB); \
linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB, s); \
} \
}

@@ -90,11 +90,11 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
LOG(FATAL) << "linalg_gemm (without req arg) not implemented by mxnet for cpu, needs cblas!"; \
}

#define LINALG_CPU_BATCH_GEMM(DType) \
#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
template<> inline \
void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<cpu> *s) { \
void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<xpu> *s) { \
LOG(FATAL) << "linalg_batch_gemm not implemented by mxnet for cpu, needs cblas!"; \
}

@@ -103,8 +103,8 @@ void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
LINALG_CPU_GEMM(sgemm, float)
LINALG_CPU_GEMM(dgemm, double)

LINALG_CPU_BATCH_GEMM(float)
LINALG_CPU_BATCH_GEMM(double)
LINALG_XPU_BATCH_GEMM(cpu, float)
LINALG_XPU_BATCH_GEMM(cpu, double)

// Specialization of linalg_gemm<cpu, DType> for DType=mshadow::half::half_t.
template<> inline
@@ -119,13 +119,6 @@ void linalg_gemm<cpu, mshadow::half::half_t>(const Tensor<cpu, 2, mshadow::half:

#ifdef __CUDACC__

template<typename DType>
__global__ void linalgCollectBatchOffsetsGPU(DType *a[], DType* b, int stride, int N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
a[i] = b + i * stride;
}
}

// cublas col-major processing accounted for by switching first two operands

#define LINALG_GPU_GEMM(fname, DType) \
@@ -195,43 +188,36 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
#endif // CUDA_VERSION >= 7050
}


// As of cuda8, cublas has implemented a strided version of batch gemm.
#if CUDA_VERSION < 8000
LINALG_XPU_BATCH_GEMM(gpu, float)
LINALG_XPU_BATCH_GEMM(gpu, double)
#else
#define LINALG_GPU_BATCH_GEMM(fname, DType) \
template<> inline \
void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<gpu> *s) { \
using namespace mxnet; \
using mshadow::gpu; \
CHECK_NOTNULL(s); \
linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
Storage::Handle offsetsA, offsetsB, offsetsC; \
offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
offsetsC = Storage::Get()->Alloc(sizeof(DType*)*C.size(0), Context::GPU()); \
using namespace mshadow::cuda; \
int ngrid = std::min(kMaxGridNum, \
static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, B.size(0)); \
linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType **>(offsetsC.dptr), C.dptr_, C.size(1)*C.stride_, C.size(0)); \
CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
(tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
(tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
&alpha, static_cast<const DType **>(offsetsB.dptr), B.stride_, \
static_cast<const DType **>(offsetsA.dptr), A.stride_, \
&beta, static_cast<DType **>(offsetsC.dptr), C.stride_, A.size(0))) \
Storage::Get()->Free(offsetsA); \
Storage::Get()->Free(offsetsB); \
Storage::Get()->Free(offsetsC); \
}
LINALG_GPU_BATCH_GEMM(SgemmBatched, float)
LINALG_GPU_BATCH_GEMM(DgemmBatched, double)
template<> inline \
void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, \
const Tensor<gpu, 3, DType>& B, \
const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
bool tA, bool tB, Stream<gpu> *s) { \
using namespace mxnet; \
using mshadow::gpu; \
CHECK_NOTNULL(s); \
linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
using namespace mshadow::cuda; \
CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
(tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
(tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
&alpha, B.dptr_, B.stride_, B.size(1) * B.stride_, \
A.dptr_, A.stride_, A.size(1) * A.stride_, \
&beta, C.dptr_, C.stride_, C.size(1) * C.stride_, A.size(0))) \
}

LINALG_GPU_BATCH_GEMM(SgemmStridedBatched, float)
LINALG_GPU_BATCH_GEMM(DgemmStridedBatched, double)

#endif // CUDA < 8000

#endif // __CUDACC__

@@ -266,13 +252,13 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
A.stride_, B.dptr_, B.stride_); \
}

#define LINALG_CPU_BATCH_TRSM(DType) \
#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
template<> inline \
void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
for (index_t i = 0; i < A.size(0); ++i) { \
linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose); \
linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose, s); \
} \
}

@@ -285,10 +271,10 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
LOG(FATAL) << "linalg_trsm not implemented, needs cblas!"; \
}

#define LINALG_CPU_BATCH_TRSM(DType) \
#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
template<> inline \
void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
LOG(FATAL) << "linalg_batch_trsm not implemented, needs cblas!"; \
}

@@ -297,8 +283,8 @@ void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
LINALG_CPU_TRSM(strsm, float)
LINALG_CPU_TRSM(dtrsm, double)

LINALG_CPU_BATCH_TRSM(float)
LINALG_CPU_BATCH_TRSM(double)
LINALG_XPU_BATCH_TRSM(cpu, float)
LINALG_XPU_BATCH_TRSM(cpu, double)

#ifdef __CUDACC__

@@ -322,37 +308,8 @@ void linalg_trsm<gpu, DType>(const Tensor<gpu, 2, DType>& A, const Tensor<gpu, 2
LINALG_GPU_TRSM(Strsm, float)
LINALG_GPU_TRSM(Dtrsm, double)

#define LINALG_GPU_BATCH_TRSM(fname, DType) \
template<> inline \
void linalg_batch_trsm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
DType alpha, bool rightside, bool lower, bool transpose, Stream<gpu> *s) { \
using namespace mxnet; \
using mshadow::gpu; \
CHECK_NOTNULL(s); \
linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
check_trsm(A[0], B[0], alpha, rightside, lower, transpose); \
Storage::Handle offsetsA, offsetsB; \
offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
using namespace mshadow::cuda; \
int ngrid = std::min(kMaxGridNum, \
static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
(static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, A.size(0)); \
CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
(rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \
(lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \
(transpose ? CUBLAS_OP_T : CUBLAS_OP_N), \
CUBLAS_DIAG_NON_UNIT, B.size(2), B.size(1), &alpha, \
static_cast<const DType **>(offsetsA.dptr), A.stride_, \
static_cast<DType **>(offsetsB.dptr), B.stride_, A.size(0))); \
Storage::Get()->Free(offsetsA); \
Storage::Get()->Free(offsetsB); \
}
LINALG_GPU_BATCH_TRSM(StrsmBatched, float)
LINALG_GPU_BATCH_TRSM(DtrsmBatched, double)
LINALG_XPU_BATCH_TRSM(gpu, float)
LINALG_XPU_BATCH_TRSM(gpu, double)

#endif // __CUDACC__
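
For context, the manual batching that linalg_batch_trsm now uses on the GPU amounts to one cublasStrsm call per matrix slice rather than a cublasStrsmBatched call. Below is a minimal standalone sketch of that pattern, not the operator code itself: column-major data, simplified sizes, no error checking, and the helper name batch_trsm_manual is hypothetical.

```cpp
// Minimal sketch of manual TRSM batching, the approach this commit switches the
// GPU path to: one cublasStrsm call per matrix slice instead of cublasStrsmBatched.
// Solves A[i] * X[i] = alpha * B[i] in place; column-major, lower-triangular A[i].
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <vector>

// Hypothetical helper, not from the patch.
void batch_trsm_manual(cublasHandle_t handle, float alpha,
                       const float* dA, float* dB, int n, int nrhs, int batch) {
  const long long strideA = static_cast<long long>(n) * n;
  const long long strideB = static_cast<long long>(n) * nrhs;
  for (int i = 0; i < batch; ++i) {
    cublasStrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs, &alpha,
                dA + i * strideA, n, dB + i * strideB, n);  // B[i] <- X[i]
  }
}

int main() {
  const int n = 3, nrhs = 2, batch = 4;
  // A[i] = 2*I (trivially lower triangular), B[i] = all twos, so X[i] = all ones.
  std::vector<float> hA(static_cast<size_t>(n) * n * batch, 0.0f);
  std::vector<float> hB(static_cast<size_t>(n) * nrhs * batch, 2.0f);
  for (int i = 0; i < batch; ++i)
    for (int d = 0; d < n; ++d) hA[i * n * n + d * n + d] = 2.0f;

  float *dA, *dB;
  cudaMalloc(reinterpret_cast<void**>(&dA), sizeof(float) * hA.size());
  cudaMalloc(reinterpret_cast<void**>(&dB), sizeof(float) * hB.size());
  cudaMemcpy(dA, hA.data(), sizeof(float) * hA.size(), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), sizeof(float) * hB.size(), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  batch_trsm_manual(handle, 1.0f, dA, dB, n, nrhs, batch);
  cudaMemcpy(hB.data(), dB, sizeof(float) * hB.size(), cudaMemcpyDeviceToHost);
  printf("X[0](0,0) = %f\n", hB[0]);  // expect 1.0

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dB);
  return 0;
}
```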

