diff --git a/MKL_README.md b/MKL_README.md
index 80a31c9a406a..0f97416ac368 100644
--- a/MKL_README.md
+++ b/MKL_README.md
@@ -1,3 +1,22 @@
+# Full MKL Installation
+
+## Build/Install MXNet with a full MKL installation
+A full MKL installation enables MKL support for all operators under the linalg namespace.
+
+ 1. Download and install the latest full MKL version following the instructions on the [Intel website](https://software.intel.com/en-us/articles/intel-mkl-111-install-guide).
+
+ 2. Set USE_BLAS=mkl in make/config.mk
+
+    2.1 Set ADD_LDFLAGS=-L<path/to/mkl/lib> (e.g. ADD_LDFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib)
+
+    2.2 Set ADD_CFLAGS=-I<path/to/mkl/include> (e.g. ADD_CFLAGS=-I/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include)
+
+ 3. Run 'make -j $(nproc)'
+
+ 4. Navigate into the python directory
+
+ 5. Run 'sudo python setup.py install'
+
 # MKL2017 PLUGIN
 
 MKL2017 is an INTEL released library to accelerate Deep Neural Network (DNN) applications on Intel architecture.
diff --git a/make/config.mk b/make/config.mk
index 9f7564b88fc0..a322fee0629e 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -110,21 +110,13 @@ USE_LAPACK = 1
 # path to lapack library in case of a non-standard installation
 USE_LAPACK_PATH =
 
-# by default, disable lapack when using MKL
-# switch on when there is a full installation of MKL available (not just MKL2017/MKL_ML)
-ifeq ($(USE_BLAS), mkl)
-USE_LAPACK = 0
-endif
-
 # add path to intel library, you may need it for MKL, if you did not add the path
 # to environment variable
 USE_INTEL_PATH = NONE
 
 # If use MKL only for BLAS, choose static link automatically to allow python wrapper
-ifeq ($(USE_MKL2017), 0)
 ifeq ($(USE_BLAS), mkl)
 USE_STATIC_MKL = 1
-endif
 else
 USE_STATIC_MKL = NONE
 endif
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index b3e6573f789e..b2a672ffd9e8 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -69,14 +69,14 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu,
 
-#define LINALG_CPU_BATCH_GEMM(DType) \
+#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
 template<> inline \
-void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
-                                   const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
-                                   bool tA, bool tB, Stream<cpu> *s) { \
+void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+                                   const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
+                                   bool tA, bool tB, Stream<xpu> *s) { \
   linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
   for (index_t i = 0; i < A.size(0); ++i) { \
-    linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB); \
+    linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB, s); \
   } \
 }
@@ -90,11 +90,11 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu,
 
-#define LINALG_CPU_BATCH_GEMM(DType) \
+#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
 template<> inline \
-void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
-                                   const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
-                                   bool tA, bool tB, Stream<cpu> *s) { \
+void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+                                   const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
+                                   bool tA, bool tB, Stream<xpu> *s) { \
   LOG(FATAL) << "linalg_batch_gemm not implemented by mxnet for cpu, needs cblas!"; \
 }
@@ -103,8 +103,8 @@ void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
 LINALG_CPU_GEMM(sgemm, float)
 LINALG_CPU_GEMM(dgemm, double)
 
-LINALG_CPU_BATCH_GEMM(float)
-LINALG_CPU_BATCH_GEMM(double)
+LINALG_XPU_BATCH_GEMM(cpu, float)
+LINALG_XPU_BATCH_GEMM(cpu, double)
 
 // Specialization of linalg_gemm for DType=mshadow::half::half_t.
 template<> inline
@@ -119,13 +119,6 @@ void linalg_gemm<cpu, mshadow::half::half_t>(const Tensor<cpu, 2, mshadow::half
 
 #ifdef __CUDACC__
 
-template<typename DType>
-__global__ void linalgCollectBatchOffsetsGPU(DType *a[], DType* b, int stride, int N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
-    a[i] = b + i * stride;
-  }
-}
-
 // cublas col-major processing accounted for by switching first two operands
 
 #define LINALG_GPU_GEMM(fname, DType) \
@@ -195,43 +188,36 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half
 #endif  // CUDA_VERSION >= 7050
 }
-
+// As of cuda8, cublas has implemented a strided version of batch gemm.
+#if CUDA_VERSION < 8000
+  LINALG_XPU_BATCH_GEMM(gpu, float)
+  LINALG_XPU_BATCH_GEMM(gpu, double)
+#else
 #define LINALG_GPU_BATCH_GEMM(fname, DType) \
-template<> inline \
-void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
-                                   const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
-                                   bool tA, bool tB, Stream<gpu> *s) { \
-  using namespace mxnet; \
-  using mshadow::gpu; \
-  CHECK_NOTNULL(s); \
-  linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
-  check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
-  Storage::Handle offsetsA, offsetsB, offsetsC; \
-  offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
-  offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
-  offsetsC = Storage::Get()->Alloc(sizeof(DType*)*C.size(0), Context::GPU()); \
-  using namespace mshadow::cuda; \
-  int ngrid = std::min(kMaxGridNum, \
-                       static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
-  linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
-    (static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
-  linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
-    (static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, B.size(0)); \
-  linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
-    (static_cast<DType **>(offsetsC.dptr), C.dptr_, C.size(1)*C.stride_, C.size(0)); \
-  CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
-                            (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
-                            (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
-                            C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
-                            &alpha, static_cast<const DType **>(offsetsB.dptr), B.stride_, \
-                            static_cast<const DType **>(offsetsA.dptr), A.stride_, \
-                            &beta, static_cast<DType **>(offsetsC.dptr), C.stride_, A.size(0))) \
-  Storage::Get()->Free(offsetsA); \
-  Storage::Get()->Free(offsetsB); \
-  Storage::Get()->Free(offsetsC); \
-}
-LINALG_GPU_BATCH_GEMM(SgemmBatched, float)
-LINALG_GPU_BATCH_GEMM(DgemmBatched, double)
+  template<> inline \
+  void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, \
+                                     const Tensor<gpu, 3, DType>& B, \
+                                     const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
+                                     bool tA, bool tB, Stream<gpu> *s) { \
+    using namespace mxnet; \
+    using mshadow::gpu; \
+    CHECK_NOTNULL(s); \
+    linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
+    check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
+    using namespace mshadow::cuda; \
+    CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+                              (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
+                              (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
+                              C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
+                              &alpha, B.dptr_, B.stride_, B.size(1) * B.stride_, \
+                              A.dptr_, A.stride_, A.size(1) * A.stride_, \
+                              &beta, C.dptr_, C.stride_, C.size(1) * C.stride_, A.size(0))) \
+  }
+
+  LINALG_GPU_BATCH_GEMM(SgemmStridedBatched, float)
+  LINALG_GPU_BATCH_GEMM(DgemmStridedBatched, double)
+
+#endif  // CUDA < 8000
 
 #endif  // __CUDACC__
@@ -266,13 +252,13 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu,
 
-#define LINALG_CPU_BATCH_TRSM(DType) \
+#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
 template<> inline \
-void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
-                   DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
+void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+                   DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
   linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
   for (index_t i = 0; i < A.size(0); ++i) { \
-    linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose); \
+    linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose, s); \
   } \
 }
@@ -285,10 +271,10 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu,
 
-#define LINALG_CPU_BATCH_TRSM(DType) \
+#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
 template<> inline \
-void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
-                   DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
+void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+                   DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
   LOG(FATAL) << "linalg_batch_trsm not implemented, needs cblas!"; \
 }
@@ -297,8 +283,8 @@ void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
 LINALG_CPU_TRSM(strsm, float)
 LINALG_CPU_TRSM(dtrsm, double)
 
-LINALG_CPU_BATCH_TRSM(float)
-LINALG_CPU_BATCH_TRSM(double)
+LINALG_XPU_BATCH_TRSM(cpu, float)
+LINALG_XPU_BATCH_TRSM(cpu, double)
 
 #ifdef __CUDACC__
@@ -322,37 +308,8 @@ void linalg_trsm<gpu, DType>(const Tensor<gpu, 2, DType>& A, const Tensor<gpu,
 
-#define LINALG_GPU_BATCH_TRSM(fname, DType) \
-template<> inline \
-void linalg_batch_trsm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
-                   DType alpha, bool rightside, bool lower, bool transpose, Stream<gpu> *s) { \
-  using namespace mxnet; \
-  using mshadow::gpu; \
-  CHECK_NOTNULL(s); \
-  linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
-  check_trsm(A[0], B[0], alpha, rightside, lower, transpose); \
-  Storage::Handle offsetsA, offsetsB; \
-  offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
-  offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
-  using namespace mshadow::cuda; \
-  int ngrid = std::min(kMaxGridNum, \
-                       static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
-  linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
-    (static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
-  linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
-    (static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, A.size(0)); \
-  CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
-                            (rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \
-                            (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \
-                            (transpose ? CUBLAS_OP_T : CUBLAS_OP_N), \
-                            CUBLAS_DIAG_NON_UNIT, B.size(2), B.size(1), &alpha, \
-                            static_cast<const DType **>(offsetsA.dptr), A.stride_, \
-                            static_cast<DType **>(offsetsB.dptr), B.stride_, A.size(0))); \
-  Storage::Get()->Free(offsetsA); \
-  Storage::Get()->Free(offsetsB); \
-}
-LINALG_GPU_BATCH_TRSM(StrsmBatched, float)
-LINALG_GPU_BATCH_TRSM(DtrsmBatched, double)
+LINALG_XPU_BATCH_TRSM(gpu, float)
+LINALG_XPU_BATCH_TRSM(gpu, double)
 
 #endif  // __CUDACC__
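
Note on the unified LINALG_XPU_BATCH_GEMM / LINALG_XPU_BATCH_TRSM macros: they express the same per-slice loop for any device without a native batched kernel (cpu always, gpu only when CUDA_VERSION < 8000), now forwarding the stream to the single-matrix call. The sketch below is illustrative only; it uses plain templates instead of the patch's macros, and the names (Tensor3, gemm_one) are hypothetical.

```cpp
#include <cstddef>

// Illustrative fallback: run one 2-D gemm per batch element of 3-D tensors,
// forwarding the device stream so every per-slice call stays on the same queue.
// Tensor3 is assumed to expose size(0) and operator[] returning a 2-D slice.
template <typename Tensor3, typename DType, typename Stream, typename GemmFn>
void batch_gemm_fallback(const Tensor3& A, const Tensor3& B, const Tensor3& C,
                         DType alpha, DType beta, bool tA, bool tB,
                         Stream* s, GemmFn gemm_one) {
  for (std::size_t i = 0; i < A.size(0); ++i) {
    gemm_one(A[i], B[i], C[i], alpha, beta, tA, tB, s);  // one gemm per slice
  }
}
```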
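Note on the CUDA >= 8 path: cublas<t>gemmStridedBatched addresses the per-batch matrices by a constant element stride, so the old pattern of allocating device pointer arrays and filling them with the linalgCollectBatchOffsetsGPU kernel is no longer needed. Below is a minimal standalone sketch of that call pattern for contiguous row-major batches, mirroring the operand swap used in the macro; it is not MXNet code, and the sizes are placeholders chosen only for illustration.

```cpp
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  // Batch of small row-major matrices: C[i] (m x n) = A[i] (m x k) * B[i] (k x n).
  const int batch = 4, m = 2, k = 3, n = 2;
  std::vector<float> hA(batch * m * k, 1.0f), hB(batch * k * n, 1.0f), hC(batch * m * n, 0.0f);

  float *dA, *dB, *dC;
  cudaMalloc(reinterpret_cast<void**>(&dA), hA.size() * sizeof(float));
  cudaMalloc(reinterpret_cast<void**>(&dB), hB.size() * sizeof(float));
  cudaMalloc(reinterpret_cast<void**>(&dC), hC.size() * sizeof(float));
  cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.0f, beta = 0.0f;
  // Row-major C = A * B is computed as column-major C^T = B^T * A^T, which is
  // why B is passed first; each stride is the element count of one matrix.
  cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                            n, m, k,
                            &alpha,
                            dB, n, static_cast<long long>(k) * n,
                            dA, k, static_cast<long long>(m) * k,
                            &beta,
                            dC, n, static_cast<long long>(m) * n,
                            batch);

  cudaMemcpy(hC.data(), dC, hC.size() * sizeof(float), cudaMemcpyDeviceToHost);
  printf("C[0](0,0) = %f (expected %d with all-ones inputs)\n", hC[0], k);

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  return 0;
}
```

One strided call replaces the three offset-collection kernel launches plus the temporary Storage allocations of the removed gpu macro, which is the main motivation for guarding the old fallback behind CUDA_VERSION < 8000.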