diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index a088de716b2ed..72948c74d7877 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -56,9 +56,9 @@ class MatMulNBits final : public OpKernel {
   const size_t nbits_;
   const int64_t accuracy_level_;
   const bool column_wise_quant_{true};
-#ifdef ORT_NEURAL_SPEED
   IAllocatorUniquePtr<void> packed_b_;
   size_t packed_b_size_{0};
+#ifdef ORT_NEURAL_SPEED
   bool is_asym_{false};
   bool all_constant_{false};
 #endif
@@ -113,7 +113,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
     is_packed = true;
   }

-#else  // defined(MLAS_JBLAS)
+#else  // defined(ORT_NEURAL_SPEED)

   if (input_idx == 1) {
     packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_);
@@ -128,7 +128,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
     is_packed = true;
   }

-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)

   return Status::OK();
 }
@@ -151,14 +151,14 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prep
     packed_b_ = std::move(prepacked_buffers[0]);
   }

-#else  // defined(MLAS_JBLAS)
+#else  // defined(ORT_NEURAL_SPEED)

   if (input_idx == 1) {
     used_shared_buffers = true;
     packed_b_ = std::move(prepacked_buffers[0]);
   }

-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)

   return Status::OK();
 }
@@ -204,7 +204,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
     return Status::OK();
   }

-#endif  // defined(MLAS_JBLAS)
+#endif  // defined(ORT_NEURAL_SPEED)

   const Tensor* scales = ctx->Input<Tensor>(2);
   const Tensor* zero_points = ctx->Input<Tensor>(3);
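The retained branch above is the plain MLAS path: at PrePack time the kernel asks MlasSQNBitGemmPackQuantBDataSize how large the packed weight blob must be, allocates packed_b_, and fills it. A minimal sketch of that flow follows; the full MlasSQNBitGemmPackQuantBData parameter list is an assumption inferred from the declaration context in mlas_qnbit.h (shown further below), and std::malloc stands in for the kernel's allocator.

// Sketch (not part of the diff): the non-ORT_NEURAL_SPEED prepack flow for input 1 (B).
// Assumes MlasSQNBitGemmPackQuantBData takes (N, K, BlkBitWidth, BlkLen, QuantBData,
// PackedQuantBData, ThreadPool); std::malloc is a stand-in for the kernel's IAllocator.
#include <cstdlib>
#include "mlas_qnbit.h"

void PrePackBSketch(size_t N, size_t K, size_t nbits, size_t block_size, const void* quant_b_data) {
  // Ask MLAS how large the packed buffer must be; 0 means packing is unsupported.
  const size_t packed_b_size = MlasSQNBitGemmPackQuantBDataSize(N, K, nbits, block_size);
  if (packed_b_size == 0) {
    return;  // fall back to the unpacked path
  }
  // Allocate and fill the packed blob once, at PrePack time.
  void* packed_b = std::malloc(packed_b_size);
  MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, quant_b_data, packed_b, nullptr);
  std::free(packed_b);  // the real kernel keeps this buffer alive in packed_b_
}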
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
index 0762014375030..d73c4476cadd3 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
@@ -323,7 +323,7 @@ static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym,
   }
   // from low precision to high precision
   switch (CompType) {
-    case CompInt8:
+    case NSCompInt8:
      if (!isAsym) {  // asym int8 is not optimized, so fall through to others.
        if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
          return NSQ4BuSize>(BlkSize, N, K, isAsym);
@@ -336,10 +336,10 @@ static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym,
        }
      }
      [[fallthrough]];
-    case CompBf16:
-    case CompFp16:
-    case CompFp32:
-    case CompUndef:
+    case NSCompBf16:
+    case NSCompFp16:
+    case NSCompFp32:
+    case NSCompUndef:
      if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
        return NSQ4BuSize>(BlkSize, N, K, isAsym);
      }
@@ -358,7 +358,7 @@ static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Sc
   GetCPUDevice();
   // explicit statement fall through.
   switch (CompType) {
-    case CompInt8:
+    case NSCompInt8:
      if (!isAsym) {  // asym int8 is not optimized, so fall through to others.
        if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
          NSQ4GemmPackBImpl>(
@@ -377,10 +377,10 @@ static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Sc
        }
      }
      [[fallthrough]];
-    case CompBf16:
-    case CompFp16:
-    case CompFp32:
-    case CompUndef:
+    case NSCompBf16:
+    case NSCompFp16:
+    case NSCompFp32:
+    case NSCompUndef:
      if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
        NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall,
                           ldb, ThreadPool);
diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
index 0ffece2be77f2..ebcb3027a209f 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
+++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
@@ -22,11 +22,11 @@ Module Name:
  * @brief Define compute types of block quantization
  */
 enum NS_SQNBIT_COMPUTE_TYPE {
-  CompUndef = 0, /*!< undef */
-  CompFp32 = 1,  /*!< input fp32, accumulator fp32 */
-  CompFp16 = 2,  /*!< input fp16, accumulator fp16 */
-  CompBf16 = 3,  /*!< input bf16, accumulator fp32 */
-  CompInt8 = 4   /*!< input int8, accumulator int32 */
+  NSCompUndef = 0, /*!< undef */
+  NSCompFp32 = 1,  /*!< input fp32, accumulator fp32 */
+  NSCompFp16 = 2,  /*!< input fp16, accumulator fp16 */
+  NSCompBf16 = 3,  /*!< input bf16, accumulator fp32 */
+  NSCompInt8 = 4   /*!< input int8, accumulator int32 */
 };

 /**
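The NSComp* prefix keeps Neural Speed's compute-type enumerators from colliding with the identically named MLAS ones. A hedged sketch of how a caller might now pick a compute type from the MatMulNBits accuracy_level attribute; the level-to-type mapping is an illustrative assumption, not something this diff defines.

// Illustrative only: choose a Neural Speed compute type, falling back to full fp32
// compute. NSCompInt8 means int8 inputs with int32 accumulation, NSCompFp32 keeps
// input and accumulator in fp32 (see the enum comments above).
#include "neural_speed_gemm.h"  // the header edited above

NS_SQNBIT_COMPUTE_TYPE CompTypeFromAccuracyLevel(int64_t accuracy_level) {
  switch (accuracy_level) {
    case 4:  return NSCompInt8;   // assumed: the most aggressive level allows int8 compute
    case 3:  return NSCompBf16;
    case 2:  return NSCompFp16;
    case 1:  return NSCompFp32;
    default: return NSCompUndef;  // unset / unknown: let the backend decide
  }
}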
diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h
index bc0bfc92c85a0..047011e70bd4d 100644
--- a/onnxruntime/core/mlas/inc/mlas_qnbit.h
+++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -183,133 +183,3 @@ MlasSQNBitGemmPackQuantBData(
     void* PackedQuantBData,
     MLAS_THREADPOOL* ThreadPool = nullptr
 );
-
-/**
- * @brief Data parameters for NBits GEMM routine
- *        C = A * B
- *        A, C must be a float32 matrix
- *        B must be a packed nbits blob
- *        All except C are [in] parameters
- */
-struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS {
-    const float* A = nullptr; /**< address of A (float32 matrix)*/
-    const void* B = nullptr;  /**< address of B (packed nbits blob)*/
-    float* C = nullptr;       /**< address of result matrix */
-    size_t lda = 0;           /**< leading dimension of A */
-    size_t ldc = 0;           /**< leading dimension of C*/
-};
-
-/**
- * @brief Compute the byte size of the parameter combination
- *
- * @param N the number of columns of matrix B.
- * @param K the number of rows of matrix B.
- * @param block_size size of the block to quantize, elements from the same block share the same
- *        scale and zero point
- * @param nbits number of bits used for weight quantization
- * @param is_asym flag for asymmetric quantization
- * @param comp_type specify input data type and accumulator data type
- * @return size of the packing buffer, 0 if the operation is not yet supported.
- */
-size_t MLASCALL
-MlasNBitsGemmPackBSize(
-    size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type
-);
-
-/**
- * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers.
- *
- * @param PackedBuf packed data buffer
- * @param QData quantized data buffer
- * @param Scale scale pointer
- * @param Zp zero point pointer
- * @param N the number of columns of matrix B.
- * @param K the number of rows of matrix B.
- * @param ldb leading dimension of B
- * @param block_size size of the block to quantize, elements from the same block share the same
- *        scale and zero point
- * @param nbits number of bits used for weight quantization (default 4)
- * @param is_asym flag for asymmetric quantization
- * @param comp_type specify input data type and accumulator data type
- * @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
- *        one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
- *        they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
- *        inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
- *        (is_asym is false) and Zp(is_asym is true).
- * @param thread_pool
- */
-void MLASCALL
-MlasNBitsGemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t block_size,
-    int nbits,
-    bool is_asym,
-    bool last_call,
-    MLAS_SQNBIT_COMPUTE_TYPE comp_type,
-    MLAS_THREADPOOL* thread_pool
-);
-
-/**
- * @brief Unpack and dequantize to fp32
- *
- * @param FpData unpacked float32 data
- * @param PackedBuf quantized and packed data
- * @param N the number of columns of matrix B.
- * @param K the number of rows of matrix B.
- * @param ldb leading dimension of B
- * @param thread_pool
- */
-void MLASCALL
-MlasNBitsGemmUnPackB(
-    float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool
-);
-
-/**
- * @brief Get the workspace size required by computation.
- *
- * @param[in] M row size of matrix A and C
- * @param[in] N column size of matrix B and C
- * @param[in] K column size of matrix A and row size of matrix B
- * @param[in] BatchN number of batches
- * @param[inout] DataParams An array (size BatchN) of parameter blocks
- * @return Workspace size in bytes
- */
-size_t MLASCALL
-MlasSQNBitsGemmBatchPackedBWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-);
-
-/**
- * @brief Batched GEMM: C = A * B
- *        A, C must be a float32 matrix
- *        B must be a packed nbits blob
- *
- * @param[in] M row size of matrix A and C
- * @param[in] N column size of matrix B and C
- * @param[in] K column size of matrix A and row size of matrix B
- * @param[in] BatchN number of batches
- * @param[inout] DataParams An array (size BatchN) of parameter blocks
- * @param[in] WorkSpace temporary buffer
- * @param[in] ThreadPool
- * @return
- */
-void MLASCALL
-MlasSQNBitsGemmBatchPackedB(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    void* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool = nullptr
-);
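For orientation, the removed declarations treat B as an N-column, K-row weight matrix quantized in blocks of block_size elements, with one float scale per block and, when is_asym is true, one 4-bit zero point per block. A small self-contained sketch of the resulting buffer sizes for the 4-bit case follows (it matches the shapes the benchmark further down allocates); the packed-blob layout produced by MlasNBitsGemmPackB itself is opaque and not implied here.

// Buffer-size accounting for 4-bit block-wise quantized B (N columns, K rows,
// block_size elements per scale), derived from the parameter descriptions above.
#include <cstddef>

struct Q4BufferSizes {
  size_t quant_data_bytes;  // two 4-bit weights per byte
  size_t scale_count;       // one float scale per (column, block)
  size_t zero_point_bytes;  // two 4-bit zero points per byte, asymmetric only
};

inline Q4BufferSizes ComputeQ4BufferSizes(size_t N, size_t K, size_t block_size, bool is_asym) {
  const size_t blocks_per_col = (K + block_size - 1) / block_size;  // ceil(K / block_size)
  Q4BufferSizes sizes{};
  sizes.quant_data_bytes = N * ((K + 1) / 2);
  sizes.scale_count = N * blocks_per_col;
  sizes.zero_point_bytes = is_asym ? N * ((blocks_per_col + 1) / 2) : 0;
  return sizes;
}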
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
index 1b974f560e09a..0d8a5692359a6 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp
@@ -19,10 +19,6 @@ Module Name:

 #include <cassert>

-#ifdef MLAS_NEURAL_SPEED
-#include "bestla_gemm.h"
-#endif
-
 namespace
 {
@@ -694,127 +690,3 @@ MlasSQNBitGemmBatch(
         ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN);
     });
 }
-
-size_t MLASCALL
-MlasNBitsGemmPackBSize(
-    size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType
-)
-{
-#ifdef MLAS_NEURAL_SPEED
-    if (nbits == 4) {
-        auto jsize = BTLAQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType);
-        if (jsize) {
-            return jsize;
-        }
-    }
-#endif
-    (void)(N);
-    (void)(K);
-    (void)(BlkSize);
-    (void)(nbits);
-    (void)(isAsym);
-    (void)(CompType);
-    return 0;
-}
-
-void MLASCALL
-MlasNBitsGemmPackB(
-    void* PackedBuf,
-    const uint8_t* QData,
-    const float* Scale,
-    const uint8_t* Zp,
-    size_t N,
-    size_t K,
-    size_t ldb,
-    size_t BlkSize,
-    int nbits,
-    bool isAsym,
-    bool lastCall,
-    MLAS_SQNBIT_COMPUTE_TYPE CompType,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-#ifdef MLAS_NEURAL_SPEED
-    if (nbits == 4) {
-        if (BTLAQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) {
-            return;
-        }
-    }
-#endif
-    (void)(PackedBuf);
-    (void)(QData);
-    (void)(Scale);
-    (void)(Zp);
-    (void)(N);
-    (void)(K);
-    (void)(ldb);
-    (void)(BlkSize);
-    (void)(nbits);
-    (void)(isAsym);
-    (void)(lastCall);
-    (void)(CompType);
-    (void)(ThreadPool);
-}
-
-void MLASCALL
-MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool)
-{
-#ifdef MLAS_NEURAL_SPEED
-    if (BTLAQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) {
-        return;
-    }
-#endif
-    (void)(FpData);
-    (void)(PackedBuf);
-    (void)(N);
-    (void)(K);
-    (void)(ldb);
-    (void)(ThreadPool);
-}
-
-size_t MLASCALL
-MlasSQNBitsGemmBatchPackedBWorkspaceSize(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
-)
-{
-#ifdef MLAS_NEURAL_SPEED
-    return BTLASQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams);
-#endif
-    (void)(M);
-    (void)(N);
-    (void)(K);
-    (void)(BatchN);
-    (void)(DataParams);
-    return 0;
-}
-
-void MLASCALL
-MlasSQNBitsGemmBatchPackedB(
-    const size_t M,
-    const size_t N,
-    const size_t K,
-    const size_t BatchN,
-    const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
-    void* WorkSpace,
-    MLAS_THREADPOOL* ThreadPool
-)
-{
-    GetMlasPlatform();
-#ifdef MLAS_NEURAL_SPEED
-    if (BTLASQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast(WorkSpace), ThreadPool)) {
-        // PackedWeight is created by bestla
-        return;
-    }
-#endif
-    (void)(M);
-    (void)(K);
-    (void)(N);
-    (void)(BatchN);
-    (void)(DataParams);
-    (void)(WorkSpace);
-    (void)(ThreadPool);
-}
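The wrappers deleted above all follow one pattern: when MLAS_NEURAL_SPEED is compiled in, forward to the bestla ("BTLA") implementation and return on success; otherwise cast every parameter to void to silence unused-parameter warnings and report the operation as unsupported (size 0 or a no-op). A generic sketch of that guard-and-fallback shape, with a hypothetical backend entry point standing in for BTLAQ4GemmPackBSize:

// Guard-and-fallback sketch of the removed wrappers. BackendPackBSize is a hypothetical
// stand-in for the optional backend (e.g. BTLAQ4GemmPackBSize); callers treat a return of
// 0 as "not supported, use the generic path".
#include <cstddef>

#ifdef MLAS_NEURAL_SPEED
size_t BackendPackBSize(size_t N, size_t K, size_t BlkSize);  // provided by the backend
#endif

size_t OptionalPackBSize(size_t N, size_t K, size_t BlkSize) {
#ifdef MLAS_NEURAL_SPEED
    if (size_t size = BackendPackBSize(N, K, BlkSize)) {
        return size;
    }
#endif
    (void)(N);  // keeps the signature stable even when the backend is compiled out
    (void)(K);
    (void)(BlkSize);
    return 0;
}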
diff --git a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
index 0b9d552a6da3f..668d7a0611367 100644
--- a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp
@@ -112,64 +112,3 @@ static void SQNBitGemmArgs(benchmark::internal::Benchmark* b) {
 }

 BENCHMARK(SQNBITGEMM<4>)->Apply(SQNBitGemmArgs)->UseRealTime();
-
-#if defined(MLAS_JBLAS)
-
-#ifdef MLAS_NEURAL_SPEED
-void Q4GEMM_BTLA(benchmark::State& state, int block_size, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE cmp_type) {
-  if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!");
-  if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!");
-  if (state.range(2) <= 0) throw std::invalid_argument("K must greater than 0!");
-  if (state.range(3) <= 0) throw std::invalid_argument("Threads must greater than 0!");
-
-  const size_t M = static_cast(state.range(0));
-  const size_t N = static_cast(state.range(1));
-  const size_t K = static_cast(state.range(2));
-  const size_t threads = static_cast(state.range(3));
-  block_size = block_size == -1 ? static_cast(K) : block_size;
-  const size_t pack_b_size = MlasNBitsGemmPackBSize(N, K, block_size, 4, is_asym, cmp_type);
-
-  OrtThreadPoolParams tpo;
-  tpo.thread_pool_size = static_cast(threads);
-  tpo.auto_set_affinity = true;
-  std::unique_ptr tp(onnxruntime::concurrency::CreateThreadPool(
-      &onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP));
-
-  auto A1 = RandomVectorUniform(static_cast(M * K), -1.0f, 1.0f);
-  auto B1 = RandomVectorUniform(static_cast(N * K / 2), 0, 255);
-  auto blk_num = static_cast((K + block_size - 1) / block_size);
-  auto B_scale = RandomVectorUniform(static_cast(N * blk_num), 0.003f, 0.005f);
-  std::vector C1(static_cast(M * N));
-  auto B_zp = RandomVectorUniform(static_cast(N * blk_num / 2), 0, 255);
-
-  std::vector B1_packed(pack_b_size);
-  MlasNBitsGemmPackB(B1_packed.data(), B1.data(), B_scale.data(), is_asym ? B_zp.data() : nullptr, N, K, K, block_size,
-                     4, is_asym, true, cmp_type, tp.get());
-
-  MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS params1;
-  params1.A = A1.data();
-  params1.lda = K;
-  params1.C = C1.data();
-  params1.ldc = N;
-  params1.B = B1_packed.data();
-  std::vector workspace(static_cast(M <= 32 ? 32 : M) * K * 4);
-  MlasSQNBitsGemmBatchPackedB(M, N, K, 1, &params1, workspace.data(), tp.get());
-
-  for (auto _ : state) {
-    MlasSQNBitsGemmBatchPackedB(M, N, K, 1, &params1, workspace.data(), tp.get());
-  }
-}
-
-static void GemmSizeProducts(benchmark::internal::Benchmark* b) {
-  b->ArgNames({"M", "N", "K", "Threads"});
-  b->ArgsProduct({{1, 1024, 2048}, {4096, 11008}, {4096, 11008}, {8}});
-}
-
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4B32SymInt8, 32, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4B128SymInt8, 128, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4PerNSymInt8, -1, false, CompInt8)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4B32SymFp32, 32, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4B128SymFp32, 128, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4PerNSymFp32, -1, false, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-BENCHMARK_CAPTURE(Q4GEMM_BTLA, Q4B32AsymFp32, 32, true, CompFp32)->Apply(GemmSizeProducts)->UseRealTime();
-#endif