fix after rebase origin
luoyu-intel committed Jan 15, 2024
1 parent eb65b7a commit 65cbb81
Showing 6 changed files with 21 additions and 340 deletions.
12 changes: 6 additions & 6 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -56,9 +56,9 @@ class MatMulNBits final : public OpKernel {
const size_t nbits_;
const int64_t accuracy_level_;
const bool column_wise_quant_{true};
-#ifdef ORT_NEURAL_SPEED
IAllocatorUniquePtr<void> packed_b_;
size_t packed_b_size_{0};
+#ifdef ORT_NEURAL_SPEED
bool is_asym_{false};
bool all_constant_{false};
#endif
@@ -113,7 +113,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
is_packed = true;
}

-#else // defined(MLAS_JBLAS)
+#else // defined(ORT_NEURAL_SPEED)

if (input_idx == 1) {
packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_);
@@ -128,7 +128,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
is_packed = true;
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)

return Status::OK();
}
@@ -151,14 +151,14 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prep
packed_b_ = std::move(prepacked_buffers[0]);
}

-#else // defined(MLAS_JBLAS)
+#else // defined(ORT_NEURAL_SPEED)

if (input_idx == 1) {
used_shared_buffers = true;
packed_b_ = std::move(prepacked_buffers[0]);
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)
return Status::OK();
}

@@ -204,7 +204,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
return Status::OK();
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)

const Tensor* scales = ctx->Input<Tensor>(2);
const Tensor* zero_points = ctx->Input<Tensor>(3);
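
Context for the hunks above (not part of this commit): in the #else branch, PrePack sizes and fills packed_b_ with the MLAS packing helpers instead of the Neural Speed ones. The sketch below shows that flow in isolation; the full parameter list of MlasSQNBitGemmPackQuantBData is not visible in this diff, so the declarations here are assumptions, and a plain std::unique_ptr stands in for the IAllocatorUniquePtr used by the kernel.

#include <cstddef>
#include <memory>

// Assumed declarations, modelled on onnxruntime/core/mlas/inc/mlas_qnbit.h;
// the real header takes an MLAS_THREADPOOL* and is used together with an IAllocatorUniquePtr.
size_t MlasSQNBitGemmPackQuantBDataSize(size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen);
void MlasSQNBitGemmPackQuantBData(size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen,
                                  const void* QuantBData, void* PackedQuantBData,
                                  void* ThreadPool = nullptr);

// Sketch of the non-Neural-Speed PrePack branch for input_idx == 1 (the quantized B tensor).
bool PrePackQuantB(const void* quant_b_data, size_t N, size_t K, size_t nbits, size_t block_size,
                   std::unique_ptr<unsigned char[]>& packed_b, size_t& packed_b_size) {
  packed_b_size = MlasSQNBitGemmPackQuantBDataSize(N, K, nbits, block_size);
  if (packed_b_size == 0) {
    return false;  // packing not supported for this configuration; keep using the unpacked B
  }
  packed_b = std::make_unique<unsigned char[]>(packed_b_size);
  MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, quant_b_data, packed_b.get());
  return true;  // caller then sets is_packed = true
}
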
20 changes: 10 additions & 10 deletions onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
@@ -323,7 +323,7 @@ static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym,
}
// from low precision to high precision
switch (CompType) {
-case CompInt8:
+case NSCompInt8:
if (!isAsym) { // asym int8 is not optimized, so fall through to others.
if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
return NSQ4BuSize<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(BlkSize, N, K, isAsym);
@@ -336,10 +336,10 @@
}
}
[[fallthrough]];
-case CompBf16:
-case CompFp16:
-case CompFp32:
-case CompUndef:
+case NSCompBf16:
+case NSCompFp16:
+case NSCompFp32:
+case NSCompUndef:
if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
return NSQ4BuSize<tWeiNInt<tAVX512F, tAVX512F::ISA>>(BlkSize, N, K, isAsym);
}
@@ -358,7 +358,7 @@ static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Sc
GetCPUDevice();
// explicit statement fall through.
switch (CompType) {
-case CompInt8:
+case NSCompInt8:
if (!isAsym) { // asym int8 is not optimized, so fall through to others.
if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
NSQ4GemmPackBImpl<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(
@@ -377,10 +377,10 @@
}
}
[[fallthrough]];
-case CompBf16:
-case CompFp16:
-case CompFp32:
-case CompUndef:
+case NSCompBf16:
+case NSCompFp16:
+case NSCompFp32:
+case NSCompUndef:
if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
NSQ4GemmPackBImpl<tWeiNInt<tAVX512F, tAVX512F::ISA>>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym,
lastCall, ldb, ThreadPool);
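
Both switches above use the same dispatch idiom: start at the requested (lowest-precision) compute type and fall through to progressively more general kernels until one matches the CPU features and block size. A standalone sketch of that idiom follows; HasAmxInt8/HasAvx512f and the tile sizes 64/16 are placeholders for the bestla device queries and KTILE constants, not the real values.

#include <cstddef>
#include <cstdio>

enum NS_SQNBIT_COMPUTE_TYPE { NSCompUndef = 0, NSCompFp32, NSCompFp16, NSCompBf16, NSCompInt8 };

// Hypothetical stand-ins for the CPU-feature queries behind GetCPUDevice()/AMX_INT8()/AVX512F().
bool HasAmxInt8() { return false; }
bool HasAvx512f() { return true; }

// Pick a kernel the same way NSQ4GemmPackBSize/NSQ4GemmPackB do: prefer int8 tiles when
// requested and supported, otherwise fall through to the fp32 kernels.
const char* SelectKernel(NS_SQNBIT_COMPUTE_TYPE comp, size_t blk_size, bool is_asym) {
  switch (comp) {
    case NSCompInt8:
      if (!is_asym) {  // asym int8 path is not optimized, so fall through to the others
        if (HasAmxInt8() && blk_size % 64 == 0) return "AMX-INT8 kernel";
      }
      [[fallthrough]];
    case NSCompBf16:
    case NSCompFp16:
    case NSCompFp32:
    case NSCompUndef:
      if (HasAvx512f() && blk_size % 16 == 0) return "AVX512F kernel";
      return "reference kernel";
  }
  return "reference kernel";
}

int main() {
  std::printf("%s\n", SelectKernel(NSCompInt8, 32, /*is_asym=*/false));
}
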
10 changes: 5 additions & 5 deletions onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
@@ -22,11 +22,11 @@ Module Name:
* @brief Define compute types of block quantization
*/
enum NS_SQNBIT_COMPUTE_TYPE {
-CompUndef = 0, /*!< undef */
-CompFp32 = 1, /*!< input fp32, accumulator fp32 */
-CompFp16 = 2, /*!< input fp16, accumulator fp16 */
-CompBf16 = 3, /*!< input bf16, accumulator fp32 */
-CompInt8 = 4 /*!< input int8, accumulator int32 */
+NSCompUndef = 0, /*!< undef */
+NSCompFp32 = 1, /*!< input fp32, accumulator fp32 */
+NSCompFp16 = 2, /*!< input fp16, accumulator fp16 */
+NSCompBf16 = 3, /*!< input bf16, accumulator fp32 */
+NSCompInt8 = 4 /*!< input int8, accumulator int32 */
};

/**
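
The header change is a pure rename: the enumerators gain an NS prefix so they no longer collide with the MLAS compute-type names. Below is a minimal, self-contained sketch of a caller that maps an op-level accuracy knob to the renamed values; the accuracy-level mapping is illustrative and not taken from this commit.

#include <cstdint>

enum NS_SQNBIT_COMPUTE_TYPE {
  NSCompUndef = 0, /*!< undef */
  NSCompFp32 = 1,  /*!< input fp32, accumulator fp32 */
  NSCompFp16 = 2,  /*!< input fp16, accumulator fp16 */
  NSCompBf16 = 3,  /*!< input bf16, accumulator fp32 */
  NSCompInt8 = 4   /*!< input int8, accumulator int32 */
};

// Hypothetical mapping from an accuracy-level attribute to the renamed compute types.
NS_SQNBIT_COMPUTE_TYPE ComputeTypeFromAccuracyLevel(int64_t accuracy_level) {
  switch (accuracy_level) {
    case 4: return NSCompInt8;    // fastest: int8 inputs, int32 accumulation
    case 3: return NSCompBf16;
    case 2: return NSCompFp16;
    case 1: return NSCompFp32;    // most accurate quantized path
    default: return NSCompUndef;  // let the backend decide
  }
}

int main() { return ComputeTypeFromAccuracyLevel(4) == NSCompInt8 ? 0 : 1; }
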
130 changes: 0 additions & 130 deletions onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -183,133 +183,3 @@ MlasSQNBitGemmPackQuantBData(
void* PackedQuantBData,
MLAS_THREADPOOL* ThreadPool = nullptr
);

/**
* @brief Data parameters for NBits GEMM routine
* C = A * B
* A, C must be a float32 matrix
* B must be a packed nbits blob
* All except C are [in] parameters
*/
struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS {
const float* A = nullptr; /**< address of A (float32 matrix)*/
const void* B = nullptr; /**< address of B (packed nbits blob)*/
float* C = nullptr; /**< address of result matrix */
size_t lda = 0; /**< leading dimension of A */
size_t ldc = 0; /**< leading dimension of C*/
};

/**
* @brief Compute the byte size of the parameter combination
*
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param block_size size of the block to quantize, elements from the same block share the same
* scale and zero point
* @param nbits number of bits used for weight quantization
* @param is_asym flag for asymmetric quantization
* @param comp_type specify input data type and accumulator data type
* @return size of the packing buffer, 0 if the operation is not yet supported.
*/
size_t MLASCALL
MlasNBitsGemmPackBSize(
size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type
);

/**
* @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers.
*
* @param PackedBuf packed data buffer
* @param QData quantized data buffer
* @param Scale scale pointer
* @param Zp zero point pointer
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param ldb leading dimension of B
* @param block_size size of the block to quantize, elements from the same block share the same
* scale and zero point
* @param nbits number of bits used for weight quantization (default 4)
* @param is_asym flag for asymmetric quantization
* @param comp_type specify input data type and accumulator data type
* @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
* one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
* they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* (is_asym is false) and Zp(is_asym is true).
* @param thread_pool
*/
void MLASCALL
MlasNBitsGemmPackB(
void* PackedBuf,
const uint8_t* QData,
const float* Scale,
const uint8_t* Zp,
size_t N,
size_t K,
size_t ldb,
size_t block_size,
int nbits,
bool is_asym,
bool last_call,
MLAS_SQNBIT_COMPUTE_TYPE comp_type,
MLAS_THREADPOOL* thread_pool
);

/**
* @brief Unpack and dequantize to fp32
*
* @param FpData unpacked float32 data
* @param PackedBuf quantized and packed data
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param ldb leading dimension of B
* @param thread_pool
*/
void MLASCALL
MlasNBitsGemmUnPackB(
float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool
);

/**
* @brief Get the workspace size required by computation.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @return Workspace size in bytes
*/
size_t MLASCALL
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
);

/**
* @brief Batched GEMM: C = A * B
* A, C must be a float32 matrix
* B must be a packed nbits blob
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @param[in] WorkSpace temporary buffer
* @param[in] ThreadPool
* @return
*/
void MLASCALL
MlasSQNBitsGemmBatchPackedB(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
void* WorkSpace,
MLAS_THREADPOOL* ThreadPool = nullptr
);
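
The declarations deleted above were meant to be called in a fixed order: size query, pack, workspace query, batched compute. The sketch below records that call sequence for reference; it is only meaningful against a tree from before this commit (when these entry points and the CompInt8 enumerator of MLAS_SQNBIT_COMPUTE_TYPE still existed), the include paths are abbreviated, and packing everything in one call with last_call=true plus ldb=N are simplifying assumptions.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "mlas.h"        // MLAS_THREADPOOL (path abbreviated: onnxruntime/core/mlas/inc)
#include "mlas_qnbit.h"  // pre-removal header containing the declarations shown above

// Sketch of the intended call order for the removed packed-B API (4-bit weights, int8 compute).
void RunPackedQ4Gemm(const uint8_t* q_data, const float* scales, const uint8_t* zero_points,
                     const float* A, float* C, size_t M, size_t N, size_t K, size_t block_size,
                     MLAS_THREADPOOL* pool) {
  const int nbits = 4;
  const bool is_asym = (zero_points != nullptr);

  // 1) Size of the packed blob for this shape / compute type; 0 means "not supported here".
  const size_t packed_size = MlasNBitsGemmPackBSize(N, K, block_size, nbits, is_asym, CompInt8);
  if (packed_size == 0) return;

  // 2) Pack quantized data, scales and (optionally) zero points into one blob.
  //    Everything is supplied in a single call, so last_call is true.
  std::vector<uint8_t> packed(packed_size);
  MlasNBitsGemmPackB(packed.data(), q_data, scales, zero_points, N, K, /*ldb=*/N,
                     block_size, nbits, is_asym, /*last_call=*/true, CompInt8, pool);

  // 3) Query the scratch space the batched kernel needs, then run C = A * B.
  MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS params{};
  params.A = A;
  params.lda = K;
  params.B = packed.data();
  params.C = C;
  params.ldc = N;
  const size_t ws_bytes = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, /*BatchN=*/1, &params);
  std::vector<int8_t> workspace(ws_bytes);
  MlasSQNBitsGemmBatchPackedB(M, N, K, /*BatchN=*/1, &params, workspace.data(), pool);
}
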
128 changes: 0 additions & 128 deletions onnxruntime/core/mlas/lib/sqnbitgemm.cpp
@@ -19,10 +19,6 @@ Module Name:

#include <cassert>

#ifdef MLAS_NEURAL_SPEED
#include "bestla_gemm.h"
#endif

namespace
{

@@ -694,127 +690,3 @@ MlasSQNBitGemmBatch(
ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN);
});
}

size_t MLASCALL
MlasNBitsGemmPackBSize(
size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType
)
{
#ifdef MLAS_NEURAL_SPEED
if (nbits == 4) {
auto jsize = BTLAQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType);
if (jsize) {
return jsize;
}
}
#endif
(void)(N);
(void)(K);
(void)(BlkSize);
(void)(nbits);
(void)(isAsym);
(void)(CompType);
return 0;
}

void MLASCALL
MlasNBitsGemmPackB(
void* PackedBuf,
const uint8_t* QData,
const float* Scale,
const uint8_t* Zp,
size_t N,
size_t K,
size_t ldb,
size_t BlkSize,
int nbits,
bool isAsym,
bool lastCall,
MLAS_SQNBIT_COMPUTE_TYPE CompType,
MLAS_THREADPOOL* ThreadPool
)
{
#ifdef MLAS_NEURAL_SPEED
if (nbits == 4) {
if (BTLAQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) {
return;
}
}
#endif
(void)(PackedBuf);
(void)(QData);
(void)(Scale);
(void)(Zp);
(void)(N);
(void)(K);
(void)(ldb);
(void)(BlkSize);
(void)(nbits);
(void)(isAsym);
(void)(lastCall);
(void)(CompType);
(void)(ThreadPool);
}

void MLASCALL
MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool)
{
#ifdef MLAS_NEURAL_SPEED
if (BTLAQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) {
return;
}
#endif
(void)(FpData);
(void)(PackedBuf);
(void)(N);
(void)(K);
(void)(ldb);
(void)(ThreadPool);
}

size_t MLASCALL
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
)
{
#ifdef MLAS_NEURAL_SPEED
return BTLASQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams);
#endif
(void)(M);
(void)(N);
(void)(K);
(void)(BatchN);
(void)(DataParams);
return 0;
}

void MLASCALL
MlasSQNBitsGemmBatchPackedB(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
void* WorkSpace,
MLAS_THREADPOOL* ThreadPool
)
{
GetMlasPlatform();
#ifdef MLAS_NEURAL_SPEED
if (BTLASQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast<int8_t*>(WorkSpace), ThreadPool)) {
// PackedWeight is created by bestla
return;
}
#endif
(void)(M);
(void)(N);
(void)(K);
(void)(BatchN);
(void)(DataParams);
(void)(WorkSpace);
(void)(ThreadPool);
}
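
Every body removed above follows the same compile-time fallback pattern: try the optional Neural Speed path when the build flag is set, otherwise cast each parameter to void and report the operation as unsupported (size 0 or a no-op). A self-contained sketch of that pattern; USE_FAST_BACKEND and FastBackendPackSize are hypothetical stand-ins for MLAS_NEURAL_SPEED and the bestla_gemm.h entry points.

#include <cstddef>

// Hypothetical optional backend, standing in for the bestla_gemm.h entry points.
#ifdef USE_FAST_BACKEND
size_t FastBackendPackSize(size_t n, size_t k, size_t blk);
#endif

// Size query mirroring the structure of the removed MlasNBitsGemmPackBSize:
// returning 0 tells the caller the packed path is unavailable.
size_t PackSize(size_t n, size_t k, size_t blk, int nbits) {
#ifdef USE_FAST_BACKEND
  if (nbits == 4) {
    if (size_t s = FastBackendPackSize(n, k, blk)) {
      return s;
    }
  }
#endif
  (void)n;  // silence unused-parameter warnings in the fallback build
  (void)k;
  (void)blk;
  (void)nbits;
  return 0;
}
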
