fix after rebase origin
luoyu-intel committed Jan 15, 2024
1 parent eb65b7a commit 65cbb81
Showing 6 changed files with 21 additions and 340 deletions.
12 changes: 6 additions & 6 deletions onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -56,9 +56,9 @@ class MatMulNBits final : public OpKernel {
const size_t nbits_;
const int64_t accuracy_level_;
const bool column_wise_quant_{true};
-#ifdef ORT_NEURAL_SPEED
IAllocatorUniquePtr<void> packed_b_;
size_t packed_b_size_{0};
+#ifdef ORT_NEURAL_SPEED
bool is_asym_{false};
bool all_constant_{false};
#endif
@@ -113,7 +113,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
is_packed = true;
}

-#else // defined(MLAS_JBLAS)
+#else // defined(ORT_NEURAL_SPEED)

if (input_idx == 1) {
packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_);
@@ -128,7 +128,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat
is_packed = true;
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)

return Status::OK();
}
@@ -151,14 +151,14 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prep
packed_b_ = std::move(prepacked_buffers[0]);
}

-#else // defined(MLAS_JBLAS)
+#else // defined(ORT_NEURAL_SPEED)

if (input_idx == 1) {
used_shared_buffers = true;
packed_b_ = std::move(prepacked_buffers[0]);
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)
return Status::OK();
}

@@ -204,7 +204,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
return Status::OK();
}

-#endif // defined(MLAS_JBLAS)
+#endif // defined(ORT_NEURAL_SPEED)

const Tensor* scales = ctx->Input<Tensor>(2);
const Tensor* zero_points = ctx->Input<Tensor>(3);
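
Context for the hunks above (not part of this commit): in the #else branch, PrePack sizes and fills packed_b_ with the MLAS packing helpers instead of the Neural Speed ones. The sketch below shows that flow in isolation; the full parameter list of MlasSQNBitGemmPackQuantBData is not visible in this diff, so the declarations here are assumptions, and a plain std::unique_ptr stands in for the IAllocatorUniquePtr used by the kernel.

#include <cstddef>
#include <memory>

// Assumed declarations, modelled on onnxruntime/core/mlas/inc/mlas_qnbit.h;
// the real header takes an MLAS_THREADPOOL* and is used together with an IAllocatorUniquePtr.
size_t MlasSQNBitGemmPackQuantBDataSize(size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen);
void MlasSQNBitGemmPackQuantBData(size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen,
                                  const void* QuantBData, void* PackedQuantBData,
                                  void* ThreadPool = nullptr);

// Sketch of the non-Neural-Speed PrePack branch for input_idx == 1 (the quantized B tensor).
bool PrePackQuantB(const void* quant_b_data, size_t N, size_t K, size_t nbits, size_t block_size,
                   std::unique_ptr<unsigned char[]>& packed_b, size_t& packed_b_size) {
  packed_b_size = MlasSQNBitGemmPackQuantBDataSize(N, K, nbits, block_size);
  if (packed_b_size == 0) {
    return false;  // packing not supported for this configuration; keep using the unpacked B
  }
  packed_b = std::make_unique<unsigned char[]>(packed_b_size);
  MlasSQNBitGemmPackQuantBData(N, K, nbits, block_size, quant_b_data, packed_b.get());
  return true;  // caller then sets is_packed = true
}
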
20 changes: 10 additions & 10 deletions onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc
@@ -323,7 +323,7 @@ static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym,
}
// from low precision to high precision
switch (CompType) {
-case CompInt8:
+case NSCompInt8:
if (!isAsym) { // asym int8 is not optimized, so fall through to others.
if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
return NSQ4BuSize<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(BlkSize, N, K, isAsym);
@@ -336,10 +336,10 @@
}
}
[[fallthrough]];
-case CompBf16:
-case CompFp16:
-case CompFp32:
-case CompUndef:
+case NSCompBf16:
+case NSCompFp16:
+case NSCompFp32:
+case NSCompUndef:
if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
return NSQ4BuSize<tWeiNInt<tAVX512F, tAVX512F::ISA>>(BlkSize, N, K, isAsym);
}
@@ -358,7 +358,7 @@ static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Sc
GetCPUDevice();
// explicit statement fall through.
switch (CompType) {
-case CompInt8:
+case NSCompInt8:
if (!isAsym) { // asym int8 is not optimized, so fall through to others.
if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) {
NSQ4GemmPackBImpl<tWeiNInt<tAMX_INT8_SS_KBlock, tAMX_INT8_SS_KBlock::ISA>>(
@@ -377,10 +377,10 @@
}
}
[[fallthrough]];
-case CompBf16:
-case CompFp16:
-case CompFp32:
-case CompUndef:
+case NSCompBf16:
+case NSCompFp16:
+case NSCompFp32:
+case NSCompUndef:
if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) {
NSQ4GemmPackBImpl<tWeiNInt<tAVX512F, tAVX512F::ISA>>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym,
lastCall, ldb, ThreadPool);
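
Both switches above use the same dispatch idiom: start at the requested (lowest-precision) compute type and fall through to progressively more general kernels until one matches the CPU features and block size. A standalone sketch of that idiom follows; HasAmxInt8/HasAvx512f and the tile sizes 64/16 are placeholders for the bestla device queries and KTILE constants, not the real values.

#include <cstddef>
#include <cstdio>

enum NS_SQNBIT_COMPUTE_TYPE { NSCompUndef = 0, NSCompFp32, NSCompFp16, NSCompBf16, NSCompInt8 };

// Hypothetical stand-ins for the CPU-feature queries behind GetCPUDevice()/AMX_INT8()/AVX512F().
bool HasAmxInt8() { return false; }
bool HasAvx512f() { return true; }

// Pick a kernel the same way NSQ4GemmPackBSize/NSQ4GemmPackB do: prefer int8 tiles when
// requested and supported, otherwise fall through to the fp32 kernels.
const char* SelectKernel(NS_SQNBIT_COMPUTE_TYPE comp, size_t blk_size, bool is_asym) {
  switch (comp) {
    case NSCompInt8:
      if (!is_asym) {  // asym int8 path is not optimized, so fall through to the others
        if (HasAmxInt8() && blk_size % 64 == 0) return "AMX-INT8 kernel";
      }
      [[fallthrough]];
    case NSCompBf16:
    case NSCompFp16:
    case NSCompFp32:
    case NSCompUndef:
      if (HasAvx512f() && blk_size % 16 == 0) return "AVX512F kernel";
      return "reference kernel";
  }
  return "reference kernel";
}

int main() {
  std::printf("%s\n", SelectKernel(NSCompInt8, 32, /*is_asym=*/false));
}
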
10 changes: 5 additions & 5 deletions onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h
@@ -22,11 +22,11 @@ Module Name:
* @brief Define compute types of block quantization
*/
enum NS_SQNBIT_COMPUTE_TYPE {
-CompUndef = 0, /*!< undef */
-CompFp32 = 1, /*!< input fp32, accumulator fp32 */
-CompFp16 = 2, /*!< input fp16, accumulator fp16 */
-CompBf16 = 3, /*!< input bf16, accumulator fp32 */
-CompInt8 = 4 /*!< input int8, accumulator int32 */
+NSCompUndef = 0, /*!< undef */
+NSCompFp32 = 1, /*!< input fp32, accumulator fp32 */
+NSCompFp16 = 2, /*!< input fp16, accumulator fp16 */
+NSCompBf16 = 3, /*!< input bf16, accumulator fp32 */
+NSCompInt8 = 4 /*!< input int8, accumulator int32 */
};

/**
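
The header change is a pure rename: the enumerators gain an NS prefix so they no longer collide with the MLAS compute-type names. Below is a minimal, self-contained sketch of a caller that maps an op-level accuracy knob to the renamed values; the accuracy-level mapping is illustrative and not taken from this commit.

#include <cstdint>

enum NS_SQNBIT_COMPUTE_TYPE {
  NSCompUndef = 0, /*!< undef */
  NSCompFp32 = 1,  /*!< input fp32, accumulator fp32 */
  NSCompFp16 = 2,  /*!< input fp16, accumulator fp16 */
  NSCompBf16 = 3,  /*!< input bf16, accumulator fp32 */
  NSCompInt8 = 4   /*!< input int8, accumulator int32 */
};

// Hypothetical mapping from an accuracy-level attribute to the renamed compute types.
NS_SQNBIT_COMPUTE_TYPE ComputeTypeFromAccuracyLevel(int64_t accuracy_level) {
  switch (accuracy_level) {
    case 4: return NSCompInt8;    // fastest: int8 inputs, int32 accumulation
    case 3: return NSCompBf16;
    case 2: return NSCompFp16;
    case 1: return NSCompFp32;    // most accurate quantized path
    default: return NSCompUndef;  // let the backend decide
  }
}

int main() { return ComputeTypeFromAccuracyLevel(4) == NSCompInt8 ? 0 : 1; }
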
130 changes: 0 additions & 130 deletions onnxruntime/core/mlas/inc/mlas_qnbit.h
@@ -183,133 +183,3 @@ MlasSQNBitGemmPackQuantBData(
void* PackedQuantBData,
MLAS_THREADPOOL* ThreadPool = nullptr
);

/**
* @brief Data parameters for NBits GEMM routine
* C = A * B
* A, C must be a float32 matrix
* B must be a packed nbits blob
* All except C are [in] parameters
*/
struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS {
const float* A = nullptr; /**< address of A (float32 matrix)*/
const void* B = nullptr; /**< address of B (packed nbits blob)*/
float* C = nullptr; /**< address of result matrix */
size_t lda = 0; /**< leading dimension of A */
size_t ldc = 0; /**< leading dimension of C*/
};

/**
* @brief Compute the byte size of the parameter combination
*
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param block_size size of the block to quantize, elements from the same block share the same
* scale and zero point
* @param nbits number of bits used for weight quantization
* @param is_asym flag for asymmetric quantization
* @param comp_type specify input data type and accumulator data type
* @return size of the packing buffer, 0 if the operation is not yet supported.
*/
size_t MLASCALL
MlasNBitsGemmPackBSize(
size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type
);

/**
* @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers.
*
* @param PackedBuf packed data buffer
* @param QData quantized data buffer
* @param Scale scale pointer
* @param Zp zero point pointer
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param ldb leading dimension of B
* @param block_size size of the block to quantize, elements from the same block share the same
* scale and zero point
* @param nbits number of bits used for weight quantization (default 4)
* @param is_asym flag for asymmetric quantization
* @param comp_type specify input data type and accumulator data type
* @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor
* one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where
* they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up
* inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale
* (is_asym is false) and Zp(is_asym is true).
* @param thread_pool
*/
void MLASCALL
MlasNBitsGemmPackB(
void* PackedBuf,
const uint8_t* QData,
const float* Scale,
const uint8_t* Zp,
size_t N,
size_t K,
size_t ldb,
size_t block_size,
int nbits,
bool is_asym,
bool last_call,
MLAS_SQNBIT_COMPUTE_TYPE comp_type,
MLAS_THREADPOOL* thread_pool
);

/**
* @brief Unpack and dequantize to fp32
*
* @param FpData unpacked float32 data
* @param PackedBuf quantized and packed data
* @param N the number of columns of matrix B.
* @param K the number of rows of matrix B.
* @param ldb leading dimension of B
* @param thread_pool
*/
void MLASCALL
MlasNBitsGemmUnPackB(
float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool
);

/**
* @brief Get the workspace size required by computation.
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @return Workspace size in bytes
*/
size_t MLASCALL
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
);

/**
* @brief Batched GEMM: C = A * B
* A, C must be a float32 matrix
* B must be a packed nbits blob
*
* @param[in] M row size of matrix A and C
* @param[in] N column size of matrix B and C
* @param[in] K column size of matrix A and row size of matrix B
* @param[in] BatchN number of batches
* @param[inout] DataParams An array (size BatchN) of parameter blocks
* @param[in] WorkSpace temporary buffer
* @param[in] ThreadPool
* @return
*/
void MLASCALL
MlasSQNBitsGemmBatchPackedB(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
void* WorkSpace,
MLAS_THREADPOOL* ThreadPool = nullptr
);
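
The declarations deleted above were meant to be called in a fixed order: size query, pack, workspace query, batched compute. The sketch below records that call sequence for reference; it is only meaningful against a tree from before this commit (when these entry points and the CompInt8 enumerator of MLAS_SQNBIT_COMPUTE_TYPE still existed), the include paths are abbreviated, and packing everything in one call with last_call=true plus ldb=N are simplifying assumptions.

#include <cstddef>
#include <cstdint>
#include <vector>

#include "mlas.h"        // MLAS_THREADPOOL (path abbreviated: onnxruntime/core/mlas/inc)
#include "mlas_qnbit.h"  // pre-removal header containing the declarations shown above

// Sketch of the intended call order for the removed packed-B API (4-bit weights, int8 compute).
void RunPackedQ4Gemm(const uint8_t* q_data, const float* scales, const uint8_t* zero_points,
                     const float* A, float* C, size_t M, size_t N, size_t K, size_t block_size,
                     MLAS_THREADPOOL* pool) {
  const int nbits = 4;
  const bool is_asym = (zero_points != nullptr);

  // 1) Size of the packed blob for this shape / compute type; 0 means "not supported here".
  const size_t packed_size = MlasNBitsGemmPackBSize(N, K, block_size, nbits, is_asym, CompInt8);
  if (packed_size == 0) return;

  // 2) Pack quantized data, scales and (optionally) zero points into one blob.
  //    Everything is supplied in a single call, so last_call is true.
  std::vector<uint8_t> packed(packed_size);
  MlasNBitsGemmPackB(packed.data(), q_data, scales, zero_points, N, K, /*ldb=*/N,
                     block_size, nbits, is_asym, /*last_call=*/true, CompInt8, pool);

  // 3) Query the scratch space the batched kernel needs, then run C = A * B.
  MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS params{};
  params.A = A;
  params.lda = K;
  params.B = packed.data();
  params.C = C;
  params.ldc = N;
  const size_t ws_bytes = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, /*BatchN=*/1, &params);
  std::vector<int8_t> workspace(ws_bytes);
  MlasSQNBitsGemmBatchPackedB(M, N, K, /*BatchN=*/1, &params, workspace.data(), pool);
}
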
128 changes: 0 additions & 128 deletions onnxruntime/core/mlas/lib/sqnbitgemm.cpp
@@ -19,10 +19,6 @@ Module Name:

#include <cassert>

#ifdef MLAS_NEURAL_SPEED
#include "bestla_gemm.h"
#endif

namespace
{

@@ -694,127 +690,3 @@ MlasSQNBitGemmBatch(
ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN);
});
}

size_t MLASCALL
MlasNBitsGemmPackBSize(
size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType
)
{
#ifdef MLAS_NEURAL_SPEED
if (nbits == 4) {
auto jsize = BTLAQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType);
if (jsize) {
return jsize;
}
}
#endif
(void)(N);
(void)(K);
(void)(BlkSize);
(void)(nbits);
(void)(isAsym);
(void)(CompType);
return 0;
}

void MLASCALL
MlasNBitsGemmPackB(
void* PackedBuf,
const uint8_t* QData,
const float* Scale,
const uint8_t* Zp,
size_t N,
size_t K,
size_t ldb,
size_t BlkSize,
int nbits,
bool isAsym,
bool lastCall,
MLAS_SQNBIT_COMPUTE_TYPE CompType,
MLAS_THREADPOOL* ThreadPool
)
{
#ifdef MLAS_NEURAL_SPEED
if (nbits == 4) {
if (BTLAQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) {
return;
}
}
#endif
(void)(PackedBuf);
(void)(QData);
(void)(Scale);
(void)(Zp);
(void)(N);
(void)(K);
(void)(ldb);
(void)(BlkSize);
(void)(nbits);
(void)(isAsym);
(void)(lastCall);
(void)(CompType);
(void)(ThreadPool);
}

void MLASCALL
MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool)
{
#ifdef MLAS_NEURAL_SPEED
if (BTLAQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) {
return;
}
#endif
(void)(FpData);
(void)(PackedBuf);
(void)(N);
(void)(K);
(void)(ldb);
(void)(ThreadPool);
}

size_t MLASCALL
MlasSQNBitsGemmBatchPackedBWorkspaceSize(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams
)
{
#ifdef MLAS_NEURAL_SPEED
return BTLASQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams);
#endif
(void)(M);
(void)(N);
(void)(K);
(void)(BatchN);
(void)(DataParams);
return 0;
}

void MLASCALL
MlasSQNBitsGemmBatchPackedB(
const size_t M,
const size_t N,
const size_t K,
const size_t BatchN,
const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams,
void* WorkSpace,
MLAS_THREADPOOL* ThreadPool
)
{
GetMlasPlatform();
#ifdef MLAS_NEURAL_SPEED
if (BTLASQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast<int8_t*>(WorkSpace), ThreadPool)) {
// PackedWeight is created by bestla
return;
}
#endif
(void)(M);
(void)(N);
(void)(K);
(void)(BatchN);
(void)(DataParams);
(void)(WorkSpace);
(void)(ThreadPool);
}
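
Every body removed above follows the same compile-time fallback pattern: try the optional Neural Speed path when the build flag is set, otherwise cast each parameter to void and report the operation as unsupported (size 0 or a no-op). A self-contained sketch of that pattern; USE_FAST_BACKEND and FastBackendPackSize are hypothetical stand-ins for MLAS_NEURAL_SPEED and the bestla_gemm.h entry points.

#include <cstddef>

// Hypothetical optional backend, standing in for the bestla_gemm.h entry points.
#ifdef USE_FAST_BACKEND
size_t FastBackendPackSize(size_t n, size_t k, size_t blk);
#endif

// Size query mirroring the structure of the removed MlasNBitsGemmPackBSize:
// returning 0 tells the caller the packed path is unavailable.
size_t PackSize(size_t n, size_t k, size_t blk, int nbits) {
#ifdef USE_FAST_BACKEND
  if (nbits == 4) {
    if (size_t s = FastBackendPackSize(n, k, blk)) {
      return s;
    }
  }
#endif
  (void)n;  // silence unused-parameter warnings in the fallback build
  (void)k;
  (void)blk;
  (void)nbits;
  return 0;
}
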
