From 5805f33b705ca92963a60afce704e0436836694e Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Wed, 22 Nov 2023 11:44:59 +0800
Subject: [PATCH] set USE_JBLAS on as default. fix some lint

---
 cmake/CMakeLists.txt                                  |  2 +-
 docs/ContribOperators.md                              |  2 ++
 .../contrib_ops/cpu/quantization/matmul_nbits.cc      |  3 ++-
 onnxruntime/core/graph/contrib_ops/contrib_defs.cc    | 12 ++++++------
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 461128afedf12..ffdb2090106d3 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -87,7 +87,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
 option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
 option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
-option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" OFF)
+option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
 option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index c73f978bdf404..9e247092a6596 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -2828,6 +2828,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>number of bits used for weight quantization (default 4)</dd>
 <dt><tt>block_size</tt> : int (required)</dt>
 <dd>number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.</dd>
+<dt><tt>accuracy_level</tt> : int</dt>
+<dd>The minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) (default unset). It is used to control how input A is quantized or downcast internally while doing computation, for example: 0 means input A will not be quantized or downcast while doing computation. 4 means input A can be quantized with the same block_size to int8 internally from type T1.</dd>
 </dl>
 
 #### Inputs (3 - 4)
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index bca54a72fe8a7..c2306f36de242 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -162,7 +162,8 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
       gemm_params[i].C = y_data + helper.OutputOffsets()[i];
       gemm_params[i].ldc = N;
     }
-    MlasNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), reinterpret_cast<int8_t*>(ws_ptr.get()), thread_pool);
+    MlasNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), reinterpret_cast<int8_t*>(ws_ptr.get()),
+                              thread_pool);
     return Status::OK();
   }
 
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index e38270d2f27a9..e2e4cdf4950ad 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3364,12 +3364,12 @@ Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored
       .Attr("bits", "number of bits used for weight quantization (default 4)", AttributeProto::INT)
       .Attr("block_size", "number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.", AttributeProto::INT)
       .Attr("accuracy_level",
-            "minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) (default unset)."
-            "It is used to control how input A is quantized or downcast internally while doing computation, for example:"
-            "0 means input A will not be quantized or downcast while doing computation."
-            "4 means input A can be quantized with the same block_size to int8 internally from type T1.",
-            AttributeProto::INT,
-            static_cast<int64_t>(0))
+            "The minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) "
+            "(default unset). It is used to control how input A is quantized or downcast internally while "
+            "doing computation, for example: 0 means input A will not be quantized or downcast while doing "
+            "computation. 4 means input A can be quantized with the same block_size to int8 internally from "
+            "type T1.",
+            AttributeProto::INT, static_cast<int64_t>(0))
       .Input(0, "A", "The input tensor, not quantized", "T1")
       .Input(1, "B", "1-dimensional data blob", "T2")
       .Input(2, "scales", "quantization scale", "T1")
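
A minimal usage sketch (illustrative only, not taken from this patch) of how a graph author might set the new accuracy_level attribute on a MatMulNBits node with onnx.helper; the tensor names and the K/N sizes below are placeholders, not values from this change.

    import onnx
    from onnx import helper

    # Hypothetical example: 4-bit weights, 128-wide quantization groups, and
    # accuracy_level=4 so the kernel may quantize input A to int8 internally.
    node = helper.make_node(
        "MatMulNBits",
        inputs=["A", "B_quant", "scales"],  # zero_points is an optional 4th input
        outputs=["Y"],
        domain="com.microsoft",
        K=4096,             # placeholder input feature size
        N=4096,             # placeholder output feature size
        bits=4,             # number of bits used for weight quantization
        block_size=128,     # must be a power of 2 and not smaller than 16
        accuracy_level=4,   # 0(unset), 1(fp32), 2(fp16), 3(bf16), 4(int8)
    )

With accuracy_level left at its default (0), input A is not quantized or downcast during computation; setting it to 4 allows the kernel to quantize A to int8 with the same block_size internally, trading some accuracy for speed.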