diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 461128afedf12..ffdb2090106d3 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -87,7 +87,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
-option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" OFF)
+option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index c73f978bdf404..9e247092a6596 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -2828,6 +2828,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>number of bits used for weight quantization (default 4)</dd>
<dt><tt>block_size</tt> : int (required)</dt>
<dd>number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.</dd>
+<dt><tt>accuracy_level</tt> : int</dt>
+<dd>The minimum accuracy level of input A; one of 0 (unset), 1 (fp32), 2 (fp16), 3 (bf16), or 4 (int8) (default unset). It controls how input A may be quantized or downcast internally during computation: for example, 0 means input A is never quantized or downcast during computation, while 4 means input A may be quantized internally from type T1 to int8 with the same block_size.</dd>
</dl>

#### Inputs (3 - 4)
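For readers checking what this attribute means in practice, the sketch below mirrors the documented dispatch in C++; the enum, function, and helper names are hypothetical illustrations of the doc text above, not ONNX Runtime code.

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical mirror of the documented accuracy levels; the kernel's real
// internal representation may differ.
enum class AccuracyLevel : int64_t { kUnset = 0, kFp32 = 1, kFp16 = 2, kBf16 = 3, kInt8 = 4 };

// Sketch of how input A could be treated before the GEMM, per the doc text.
// QuantizeAToInt8 and DowncastA are illustrative placeholders.
void PrepareInputA(const float* a, size_t size, int64_t block_size, AccuracyLevel level) {
  switch (level) {
    case AccuracyLevel::kUnset:
      break;  // 0: A is used as-is, never quantized or downcast.
    case AccuracyLevel::kInt8:
      // 4: A may be block-quantized to int8, reusing B's block_size.
      // QuantizeAToInt8(a, size, block_size);
      break;
    default:
      // 1/2/3: A may be computed in fp32/fp16/bf16 respectively.
      // DowncastA(a, size, level);
      break;
  }
}
```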
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index bca54a72fe8a7..c2306f36de242 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -162,7 +162,8 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const {
gemm_params[i].C = y_data + helper.OutputOffsets()[i];
gemm_params[i].ldc = N;
}
- MlasNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), reinterpret_cast<int8_t*>(ws_ptr.get()), thread_pool);
+ MlasNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), reinterpret_cast<int8_t*>(ws_ptr.get()),
+                           thread_pool);
return Status::OK();
}
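The one-line change above is just a rewrap, but for orientation here is a condensed sketch of the pattern around that call: one parameter entry per batch, a shared scratch workspace, then a single batched MLAS call. The struct and field names are illustrative stand-ins inferred from the diff (C, ldc, a workspace pointer, a thread pool), not the exact MLAS declarations.

```cpp
#include <cstdint>
#include <vector>

// Illustrative stand-in for the per-batch parameter block at the call site.
struct NBitsGemmParams {
  const float* A;  // activations for this batch entry
  float* C;        // output tile; offset per batch entry as in the diff
  size_t ldc;      // leading dimension of C (N at this call site)
};

// Sketch of the surrounding pattern; the real call is the batched
// MlasNBitsGemmBatchPackedB(M, N, K, batch_count, params, workspace, pool).
void RunBatchedNBitsGemm(size_t batch_count, size_t N, const float* a_data, float* y_data,
                         const std::vector<size_t>& output_offsets) {
  std::vector<NBitsGemmParams> gemm_params(batch_count);
  for (size_t i = 0; i < batch_count; ++i) {
    gemm_params[i].A = a_data;                      // simplification: shared A
    gemm_params[i].C = y_data + output_offsets[i];  // per-batch output offset
    gemm_params[i].ldc = N;
  }
  // MlasNBitsGemmBatchPackedB(M, N, K, batch_count, gemm_params.data(), workspace, thread_pool);
}
```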
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index e38270d2f27a9..e2e4cdf4950ad 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3364,12 +3364,12 @@ Input zero_points is stored as uint8_t. If bits <= 4, two zero points are stored
.Attr("bits", "number of bits used for weight quantization (default 4)", AttributeProto::INT)
.Attr("block_size", "number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.", AttributeProto::INT)
.Attr("accuracy_level",
- "minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) (default unset)."
- "It is used to control how input A is quantized or downcast internally while doing computation, for example:"
- "0 means input A will not be quantized or downcast while doing computation."
- "4 means input A can be quantized with the same block_size to int8 internally from type T1.",
- AttributeProto::INT,
- static_cast<int64_t>(0))
+ "The minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) "
+ "(default unset). It is used to control how input A is quantized or downcast internally while "
+ "doing computation, for example: 0 means input A will not be quantized or downcast while doing "
+ "computation. 4 means input A can be quantized with the same block_size to int8 internally from "
+ "type T1.",
+ AttributeProto::INT, static_cast(0))
.Input(0, "A", "The input tensor, not quantized", "T1")
.Input(1, "B", "1-dimensional data blob", "T2")
.Input(2, "scales", "quantization scale", "T1")