diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 2e2e9c25210d6..23b55745ddaeb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -354,13 +354,7 @@ if (onnxruntime_USE_ROCM) endif() endif() -if (APPLE) - if (NOT CMAKE_OSX_ARCHITECTURES) - message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -elseif (NOT WIN32 AND NOT APPLE) - message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") -endif() + # Single output director for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -493,6 +487,14 @@ endif() include(adjust_global_compile_flags.cmake) +if (APPLE) + if (NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH") + endif() +elseif (NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH") +endif() + # We need to link with libatomic on systems that do not have built-in atomics, or # don't have built-in support for 8 byte atomics # Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt @@ -639,7 +641,16 @@ else() check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) check_function_exists(reallocarray HAS_REALLOCARRAY) - + if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") + check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) + if(NOT HAS_ARM64_BFLOAT16) + message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!") + endif() + check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16) + if(NOT HAS_ARM64_FLOAT16) + message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!") + endif() + endif() if (HAS_TAUTOLOGICAL_POINTER_COMPARE) #we may have extra null pointer checkings in debug build, it's not an issue list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index e825bfeaea952..9f00c873715f4 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -300,6 +300,31 @@ if (MSVC) endif() else() if (NOT APPLE) + #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker + #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report + # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not + # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now. + if(NOT CMAKE_SYSTEM_PROCESSOR) + message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. Please set it in your toolchain cmake file.") + # Try to detect it + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + execute_process( + COMMAND "${CMAKE_C_COMPILER}" -dumpmachine + OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE _err + RESULT_VARIABLE _res + ) + if(NOT _res EQUAL 0) + message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}") + endif() + string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}") + list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN) + if(GCC_TRIPLET_LEN EQUAL 4) + list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR) + message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() + endif() set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) endif() if (onnxruntime_BUILD_FOR_NATIVE_MACHINE) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 8e1ec782079be..06c3c6c196501 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -17,7 +17,7 @@ import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; import {pad, parsePadAttributes} from './ops/pad'; @@ -83,7 +83,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 3dc4e957e0fee..035d89755c7d7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -47,7 +47,7 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => const outputShape: number[] = calculateOutputShape(inputShape, shape); const dataType = inputs[0].dataType; const components = dataType === DataType.bool ? 4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const enableInputShapeUniform = enableShapesUniforms(inputShape.length); const enableOutputShapeUniform = enableShapesUniforms(outputShape.length); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 53ca094abfd62..469249f92ff28 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -31,7 +31,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath const axisDimLimit = inputShape[axis]; const components = inputs[0].dataType === DataType.bool ? 4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const enableInputShapesUniforms = enableShapesUniforms(inputs[0].dims.length); const inputShapeOrRank = enableInputShapesUniforms ? inputs[0].dims.length : inputs[0].dims; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9eeecf2c68d..bc446079faf8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -4,12 +4,11 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common'; -export interface LayerNormAttributes extends AttributeWithCacheKey { +interface LayerNormAttributes { axis: number; epsilon: number; } @@ -39,7 +38,7 @@ const createLayerNormProgramInfo = Got scale size of ${scaleSize} and bias size of ${biasSize}`); } - const meanInvStdDevDim = []; + const meanInvStdDevDim: number[] = []; for (let i = 0; i < xShape.length; ++i) { if (i < axis) { meanInvStdDevDim.push(xShape[i]); @@ -47,50 +46,57 @@ const createLayerNormProgramInfo = meanInvStdDevDim.push(1); } } - const components = getMaxComponents(normSize); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('scale', scale.dataType, scale.dims, components), + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: normCount}, {type: 'float32', data: normSize}, + {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon} ]; if (bias) { - variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + inputDependencies.push('type'); } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - const hasMeanDataOutput = outputCount > 1; const hasInvStdOutput = outputCount > 2; - if (hasMeanDataOutput) { - variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const normSize: f32 = ${normSize}; - const normSizeVectorized: u32 = ${normSize / components}; - const epsilon: f32 = ${attributes.epsilon}; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('scale', scale.dataType, scale.dims, components), + ]; + if (bias) { + variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanDataOutput) { + variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } - ${shaderHelper.declareVariables(...variables)} + const uniforms: UniformsArrayType = [ + {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'}, + {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)} - let offset = global_idx * normSizeVectorized; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} + let offset = global_idx * uniforms.norm_size_vectorized; var meanVector = ${fillVector('f32', components)}; var meanSquareVector = ${fillVector('f32', components)}; - for (var h: u32 = 0u; h < normSizeVectorized; h++) { + for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; meanVector += value; meanSquareVector += value * value; } - let mean = ${sumVector('meanVector', components)} / normSize; - let meanSquare = sqrt(${sumVector('meanSquareVector', components)} - / normSize - mean * mean + epsilon); + let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; + let meanSquare = sqrt(${sumVector('meanSquareVector', components)} + / uniforms.norm_size - mean * mean + uniforms.epsilon); - for (var j: u32 = 0; j < normSizeVectorized; j++) { + for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale @@ -98,9 +104,10 @@ const createLayerNormProgramInfo = ); } - ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''}; + ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = 1 / meanSquare' : ''}; }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (hasMeanDataOutput) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -111,15 +118,13 @@ const createLayerNormProgramInfo = return { name: 'LayerNormalization', - shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`}, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}), + shaderCache: {hint: `${components};${outputCount}`, inputDependencies}, + getRunData: () => + ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}), getShaderSource, }; }; -export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes => - createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon}); - export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => { validateInputs(context.inputs); context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount)); diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc index 22bc04d558d98..613b4507b2b15 100644 --- a/js/web/test/data/ops/expand.jsonc +++ b/js/web/test/data/ops/expand.jsonc @@ -168,20 +168,39 @@ "name": "Expand - last dim is not divisible by 4", "inputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [true, false, false, true, true, true], + "dims": [1, 6], "type": "bool" }, { - "data": [2, 1], + "data": [3, 1], "dims": [2], "type": "int64" } ], "outputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [ + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true + ], + "dims": [3, 6], "type": "bool" } ] diff --git a/js/web/test/data/ops/gather.jsonc b/js/web/test/data/ops/gather.jsonc index 0be077d237b88..d218d120d356d 100644 --- a/js/web/test/data/ops/gather.jsonc +++ b/js/web/test/data/ops/gather.jsonc @@ -99,6 +99,28 @@ "operator": "Gather", "attributes": [], "cases": [ + { + "name": "data[4] indices[]", + "inputs": [ + { + "data": [false, true, false, false], + "dims": [4], + "type": "bool" + }, + { + "data": [1], + "dims": [], + "type": "int32" + } + ], + "outputs": [ + { + "data": [true], + "dims": [], + "type": "bool" + } + ] + }, { "name": "data[2,4] indices[1]", "inputs": [ diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index d597e0d57fbcb..63804f79a32fb 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -76,7 +76,12 @@ struct FNVHash { void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) { int spatial_dim = 1; #if ROCM_VERSION >= 50500 - miopenGetConvolutionSpatialDim(cdesc, &spatial_dim); + MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim)); + std::vector pads{spatial_dim}; + std::vector strides{spatial_dim}; + std::vector dilations{spatial_dim}; + miopenConvolutionMode_t mode; + MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode)); #else // Previous versions of MIOpen doesn't provide API to probe the dimension of a // miopenConvolutionDescriptor_t, so we have to guess. @@ -100,11 +105,12 @@ struct FNVHash { pads.resize(spatial_dim); strides.resize(spatial_dim); dilations.resize(spatial_dim); +#endif (*this) << spatial_dim; (*this) << pads; (*this) << strides; (*this) << dilations; -#endif + (*this) << mode; } private: @@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv { auto ret = miopenCompileFusionPlan(handle, fusion->plan); if (miopenStatusSuccess == ret) { fusion->compiled_on.insert(handle); + } else { + return ret; } return miopenStatusSuccess; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index adb4fd131119f..c6a15e76f4736 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder // The tensor description's size should be no larger than the constant input unless it was rounded to // the required alignment. assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); auto data = static_cast(constantInput->GetData()); std::vector tensorData(data, data + minimumConstantSize); diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 5c2d1f0b881ba..b850bea4bc275 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -67,6 +67,7 @@ namespace js { float value; \ ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \ , ({#attr_name : $1}), static_cast(value)) +#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr) // TODO: // class JsMultiProgramKernel : public OpKernel { /* TBD */ }; diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 5c0fbf93a4004..98a530c6b77f6 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -54,13 +54,13 @@ class ConvBase : public JsKernel { static_cast(conv_attrs_.group), static_cast(kernel_shape_0), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } else { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format" : $11 ? "NHWC" : "NCHW", @@ -81,14 +81,14 @@ class ConvBase : public JsKernel { static_cast(kernel_shape_0), static_cast(kernel_shape_1), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } } diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h index 5d30dc851e00f..353a946e95c21 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.h +++ b/onnxruntime/core/providers/js/operators/conv_transpose.h @@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel { static_cast(pads_1), static_cast(strides), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } else { constexpr size_t pads_vec_size = 4; @@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel { "activation" : UTF8ToString($13) }), static_cast(conv_transpose_attrs_.auto_pad), - reinterpret_cast(local_dilations.data()) >> 2, + JSEP_HEAP_PTR(local_dilations.data()) >> 2, static_cast(conv_transpose_attrs_.group), - reinterpret_cast(local_kernel_shape.data()) >> 2, - reinterpret_cast(local_pads.data()) >> 2, - reinterpret_cast(local_strides.data()) >> 2, + JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2, + JSEP_HEAP_PTR(local_pads.data()) >> 2, + JSEP_HEAP_PTR(local_strides.data()) >> 2, static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } } diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h index 19168f40b4722..bf808be949cf8 100644 --- a/onnxruntime/core/providers/js/operators/pad.h +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase { static_cast(mode_), static_cast(value_), gsl::narrow_cast(pads.size()), - reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2); + JSEP_HEAP_PTR((pads.size() > 0) ? pads.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h index a5a4aa834c2ca..95c4f2bec230d 100644 --- a/onnxruntime/core/providers/js/operators/reduce.h +++ b/onnxruntime/core/providers/js/operators/reduce.h @@ -8,29 +8,29 @@ namespace onnxruntime { namespace js { -#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ - template \ - class ReduceKernel : public JsKernel, public ReduceKernelBase { \ - public: \ - using ReduceKernelBase::axes_; \ - using ReduceKernelBase::noop_with_empty_axes_; \ - using ReduceKernelBase::keepdims_; \ - ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ - std::vector axes(axes_.size()); \ - if (axes_.size() > 0) { \ - std::transform(axes_.begin(), axes_.end(), axes.begin(), \ - [](int64_t axis) { return gsl::narrow_cast(axis); }); \ - } \ - JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ - "keepDims" : !!$1, \ - "noopWithEmptyAxes" : !!$2, \ - "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ - }), \ - static_cast(keepdims_), \ - static_cast(noop_with_empty_axes_), \ - gsl::narrow_cast(axes.size()), \ - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ - } \ +#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ + template \ + class ReduceKernel : public JsKernel, public ReduceKernelBase { \ + public: \ + using ReduceKernelBase::axes_; \ + using ReduceKernelBase::noop_with_empty_axes_; \ + using ReduceKernelBase::keepdims_; \ + ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ + std::vector axes(axes_.size()); \ + if (axes_.size() > 0) { \ + std::transform(axes_.begin(), axes_.end(), axes.begin(), \ + [](int64_t axis) { return gsl::narrow_cast(axis); }); \ + } \ + JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ + "keepDims" : !!$1, \ + "noopWithEmptyAxes" : !!$2, \ + "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ + }), \ + static_cast(keepdims_), \ + static_cast(noop_with_empty_axes_), \ + gsl::narrow_cast(axes.size()), \ + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ + } \ }; JSEP_DEFINE_REDUCE_KERNEL(ReduceMax); diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h index 65854222ba988..4b1c288ae3015 100644 --- a/onnxruntime/core/providers/js/operators/resize.h +++ b/onnxruntime/core/providers/js/operators/resize.h @@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase { }), static_cast(antialias_), gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2, + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2, resize_coordinate_transformation_mode.c_str(), static_cast(cubic_coeff_a_), static_cast(exclude_outside_), diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h index 6792997025d65..989adabf029a5 100644 --- a/onnxruntime/core/providers/js/operators/slice.h +++ b/onnxruntime/core/providers/js/operators/slice.h @@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase { "ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [], "axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}), gsl::narrow_cast(starts.size()), - reinterpret_cast((starts.size() > 0) ? starts.data() : nullptr) >> 2, + JSEP_HEAP_PTR((starts.size() > 0) ? starts.data() : nullptr) >> 2, gsl::narrow_cast(ends.size()), - reinterpret_cast((ends.size() > 0) ? ends.data() : nullptr) >> 2, + JSEP_HEAP_PTR((ends.size() > 0) ? ends.data() : nullptr) >> 2, gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/split.h b/onnxruntime/core/providers/js/operators/split.h index cfacc1aa6a363..1c1874e5aa98e 100644 --- a/onnxruntime/core/providers/js/operators/split.h +++ b/onnxruntime/core/providers/js/operators/split.h @@ -53,7 +53,7 @@ class Split : public JsKernel, public SplitBase { static_cast(axis_), static_cast(num_outputs_), gsl::narrow_cast(split_sizes.size()), - reinterpret_cast((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index 311badbde0d11..dae442b9f5a13 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -27,7 +27,7 @@ class Transpose final : public JsKernel, public TransposeBase { gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), // $2: index to HEAP32 of the first int32 element. calculated from right shift memory // address by 2 - reinterpret_cast(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); + JSEP_HEAP_PTR(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 28857d3002ede..5aec81af15761 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -139,7 +139,7 @@ static const InlinedHashMap op_map = { {"ArgMax", {"argMax", false}}, {"ArgMin", {"argMin", false}}, {"AveragePool", {"averagePool2d", true}}, - {"BatchNormalization", {"meanVarianceNormalization", false}}, + {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, {"Ceil", {"ceil", true}}, {"Clip", {"clamp", true}}, @@ -162,12 +162,11 @@ static const InlinedHashMap op_map = { {"GlobalLpPool", {"l2Pool2d", false}}, {"Greater", {"greater", false}}, {"GreaterOrEqual", {"greaterOrEqual", false}}, - {"GroupNormalization", {"meanVarianceNormalization", false}}, {"HardSigmoid", {"hardSigmoid", false}}, {"HardSwish", {"hardSwish", true}}, {"Identity", {"identity", false}}, - {"InstanceNormalization", {"meanVarianceNormalization", false}}, - {"LayerNormalization", {"meanVarianceNormalization", false}}, + {"InstanceNormalization", {"instanceNormalization", false}}, + {"LayerNormalization", {"layerNormalization", false}}, {"LeakyRelu", {"leakyRelu", true}}, {"Less", {"lesser", false}}, {"LessOrEqual", {"lesserOrEqual", false}}, @@ -179,7 +178,7 @@ static const InlinedHashMap op_map = { {"Min", {"min", true}}, {"Mul", {"mul", true}}, {"Neg", {"neg", true}}, - {"Not", {"not", false}}, + {"Not", {"logicalNot", false}}, {"Pad", {"pad", true}}, {"Pow", {"pow", false}}, {"PRelu", {"prelu", true}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index 756a838cc0c3e..4d2470dfe7deb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -27,8 +27,6 @@ class NormalizationOpBuilder : public BaseOpBuilder { const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; -// All normalization are based on layout NCHW. -// TODO: add support for NHWC. Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { @@ -61,49 +59,13 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(bias_shape == scale_shape, "The bias' shape should be equal to scale's shape."); } - std::vector new_scale_shape; - if (scale_size < rank) { - if (op_type == "BatchNormalization") { - scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), rank - 2, 1); - } else if (op_type == "LayerNormalization") { - // Align right with leading ones. - scale_shape.insert(scale_shape.begin(), rank - scale_size, 1); - } else if (op_type == "InstanceNormalization") { - // Insert ones before and after the channel dimension. - scale_shape.insert(scale_shape.begin(), 1); - ORT_RETURN_IF(scale_size != 1 || rank < 2, - "The scale size should be 1 and rank should be at least 2 for InstanceNorm."); - scale_shape.insert(scale_shape.end(), rank - scale_size - 1, 1); - } else if (op_type == "GroupNormalization") { - // The input will be reshaped to 3D later. So just insert ones before the channel and after. - scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), 1); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); - } + emscripten::val scale = model_builder.GetOperand(input_defs[1]->Name()); + options.set("scale", scale); - std::transform(scale_shape.cbegin(), scale_shape.cend(), - std::back_inserter(new_scale_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - emscripten::val reshape_scale = model_builder.GetOperand(input_defs[1]->Name()); - emscripten::val reshape_output_scale = - model_builder.GetBuilder().call("reshape", reshape_scale, emscripten::val::array(new_scale_shape)); - options.set("scale", reshape_output_scale); - - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - emscripten::val reshape_bias = model_builder.GetOperand(input_defs[2]->Name()); - emscripten::val reshape_output_bias = - model_builder.GetBuilder().call("reshape", reshape_bias, emscripten::val::array(new_scale_shape)); - options.set("bias", reshape_output_bias); - } - } else { - options.set("scale", model_builder.GetOperand(input_defs[1]->Name())); - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - options.set("bias", model_builder.GetOperand(input_defs[2]->Name())); - } + if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { + // Bias input exists, and bias's shape is the same as scale's shape. + emscripten::val bias = model_builder.GetOperand(input_defs[2]->Name()); + options.set("bias", bias); } NodeAttrHelper helper(node); @@ -114,56 +76,62 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(input_defs.size() == 5, "BatchNormalization requires five inputs."); emscripten::val mean = model_builder.GetOperand(input_defs[3]->Name()); emscripten::val variance = model_builder.GetOperand(input_defs[4]->Name()); - // Enlarge 1-D mean and variance to new scale shape. - emscripten::val reshape_mean = - model_builder.GetBuilder().call("reshape", mean, emscripten::val::array(new_scale_shape)); - emscripten::val reshape_variance = - model_builder.GetBuilder().call("reshape", variance, emscripten::val::array(new_scale_shape)); - - std::vector axes = {0}; - for (uint32_t i = 2; i < rank; i++) { - axes.push_back(i); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("axis", rank - 1); } - - options.set("axes", emscripten::val::array(axes)); - options.set("mean", reshape_mean); - options.set("variance", reshape_variance); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("batchNormalization", input, mean, variance, options); } else if (op_type == "LayerNormalization") { int64_t axis = helper.Get("axis", -1); axis = HandleNegativeAxis(axis, rank); std::vector axes(rank - SafeInt(axis)); - std::iota(axes.begin(), axes.end(), axis); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC && axis > 1) { + std::iota(axes.begin(), axes.end(), axis - 1); + } else { + std::iota(axes.begin(), axes.end(), axis); + } options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("layerNormalization", input, options); } else if (op_type == "InstanceNormalization") { - std::vector axes; - for (uint32_t i = 2; i < rank; i++) { - axes.emplace_back(i); + // WebNN spec only supports 4D input for instanceNormalization. + // Supports 3D input by prepending 1 size dimension. + // For models with dimensions greater than 4, they will be reshaped into 4D. + constexpr size_t webnn_shape_rank = 4; + if (input_shape.size() != webnn_shape_rank) { + std::vector new_shape; + new_shape.reserve(std::max(input_shape.size(), webnn_shape_rank)); + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(new_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + + size_t insertion_offset = (model_builder.GetPreferredLayout() == DataLayout::NHWC) ? 2 : 3; + ptrdiff_t excess_rank = new_shape.size() - webnn_shape_rank; + auto insertion_point = new_shape.begin() + insertion_offset; + if (input_shape.size() < webnn_shape_rank) { + // Pad the shape with extra 1's to satisfy WebNN v1's rank requirements. + new_shape.insert(insertion_point, -excess_rank, 1); + } else { + // Fold the extra range to fit within WebNN v1's rank requirements. + uint32_t sum = std::accumulate( + insertion_point, insertion_point + excess_rank + 1, 1, std::multiplies()); + new_shape.erase(insertion_point, insertion_point + excess_rank); + *insertion_point = sum; + } + input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + } + + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("layout", emscripten::val("nhwc")); + } + output = model_builder.GetBuilder().call("instanceNormalization", input, options); + // Reshape back to the original output shape for 3D input. + if (input_shape.size() != 4) { + std::vector output_shape; + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(output_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + output = model_builder.GetBuilder().call( + "reshape", output, emscripten::val::array(output_shape)); } - options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); - } else if (op_type == "GroupNormalization") { - ORT_RETURN_IF_NOT(helper.HasAttr("num_groups"), "GroupNormalization num_group must be provided."); - int32_t group_count = helper.Get("num_groups", -1); - std::vector orig_shape, new_shape; - std::transform(input_shape.cbegin(), input_shape.cend(), - std::back_inserter(orig_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - // Add N and Group. - ORT_RETURN_IF_NOT(rank >= 2, "Input for GroupNormalization cannot be a scalar or 1D"); - new_shape.emplace_back(SafeInt(input_shape[0])); - new_shape.emplace_back(SafeInt(group_count)); - - ORT_RETURN_IF_NOT(group_count > 0 && input_shape[1] % group_count == 0, - "GroupNormalization num_group must be divisible by group."); - new_shape.emplace_back(SafeInt(std::reduce(input_shape.begin() + 2, input_shape.end(), - input_shape[1] / group_count, std::multiplies()))); - // Input will be reshaped to (N, group count, channels per group x D1 x D2 ... Dn) and recovered after normalization. - options.set("axes", emscripten::val::array(std::vector{2})); - output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", output, options); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(orig_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); } @@ -214,7 +182,6 @@ void CreateNormalizationOpBuilder(const std::string& op_type, OpBuilderRegistrat constexpr static std::string_view op_types[] = { "BatchNormalization", - "GroupNormalization", "InstanceNormalization", "LayerNormalization", }; diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea9fc379ee23f..186d1e7c1035a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. @@ -161,7 +161,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -181,9 +181,18 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; - return false; + // WebNN CPU backend only supports "linear" mode. + // WebNN GPU backend only supports "linear" and "nearest" modes. + if (device_type == WebnnDeviceType::CPU) { + if (!is_linear_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; + return false; + } + } else { + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; + return false; + } } const auto exclude_outside = helper.Get("exclude_outside", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc index 129532e91f5a0..e6c5cf24080cd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc @@ -48,7 +48,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } else if (op_type == "Neg") { output = model_builder.GetBuilder().call("neg", input); } else if (op_type == "Not") { - output = model_builder.GetBuilder().call("not", input); + output = model_builder.GetBuilder().call("logicalNot", input); } else if (op_type == "Reciprocal") { output = model_builder.GetBuilder().call("reciprocal", input); } else if (op_type == "Sin") { diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 463317a4dafda..613771eda71fe 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -111,7 +111,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { { // Normalization CreateNormalizationOpBuilder("BatchNormalization", op_registrations); - CreateNormalizationOpBuilder("GroupNormalization", op_registrations); CreateNormalizationOpBuilder("InstanceNormalization", op_registrations); CreateNormalizationOpBuilder("LayerNormalization", op_registrations); } diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 9f90196e301e5..6293bcbbf95bd 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -4,10 +4,11 @@ # license information. # -------------------------------------------------------------------------- +from __future__ import annotations + import argparse import logging import os -from typing import List, Tuple import numpy as np import numpy.typing as npt @@ -26,16 +27,24 @@ class MatMul4BitsQuantizer: """Perform 4b quantization of constant MatMul weights""" - def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=None): + def __init__( + self, + model: ModelProto, + block_size: int, + is_symmetric: bool, + accuracy_level: int | None = None, + nodes_to_exclude: list[str] | None = None, + ): if nodes_to_exclude is None: nodes_to_exclude = [] self.model = ONNXModel(model) self.block_size = block_size self.is_symmetric = is_symmetric + self.accuracy_level = accuracy_level self.nodes_to_exclude = set(nodes_to_exclude) @staticmethod - def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]: + def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: for gid in range(len(graph_path) - 1, -1, -1): graph = graph_path[gid] for tensor in graph.initializer: @@ -66,7 +75,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: return (packed, scales, zero_point) - def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto: + def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" if node.op_type != "MatMul": @@ -113,6 +122,8 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) kwargs["N"] = cols kwargs["bits"] = 4 kwargs["block_size"] = self.block_size + if self.accuracy_level is not None: + kwargs["accuracy_level"] = self.accuracy_level matmul_q4_node = onnx.helper.make_node( "MatMulNBits", @@ -127,7 +138,7 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) return matmul_q4_node - def _process_subgraph(self, graph_stack: List[GraphProto]): + def _process_subgraph(self, graph_stack: list[GraphProto]): new_nodes = [] graph = graph_stack[-1] @@ -201,6 +212,14 @@ def parse_args(): type=bool, help="Indicate whether to quantize the model symmetrically", ) + parser.add_argument( + "--accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. " + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) parser.add_argument("-v", "--verbose", required=False, action="store_true") parser.set_defaults(verbose=False) parser.add_argument( @@ -228,6 +247,12 @@ def parse_args(): raise Exception(f"file {output_model_path} already exists") model = onnx.load(input_model_path) - quant = MatMul4BitsQuantizer(model, args.block_size, args.symmetric, nodes_to_exclude=args.nodes_to_exclude) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + nodes_to_exclude=args.nodes_to_exclude, + ) quant.process() quant.model.save_model_to_file(output_model_path, True) diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index e694b5050cc8c..bc09b52574a27 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import argparse import logging import os import shutil from itertools import chain -from typing import List import onnx import torch @@ -21,11 +22,12 @@ from onnxruntime import quantization as ort_quantization from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer +torch_export_onnx_opset_version = 14 logger = logging.getLogger("") init_dist() -def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in input_names: @@ -42,7 +44,7 @@ def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): return dynamic_axes -def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_with_past_kv_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -65,7 +67,7 @@ def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: Li return dynamic_axes -def get_merged_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_merged_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -229,7 +231,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -288,7 +290,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -368,7 +370,7 @@ def run_torchscript_merged_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -412,7 +414,7 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str, remov def convert_to_float16( - args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 + args: argparse.Namespace, config: AutoConfig, old_paths: list[str], rank: int = 0, world_size: int = 1 ): decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx") decoder_with_past_model_fp16_path = os.path.join( @@ -635,7 +637,7 @@ def get_args(): help="Run a specific quantization algorithm (blockwise for int4, smooth_quant for int8, quantize_dynamic for int8). Blockwise is recommended. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.", ) - blockwise_group = parser.add_argument_group("4-bit quantization") + blockwise_group = parser.add_argument_group("blockwise (4-bit quantization)") blockwise_group.add_argument( "--block_size", @@ -645,6 +647,15 @@ def get_args(): help="Block size to quantize with. See https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py for details.", ) + blockwise_group.add_argument( + "--int4_accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. " + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) + smooth_quant_group = parser.add_argument_group("smooth_quant (8-bit quantization)") smooth_quant_group.add_argument( @@ -937,7 +948,13 @@ def main(): for fp_path, int4_path in zip(old_paths, new_paths): if os.path.exists(fp_path): model = onnx.load_model(fp_path, load_external_data=True) - quant = MatMul4BitsQuantizer(model, args.block_size, is_symmetric=True, nodes_to_exclude=[]) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=True, + accuracy_level=args.int4_accuracy_level, + nodes_to_exclude=[], + ) quant.process() quant.model.save_model_to_file(int4_path, use_external_data_format=True) del model diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index bae1ae82e8f7e..a329b73259dda 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from __future__ import annotations import numpy as np import torch @@ -235,7 +235,7 @@ def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, u # Convert list of past_key_values to dict of past_key and past_value -def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]]): +def flatten_past_kv_inputs(past_key_values: list[tuple[torch.Tensor, torch.Tensor]]): past_kv = {} for i, (past_k, past_v) in enumerate(past_key_values): past_kv[f"past_key_values.{i}.key"] = past_k.detach().cpu().numpy() diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index 418a65325c8f0..25d7519769604 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import argparse import logging import os import time -from typing import List import numpy as np import torch @@ -139,7 +140,7 @@ def verify_parity( return kv_cache_ortvalues -def get_args(argv: List[str]): +def get_args(argv: list[str]): parser = argparse.ArgumentParser() parser.add_argument( @@ -232,7 +233,7 @@ def get_args(argv: List[str]): return args -def main(argv: List[str] = []): # noqa: B006 +def main(argv: list[str] = []): # noqa: B006 args = get_args(argv) setup_logger(args.verbose) logger.info(f"Arguments: {args}") diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index d9aff36c4ad34..f6fcbd08ff03a 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -9,11 +9,6 @@ parameters: type: string default: qnn-v2.17.0.231124_win -- name: ort_package_version - displayName: OnnxRuntime Nuget package version - type: string - default: 1.15.0 - - name: build_config displayName: Build Configuration type: string @@ -47,7 +42,7 @@ jobs: buildArch: x64 setVcvars: true ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' - commonBuildArgs: '--compile_no_warning_as_error --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + commonBuildArgs: '--compile_no_warning_as_error --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' steps: - template: templates/set-version-number-variables-step.yml @@ -90,7 +85,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package x64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\x64 @@ -130,7 +125,7 @@ jobs: displayName: 'Generate CMake Configuration for arm64' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--update --arm64 --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + arguments: '--update --arm64 --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' - task: VSBuild@1 displayName: 'Build onnxruntime arm64' @@ -178,7 +173,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package arm64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\arm64 diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index fdb9238071c9e..eee38ac04b355 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -59,15 +59,14 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - # Some unit tests crash on A10 GPUs. So this job still needs to use T4. - MachinePool: onnxruntime-Win2022-GPU-T4 + MachinePool: onnxruntime-Win2022-GPU-A10 isTraining: true - stage: dml diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 85d738d2167e1..6c71631368822 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -65,9 +65,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 conda update --all && \ rm ~/miniconda.sh && conda clean -ya -# Conda base patch -RUN pip install cryptography==41.0.4 - # Create migraphx-ci environment ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci ENV CONDA_DEFAULT_ENV migraphx-ci diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 29048b79d4b81..4db9df80ed187 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -67,9 +67,6 @@ ENV CONDA_DEFAULT_ENV rocm-ci RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9 ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH} -# Conda base patch -RUN pip install cryptography==41.0.4 - # Enable rocm-ci environment SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"]