diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 3989355915568..0044d6cb9691c 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.20.0 +1.20.1 diff --git a/csharp/OnnxRuntime.CSharp.proj b/csharp/OnnxRuntime.CSharp.proj index 95207d158affe..6779fd60bcd0a 100644 --- a/csharp/OnnxRuntime.CSharp.proj +++ b/csharp/OnnxRuntime.CSharp.proj @@ -64,13 +64,6 @@ CMake creates a target to this project - - - - - - - @@ -153,7 +146,7 @@ CMake creates a target to this project $(BaseTargets);$(MobileTargets) + + + true + true + true + + + true + true + true + true + + $(ProjectDir)..\..\.. + + + true + + + Microsoft.ML.OnnxRuntime Microsoft.ML.OnnxRuntime @@ -66,54 +93,31 @@ Commit: $(BUILD_SOURCEVERSION) Build: https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=$(BUILD_BUILDID) + README.md + LICENSE.txt + + + true + + true + ..\..\OnnxRuntime.snk + + $(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb + AnyCPU;x86 default true - true - ..\..\OnnxRuntime.snk - - $(ProjectDir)..\..\.. - $(OnnxRuntimeRoot)\csharp x64 false false portable - - true - - - true - - - - - false - $(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb Debug;Release;RelWithDebInfo - - true - true - true - - - true - true - true - - - $(OnnxRuntimeCsharpRoot)\..\build\Linux - $(OnnxRuntimeBuildDirectory)\$(Configuration) - - - - $(OnnxRuntimeCsharpRoot)\..\build\Windows $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) - - - - $(OnnxRuntimeCsharpRoot)\..\build\MacOS + $(OnnxRuntimeBuildDirectory)\$(Configuration) - + $(OrtConstants);__MOBILE__ @@ -155,12 +148,12 @@ $(OrtConstants);__ANDROID__ - + $(OrtConstants);__IOS__ - - + + $(OrtConstants);__ENABLE_COREML__ @@ -178,128 +171,6 @@ $(DefineConstants);$(OrtConstants) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj index 60d18ad31e811..07ca7fe7c64bf 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj @@ -1,16 +1,19 @@  + + true + true + true + + $(ProjectDir)..\..\.. + netstandard2.0;net8.0 false - $(ProjectDir)..\.. AnyCPU bin\$(Configuration)\ - true - true - true - $(OnnxRuntimeCsharpRoot)\..\cmake\external\onnx + $(OnnxRuntimeRoot)\cmake\external\onnx 8981 @@ -22,30 +25,22 @@ ..\..\OnnxRuntime.snk Debug;Release;RelWithDebInfo + Microsoft.ML.OnnxRuntime.Tests Microsoft.ML.OnnxRuntime.Tests.Common - - - $(OnnxRuntimeCsharpRoot)\..\build\Linux - $(OnnxRuntimeBuildDirectory)\$(Configuration) - $(OnnxRuntimeBuildDirectory)\$(Configuration)\external\protobuf\cmake - $(ProtocDirectory)\protoc - - - - $(OnnxRuntimeCsharpRoot)\..\build\Windows - $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) $(OnnxRuntimeBuildDirectory)\$(Configuration)\external\protobuf\cmake\$(Configuration) $(ProtocDirectory)\protoc.exe + + $(OnnxRuntimeBuildDirectory)\$(Configuration)\external\protobuf\cmake + $(ProtocDirectory)\protoc + + - - $(OnnxRuntimeCsharpRoot)\..\build\MacOS - $(OnnxRuntimeBuildDirectory)\$(Configuration) $(OnnxRuntimeBuildDirectory)\$(Configuration)\external\protobuf\cmake $(ProtocDirectory)\protoc @@ -102,28 +97,6 @@ - - - - PreserveNewest - false - - - - PreserveNewest - false - - - - PreserveNewest - false - - - @@ -132,16 +105,20 @@ - + - + + - + + @@ -152,20 +129,20 @@ + - TestData\%(Filename)%(Extension) + TestData\%(Filename)%(Extension) - - TestData\overridable_initializer.onnx + + TestData\overridable_initializer.onnx - - TestData\capi_symbolic_dims.onnx + + TestData\capi_symbolic_dims.onnx - diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props new file mode 100644 index 0000000000000..3daab21dbcbac --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props @@ -0,0 +1,171 @@ + + + + + true + true + true + + + true + true + true + true + + + false + 1.20.0-dev-20241007 + + + + + + + + + + + + + + + + + + + + + + $(OnnxRuntimeRoot)\build\Windows + $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) + + + + $(OnnxRuntimeRoot)\build\Linux + $(OnnxRuntimeBuildDirectory)\$(Configuration) + + + + $(OnnxRuntimeRoot)\build\MacOS + $(OnnxRuntimeBuildDirectory)\$(Configuration) + + + + $(OnnxRuntimeRoot)\build\Android + $(OnnxRuntimeBuildDirectory)\$(Configuration) + + + + $(OnnxRuntimeRoot)\build\iOS + iPhoneSimulator + $(Platform.ToLower()) + $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration)-$(PlatformLower) + + + + $(OnnxRuntimeRoot)\build\macOS + $(OnnxRuntimeBuildDirectory)\$(Configuration) + + + + + PreserveNewest + true + + + + + + PreserveNewest + false + + + + + + PreserveNewest + false + + + + + + libs\libonnxruntime.so + + + + + + libs\libonnxruntime.dylib + Dynamic + True + True + + + + + + libs\libonnxruntime.dylib + Dynamic + True + True + + + + + + + + + false + true + false + true + false + true + + + + + + + + + + + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Tensors/TensorTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Tensors/TensorTests.cs index 27cde1dbe9ed8..46dd292e8514e 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Tensors/TensorTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Tensors/TensorTests.cs @@ -2180,10 +2180,13 @@ public void GetArrayString(TensorConstructor constructor) {22,23} } }"; + // remove \r so the newlines are just \n on all platforms + expected = expected.Replace("\r", ""); + var actual= tensor.GetArrayString().Replace("\r", ""); - Assert.Equal(expected, tensor.GetArrayString()); + Assert.Equal(expected, actual); - var expectedNoSpace = expected.Replace(Environment.NewLine, "").Replace(" ", ""); + var expectedNoSpace = expected.Replace("\n", "").Replace(" ", ""); Assert.Equal(expectedNoSpace, tensor.GetArrayString(false)); } diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj index 210a04d78f107..e07448daeea7f 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj @@ -1,306 +1,125 @@  - - - true - true - true - true - $(ProjectDir)..\..\.. - - - - - net8.0-android;net8.0-ios;net8.0-maccatalyst - $(TargetFrameworks);net8.0-windows10.0.19041.0 - - - - - Exe - Microsoft.ML.OnnxRuntime.Tests.MAUI - true - true - enable - enable - true - - 8002 - - - $(DefineConstants);INCLUDE_FAILING_TESTS - $(DefineConstants);MODE_NON_INTERACTIVE_VISUAL - $(DefineConstants);MODE_XHARNESS - - - Microsoft.ML.OnnxRuntime.Tests.MAUI - - - ORT.CSharp.Tests.MAUI - - - 1.0 - 1 - - 15.0 - 13.1 - 30.0 - 10.0.17763.0 - 10.0.17763.0 - - true - ..\..\OnnxRuntime.snk - - - false - - - - - $(OnnxRuntimeRoot)\build\microsoft.ml.onnxruntime.1.18.1\runtimes - - true - - - - $(OnnxRuntimeRoot)\build\Windows - $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) - - $(PrebuiltRuntimesDir)\win-x64\native - - - $(OnnxRuntimeRoot)\build\Android - $(OnnxRuntimeBuildDirectory)\$(Configuration) - $(PrebuiltRuntimesDir)\android\native\onnxruntime.aar - - - $(OnnxRuntimeRoot)\build\iOS - iPhoneSimulator - $(Platform.ToLower()) - $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration)-$(PlatformLower) - $(PrebuiltRuntimesDir)\ios\native\onnxruntime.xcframework - - - $(OnnxRuntimeRoot)\build\macOS - $(OnnxRuntimeBuildDirectory)\$(Configuration) - $(PrebuiltRuntimesDir)\ios\native\onnxruntime.xcframework - - - - - - PreserveNewest - true - - - - - PreserveNewest - true - - - - - PreserveNewest - false - - - PreserveNewest - false - - - PreserveNewest - false - - - PreserveNewest - false - - - PreserveNewest - false - - - PreserveNewest - false - - - - - - - libs\libonnxruntime.so - - - - - - - - - - libs\libonnxruntime.dylib - Dynamic - True - True - - - - - Framework - True - True - - - - - - - libs\libonnxruntime.dylib - Dynamic - True - True - - - - - Framework - True - True - - - - - - - false - true - false - true - false - true - - false - true - false - true - false - true - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - InferenceTest.cs - - - OrtIoBindingAllocationTest.cs - - - TensorTests.cs - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <_VisualStudioTestRunnerFiles Include="@(PackagingOutputs)" Condition="$([System.String]::Copy('%(PackagingOutputs.FullPath)').Contains('xunit.runner.visualstudio'))" /> - - - + + $(ProjectDir)..\..\.. + + + + + + + net8.0-android;net8.0-ios;net8.0-maccatalyst + $(TargetFrameworks);net8.0-windows10.0.19041.0 + + + + + Exe + Microsoft.ML.OnnxRuntime.Tests.MAUI + true + true + enable + enable + true + + 8002 + + + $(DefineConstants);INCLUDE_FAILING_TESTS + $(DefineConstants);MODE_NON_INTERACTIVE_VISUAL + $(DefineConstants);MODE_XHARNESS + + + Microsoft.ML.OnnxRuntime.Tests.MAUI + + + ORT.CSharp.Tests.MAUI + + + 1.0 + 1 + + 15.0 + 13.1 + 30.0 + 10.0.17763.0 + 10.0.17763.0 + + true + ..\..\OnnxRuntime.snk + + + + + + + + + + + + + + + + + + + + + + + + InferenceTest.cs + + + OrtIoBindingAllocationTest.cs + + + TensorTests.cs + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <_VisualStudioTestRunnerFiles + Include="@(PackagingOutputs)" + Condition="$([System.String]::Copy('%(PackagingOutputs.FullPath)').Contains('xunit.runner.visualstudio'))" /> + + + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/ReadMe.md b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/ReadMe.md new file mode 100644 index 0000000000000..07cb5fe7c9b3d --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/ReadMe.md @@ -0,0 +1,9 @@ +The MAUI test project can be optionally used with a pre-built ONNX Runtime native nuget package (Microsoft.ML.OnnxRuntime). + +To do so, specify the `UsePrebuiltNativePackage` and `CurrentOnnxRuntimeVersion` properties when building the project. These can be set via the command-line or as environment variables. + +For example: + +```cmd +dotnet build csharp\test\Microsoft.ML.OnnxRuntime.Tests.MAUI\Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj --property:UsePrebuiltNativePackage=true --property:CurrentOnnxRuntimeVersion=1.19.2 --source directory_containing_native_nuget_package --source https://api.nuget.org/v3/index.json +``` diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj index b822c999e4d39..a8abcd2b4aa1c 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj @@ -1,4 +1,9 @@  + + $(ProjectDir)..\..\.. + + + net8.0 @@ -6,9 +11,7 @@ $(ProjectDir)..\.. AnyCPU;x86 bin\$(Configuration)\ - true - true - true + $(OnnxSourceDirectory)\onnx default @@ -35,19 +38,19 @@ - $(OnnxRuntimeCsharpRoot)\..\build\Linux + $(OnnxRuntimeRoot)\build\Linux $(OnnxRuntimeBuildDirectory)\$(Configuration) - $(OnnxRuntimeCsharpRoot)\..\build\Windows + $(OnnxRuntimeRoot)\build\Windows $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) - $(OnnxRuntimeCsharpRoot)\..\build\MacOS + $(OnnxRuntimeRoot)\build\MacOS $(OnnxRuntimeBuildDirectory)\$(Configuration) @@ -58,15 +61,14 @@ PreserveNewest @@ -74,45 +76,39 @@ PreserveNewest false PreserveNewest false - - PreserveNewest - false - - + PreserveNewest false - - PreserveNewest - false - - + + PreserveNewest false - + + PreserveNewest false - + + PreserveNewest false + @@ -131,7 +127,7 @@ - + PreserveNewest false diff --git a/docs/python/README.rst b/docs/python/README.rst index 5a45bf6cef8ed..82c2fbde1d1d8 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime 12000400, '1.20.0-rc.1 ' -> 12000301 -// '1.20.0-beta.1' -> 12000201, '1.20.0-alpha.1' -> 12000101 +// for example '1.20.1' -> 12000400, '1.20.1-rc.1 ' -> 12000301 +// '1.20.1-beta.1' -> 12000201, '1.20.1-alpha.1' -> 12000101 def getVersionCode(String version) { String[] versionAndRelSufx = version.split('-') String[] codes = versionAndRelSufx[0].split('\\.') diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts index 450ae2d06e638..784b80f603acf 100644 --- a/js/common/lib/version.ts +++ b/js/common/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.20.0'; +export const version = '1.20.1'; diff --git a/js/common/package-lock.json b/js/common/package-lock.json index 865fa860e98ad..03b8b4f0cc9a7 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-common", - "version": "1.20.0", + "version": "1.20.1", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-common", - "version": "1.20.0", + "version": "1.20.1", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/common/package.json b/js/common/package.json index 9c941f6486ea9..c483b41dfdce9 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -2,7 +2,7 @@ "license": "MIT", "type": "module", "name": "onnxruntime-common", - "version": "1.20.0", + "version": "1.20.1", "repository": { "url": "https://github.com/Microsoft/onnxruntime.git", "type": "git" diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts index 450ae2d06e638..784b80f603acf 100644 --- a/js/node/lib/version.ts +++ b/js/node/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.20.0'; +export const version = '1.20.1'; diff --git a/js/node/package-lock.json b/js/node/package-lock.json index a0fc445c16dda..633c7cd62f9f6 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-node", - "version": "1.20.0", + "version": "1.20.1", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-node", - "version": "1.20.0", + "version": "1.20.1", "hasInstallScript": true, "license": "MIT", "os": [ @@ -29,7 +29,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.20.0", + "version": "1.20.1", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/node/package.json b/js/node/package.json index 4964d0fc3fd4d..3842df7edf522 100644 --- a/js/node/package.json +++ b/js/node/package.json @@ -13,7 +13,7 @@ 3 ] }, - "version": "1.20.0", + "version": "1.20.1", "dependencies": { "onnxruntime-common": "file:../common", "tar": "^7.0.1" diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts index 450ae2d06e638..784b80f603acf 100644 --- a/js/react_native/lib/version.ts +++ b/js/react_native/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.20.0'; +export const version = '1.20.1'; diff --git a/js/react_native/package.json b/js/react_native/package.json index 20b5d02ff233e..1acfd69ec84f2 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -36,7 +36,7 @@ "registry": "https://registry.npmjs.org/" }, "source": "lib/index", - "version": "1.20.0", + "version": "1.20.1", "main": "dist/commonjs/index", "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md", "files": [ diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock index 99c03d2e7bf02..c9eba883944d7 100644 --- a/js/react_native/yarn.lock +++ b/js/react_native/yarn.lock @@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2: mimic-fn "^2.1.0" "onnxruntime-common@file:../common": - version "1.20.0" + version "1.20.1" open@^6.2.0: version "6.4.0" diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts index 450ae2d06e638..784b80f603acf 100644 --- a/js/web/lib/version.ts +++ b/js/web/lib/version.ts @@ -4,4 +4,4 @@ // This file is generated by /js/scripts/update-version.ts // Do not modify file content manually. -export const version = '1.20.0'; +export const version = '1.20.1'; diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 2eb79a2850bea..7f289cc914d42 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.20.0", + "version": "1.20.1", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.20.0", + "version": "1.20.1", "license": "MIT", "dependencies": { "flatbuffers": "^1.12.0", @@ -51,7 +51,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.20.0", + "version": "1.20.1", "license": "MIT", "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/web/package.json b/js/web/package.json index d770499adada4..d5dba18c14a59 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -7,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.20.0", + "version": "1.20.1", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^1.12.0", diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 0e9a924bde4bb..cded663706ff6 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -7,7 +7,7 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime `_ or the `Github project `_. """ -__version__ = "1.20.0" +__version__ = "1.20.1" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 67b4950af73bf..3e70f848675cb 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -46,24 +46,13 @@ void ComputeJob( const T* gamma_data, const T* beta_data, const T* bias_data, - IAllocatorUniquePtr& skip_float_uptr, - IAllocatorUniquePtr& gamma_float_uptr, - IAllocatorUniquePtr& beta_float_uptr, - IAllocatorUniquePtr& bias_float_uptr, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, float epsilon, bool simplified, T* output_data, - T* skip_input_bias_add_output_data, - AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(skip_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(gamma_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(beta_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(alloc); - + T* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -110,13 +99,11 @@ void ComputeJob( void ComputeJob( const MLFloat16* input_data, const MLFloat16* skip_data, - const MLFloat16* gamma_data, - const MLFloat16* beta_data, - const MLFloat16* bias_data, - IAllocatorUniquePtr& skip_float_uptr, - IAllocatorUniquePtr& gamma_float_uptr, - IAllocatorUniquePtr& beta_float_uptr, - IAllocatorUniquePtr& bias_float_uptr, + const float* prepacked_skip_fp32_data, + const float* gamma_float_ptr, + const float* beta_float_ptr, + const float* bias_float_ptr, + float* output_float_ptr, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -127,7 +114,6 @@ void ComputeJob( AllocatorPtr alloc) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; - const MLFloat16* p_skip = skip_data + (offset % skip_size); MLFloat16* p_output = output_data + offset; MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; @@ -138,26 +124,19 @@ void ComputeJob( IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - if (!skip_float_uptr) { + IAllocatorUniquePtr skip_float_uptr = nullptr; + if (prepacked_skip_fp32_data == nullptr && skip_data) { + const MLFloat16* p_skip = skip_data + (offset % skip_size); skip_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems); } - if (bias_data && !bias_float_uptr) { - bias_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems); - } - - IAllocatorUniquePtr output_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - float* output_float_ptr = output_float_uptr.get(); - const float* input_float_ptr = input_float_uptr.get(); - const float* skip_float_ptr = skip_float_uptr.get(); - const float* bias_float_ptr = bias_float_uptr.get(); + const float* skip_float_ptr = prepacked_skip_fp32_data ? prepacked_skip_fp32_data : skip_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { float val = input_float_ptr[h] + skip_float_ptr[h]; - if (bias_float_uptr) { + if (bias_float_ptr) { val += bias_float_ptr[h]; } @@ -177,22 +156,10 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } - if (!gamma_float_uptr) { - gamma_float_uptr = std::move(input_float_uptr); // overwrite input with gamma values, since they have the same size - MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems); - } - - if (beta_data && !beta_float_uptr) { - beta_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems); - } - - const float* gamma_float_ptr = gamma_float_uptr.get(); - const float* beta_float_ptr = beta_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { if (simplified) { output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h]; - } else if (nullptr == beta_float_uptr) { + } else if (nullptr == beta_float_ptr) { output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h]; } else { output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h]; @@ -218,7 +185,12 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I template SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) - : OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) { + : OpKernel(op_kernel_info), + prepacked_skip_fp32_size_(0), + prepacked_skip_fp32_data_(nullptr), + prepacked_gamma_fp32_data_(nullptr), + prepacked_beta_fp32_data_(nullptr), + prepacked_bias_fp32_data_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -226,10 +198,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) template Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); - const Tensor* skip = p_ctx->Input(1); - const Tensor* gamma = p_ctx->Input(2); - const Tensor* beta = p_ctx->Input(3); - const Tensor* bias = p_ctx->Input(4); + const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); + const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); + const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); + const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -238,19 +210,21 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { size_t input_dims_size = input_dims.size(); int hidden_size = static_cast(input_dims[input_dims_size - 1]); - ORT_RETURN_IF_ERROR(onnxruntime::contrib::skip_layer_norm_helper::CheckInputs(input, - skip, - gamma, - beta, - bias, - hidden_size, - input_dims_size)); + ORT_RETURN_IF_ERROR(skip_layer_norm_helper::CheckPotentiallyPrepackedInputs(input, + skip, + gamma, + beta, + bias, + hidden_size, + input_dims_size, + prepacked_skip_fp32_data_ != nullptr, + prepacked_gamma_fp32_data_ != nullptr)); int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = skip->Data(); - const T* gamma_data = gamma->Data(); + const T* skip_data = skip == nullptr ? nullptr : skip->Data(); + const T* gamma_data = gamma == nullptr ? nullptr : gamma->Data(); const T* beta_data = beta == nullptr ? nullptr : beta->Data(); const T* bias_data = bias == nullptr ? nullptr : bias->Data(); @@ -259,17 +233,53 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { // For inferencing, we support one more optional output which is the sum of the input and skip tensors T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData(); - const int64_t& skip_size = skip->Shape().Size(); + const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_; AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + IAllocatorUniquePtr output_fp32; + IAllocatorUniquePtr gamma_fp32; + IAllocatorUniquePtr beta_fp32; + IAllocatorUniquePtr bias_fp32; + + if constexpr (std::is_same_v) { + const size_t num_elems = static_cast(hidden_size); + + output_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + + if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { + gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); + } + + if (prepacked_beta_fp32_data_ == nullptr && beta_data) { + beta_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(beta_data, beta_fp32.get(), num_elems); + } + + if (prepacked_bias_fp32_data_ == nullptr && bias_data) { + bias_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems); + } + } + concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_, - bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data, alloc); + if constexpr (std::is_same_v) { + ComputeJob(input_data, skip_data, + prepacked_skip_fp32_data_.get(), + prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), + prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), + prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), + output_fp32.get(), + task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, + skip_input_bias_add_output_data, alloc); + } else { + ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, + epsilon_, simplified, output_data, skip_input_bias_add_output_data); + } }, 0); @@ -283,13 +293,14 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx is_packed = false; if (input_idx == 1) { // skip - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, skip_fp32_, is_packed); + prepacked_skip_fp32_size_ = tensor.Shape().Size(); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); } else if (input_idx == 2) { // gamma - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, gamma_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); } else if (input_idx == 3) { // beta - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, beta_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); } else if (input_idx == 4) { // bias - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index 08e2276c3d9d5..4a350fdcc2220 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -21,10 +21,11 @@ class SkipLayerNorm final : public OpKernel { private: float epsilon_; - mutable IAllocatorUniquePtr skip_fp32_; - mutable IAllocatorUniquePtr gamma_fp32_; - mutable IAllocatorUniquePtr beta_fp32_; - mutable IAllocatorUniquePtr bias_fp32_; + int64_t prepacked_skip_fp32_size_; + IAllocatorUniquePtr prepacked_skip_fp32_data_; + IAllocatorUniquePtr prepacked_gamma_fp32_data_; + IAllocatorUniquePtr prepacked_beta_fp32_data_; + IAllocatorUniquePtr prepacked_bias_fp32_data_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h index 6271f822287e6..4c901f5650dbd 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h @@ -11,14 +11,10 @@ namespace onnxruntime { namespace contrib { namespace skip_layer_norm_helper { +namespace { + template -Status CheckInputs(const T* input, - const T* skip, - const T* gamma, - const T* beta, - const T* bias, - int hidden_size_check, - size_t input_dims_size_check) { +Status CheckSkip(const T* input, const T* skip, size_t input_dims_size_check) { const auto& input_dims_check = input->Shape().GetDims(); const auto& skip_dims_check = skip->Shape().GetDims(); size_t skip_dims_size_check = skip_dims_check.size(); @@ -33,49 +29,150 @@ Status CheckInputs(const T* input, "skip is expected to have same shape as input or, a batch size of 1 or no batch size when input has 3 dimensions"); } - if (input_dims_size_check != 3 && input_dims_size_check != 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); - } - if (skip_dims_check[skip_dims_size_check - 1] != input_dims_check[input_dims_size_check - 1] || skip_dims_check[skip_dims_size_check - 2] != input_dims_check[input_dims_size_check - 2]) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "last two dimensions of skip needs to be same as input"); } + return Status::OK(); +} + +template +Status CheckGamma(const T* gamma, int hidden_size_check) { const auto& gamma_dims = gamma->Shape().GetDims(); + if (gamma_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "gamma is expected to have 1 dimension, got ", gamma_dims.size()); } + if (gamma_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of gamma and input does not match"); } + return Status::OK(); +} + +template +Status CheckBeta(const T* beta, int hidden_size_check) { if (nullptr != beta) { const auto& beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "beta is expected to have 1 dimension, got ", beta_dims.size()); } + if (beta_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of beta and input does not match"); } } + return Status::OK(); +} + +template +Status CheckBias(const T* bias, int hidden_size_check) { if (nullptr != bias) { const auto& bias_dims = bias->Shape().GetDims(); + if (bias_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "bias is expected to have 1 dimension, got ", bias_dims.size()); } + if (bias_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of bias and input does not match"); } } + + return Status::OK(); +} + +} // anonymous namespace + +template +Status CheckInputs(const T* input, + const T* skip, + const T* gamma, + const T* beta, + const T* bias, + int hidden_size_check, + size_t input_dims_size_check) { + if (input_dims_size_check != 3 && input_dims_size_check != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); + } + + auto status = CheckSkip(input, skip, input_dims_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckGamma(gamma, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBeta(beta, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBias(bias, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + return Status::OK(); +} + +template +Status CheckPotentiallyPrepackedInputs(const T* input, + const T* skip, + const T* gamma, + const T* beta, + const T* bias, + int hidden_size_check, + size_t input_dims_size_check, + bool prepacked_skip, + bool prepacked_gamma) { + if (input_dims_size_check != 3 && input_dims_size_check != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); + } + + if (nullptr != skip) { + auto status = CheckSkip(input, skip, input_dims_size_check); + if (status != Status::OK()) { + return status; + } + } else if (!prepacked_skip) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "skip is expected but not provided"); + } + + if (nullptr != gamma) { + auto status = CheckGamma(gamma, hidden_size_check); + if (status != Status::OK()) { + return status; + } + } else if (!prepacked_gamma) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "gamma is expected but not provided"); + } + + auto status = CheckBeta(beta, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBias(bias, hidden_size_check); + if (status != Status::OK()) { + return status; + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 34dcbd1d77fca..bfc2102bdaac2 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -641,12 +641,17 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t ORT_RETURN_IF(nullptr == binary_info, "Qnn cached binary info is nullptr."); uint32_t graph_count = 0; QnnSystemContext_GraphInfo_t* graphs_info = nullptr; - if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { - graph_count = binary_info->contextBinaryInfoV1.numGraphs; - graphs_info = binary_info->contextBinaryInfoV1.graphs; + if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + graph_count = binary_info->contextBinaryInfoV3.numGraphs; + graphs_info = binary_info->contextBinaryInfoV3.graphs; } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { graph_count = binary_info->contextBinaryInfoV2.numGraphs; graphs_info = binary_info->contextBinaryInfoV2.graphs; + } else if (binary_info->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { + graph_count = binary_info->contextBinaryInfoV1.numGraphs; + graphs_info = binary_info->contextBinaryInfoV1.graphs; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context binary info version."); } ORT_RETURN_IF(graph_count < 1 || graphs_info == nullptr, "Failed to get graph info from Qnn cached context."); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index b09ff51b666c7..2950c246902fa 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -321,29 +321,50 @@ Status QnnModel::DeserializeGraphInfoFromBinaryInfo(const QnnSystemContext_Graph std::vector output_tensor_wrappers; std::string graph_name; - if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { + Qnn_Tensor_t* input_tensors = nullptr; + Qnn_Tensor_t* output_tensors = nullptr; + uint32_t graph_input_num = 0; + uint32_t graph_output_num = 0; + if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV3.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV3.numGraphOutputs; + + input_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV3.graphOutputs; + } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { + graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV2.graphName); + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV2.numGraphOutputs; + + input_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV2.graphOutputs; + } else if (qnn_sys_ctx_graph_info.version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_1) { graph_name.assign(qnn_sys_ctx_graph_info.graphInfoV1.graphName); - auto graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; - auto graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; - ORT_RETURN_IF(nullptr == qnn_sys_ctx_graph_info.graphInfoV1.graphInputs, "Graph from cached context doesn't have any inputs."); - ORT_RETURN_IF(nullptr == qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs, "Graph from cached context doesn't have any outputs."); - - // Copy graph input - Qnn_Tensor_t* input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; - for (size_t i = 0; i < graph_input_num; ++i) { - QnnTensorWrapper tensorwrapper; - ORT_RETURN_IF_ERROR(tensorwrapper.Init(input_tensors[i])); - input_tensor_wrappers.push_back(std::move(tensorwrapper)); - } + graph_input_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphInputs; + graph_output_num = qnn_sys_ctx_graph_info.graphInfoV1.numGraphOutputs; - // Copy graph output - Qnn_Tensor_t* output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; - for (size_t i = 0; i < graph_output_num; ++i) { - QnnTensorWrapper tensorwrapper; - ORT_RETURN_IF_ERROR(tensorwrapper.Init(output_tensors[i])); - output_tensor_wrappers.push_back(std::move(tensorwrapper)); - } + input_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphInputs; + output_tensors = qnn_sys_ctx_graph_info.graphInfoV1.graphOutputs; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported context graph info version."); + } + ORT_RETURN_IF(nullptr == input_tensors, "Graph from cached context doesn't have any inputs."); + ORT_RETURN_IF(nullptr == output_tensors, "Graph from cached context doesn't have any outputs."); + + // Copy graph input + for (size_t i = 0; i < graph_input_num; ++i) { + QnnTensorWrapper tensorwrapper; + ORT_RETURN_IF_ERROR(tensorwrapper.Init(input_tensors[i])); + input_tensor_wrappers.push_back(std::move(tensorwrapper)); } + // Copy graph output + for (size_t i = 0; i < graph_output_num; ++i) { + QnnTensorWrapper tensorwrapper; + ORT_RETURN_IF_ERROR(tensorwrapper.Init(output_tensors[i])); + output_tensor_wrappers.push_back(std::move(tensorwrapper)); + } + Qnn_GraphHandle_t graph; auto qnn_interface = qnn_backend_manager_->GetQnnInterface(); auto rt = qnn_interface.graphRetrieve(context, graph_name.c_str(), &graph); diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 2600104bde7a2..e4c58ba51c3df 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2842,7 +2842,7 @@ static_assert(offsetof(OrtApi, SessionOptionsAppendExecutionProvider_OpenVINO_V2 static_assert(offsetof(OrtApi, AddExternalInitializersFromFilesInMemory) / sizeof(void*) == 279, "Size of version 18 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.20.0", +static_assert(std::string_view(ORT_VERSION) == "1.20.1", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_20 above: diff --git a/onnxruntime/python/tools/quantization/__init__.py b/onnxruntime/python/tools/quantization/__init__.py index 9d397499d45a4..712e15a6a1ca9 100644 --- a/onnxruntime/python/tools/quantization/__init__.py +++ b/onnxruntime/python/tools/quantization/__init__.py @@ -10,6 +10,7 @@ from .quantize import DynamicQuantConfig # noqa: F401 from .quantize import QuantizationMode # noqa: F401 from .quantize import StaticQuantConfig # noqa: F401 +from .quantize import get_qdq_config # noqa: F401 from .quantize import quantize # noqa: F401 from .quantize import quantize_dynamic # noqa: F401 from .quantize import quantize_static # noqa: F401 diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index b20af5137d206..f07fb30f10f82 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -21,7 +21,6 @@ from .quant_utils import ( ONNX_TYPE_TO_NP_TYPE, TENSOR_NAME_QUANT_SUFFIX, - QuantType, find_by_name, model_has_infer_metadata, normalize_axis, @@ -40,18 +39,26 @@ def __init__(self, **data: Dict[str, Any]): for k, v in data.items(): if not isinstance(k, str): raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.") - if not isinstance(v, (int, str, np.ndarray)): + if k != "axis" and not isinstance(v, (int, str, np.ndarray)): raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.") + if k == "axis" and not isinstance(v, int) and v is not None: + raise TypeError(f"Axis value must be an int or None, not {type(v)}.") if k == "scale" and v.dtype not in (np.float32, np.float16): raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}") self.data[k] = v + def get(self, key, default_value=None): + return self.data.get(key, default_value) + def __iter__(self): yield from self.data def __getitem__(self, key): return self.data[key] + def __setitem__(self, key, value): + self.data[key] = value + def __len__(self): return len(self.data) @@ -88,9 +95,10 @@ def __init__( self.force_quantize_no_input_check = ( "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"] ) - self.is_weight_symmetric = self.extra_options.get( - "WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN) - ) + + # If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines + # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()` + self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None) self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False) self.min_real_range = self.extra_options.get("MinimumRealRange") @@ -131,6 +139,16 @@ def __init__( self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types() + def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool: + if self._is_weight_symmetric is not None: + return self._is_weight_symmetric # Return value explicitly set by user. + return weight_quant_type in ( + onnx.TensorProto.INT4, + onnx.TensorProto.INT8, + onnx.TensorProto.INT16, + onnx.TensorProto.FLOAT8E4M3FN, + ) + def quantize_model(self): raise NotImplementedError @@ -230,9 +248,19 @@ def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1 # TODO: This formula should be explained including why the scale is not estimated for the bias as well. bias_scale = input_scale * weight_scale * beta - quantized_data = (np.asarray(bias_data) / bias_scale).round() - quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max) - quantized_data = quantized_data.astype(np.int32) + # Quantize by dividing by bias_scale + quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64) + quantized_data = quantized_data.round() + + # Clip quantized data to the range of a int32 + int32_min = np.float64(np.iinfo(np.int32).min) + int32_max = np.float64(np.iinfo(np.int32).max) + if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max): + logging.warning( + f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small." + ) + + quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32) # update bias initializer bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims) @@ -282,6 +310,7 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa If keep_float_weight is False, quantize the weight, or don't quantize the weight. :return: quantized weight name, zero point name, scale name """ + # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there. q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX zp_name = weight.name + "_zero_point" scale_name = weight.name + "_scale" @@ -303,10 +332,11 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}" else: - _, _, zero_point, scale, q_weight_data = quantize_data( + symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric + zero_point, scale, q_weight_data = quantize_data( weight_data.flatten(), qType, - quant_overrides.get("symmetric", self.is_weight_symmetric), + quant_overrides.get("symmetric", symmetric), reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range), min_real_range=self.min_real_range, rmin_override=quant_overrides.get("rmin"), @@ -371,6 +401,7 @@ def quantize_weight_per_channel_impl( reduce_range=True, keep_float_weight=False, ): + # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there. initializer = find_by_name(weight_name, self.model.initializer()) if initializer is None: raise ValueError("{} is not an initializer", weight_name) @@ -409,13 +440,7 @@ def quantize_weight_per_channel_impl( if "quant_type" in quant_overrides_for_channels[0]: weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type # noqa: N806 - symmetric = quant_overrides_for_channels[0].get( - "symmetric", - ( - self.is_weight_symmetric - or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4) - ), - ) + symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType)) reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range) zero_point_list = [] scale_list = [] @@ -444,7 +469,7 @@ def quantize_weight_per_channel_impl( ), f"Unexpected type {type(quantized_per_channel_data)}" else: - _, _, zero_point, scale, quantized_per_channel_data = quantize_data( + zero_point, scale, quantized_per_channel_data = quantize_data( per_channel_data.flatten(), weight_qType, symmetric, @@ -529,4 +554,6 @@ def adjust_tensor_ranges(self): self.tensors_range[node.input[0]] = td # Adjust Softmax to range from 0.0 to 1.0 elif node.op_type == "Softmax": + if not self.should_quantize_node(node): + continue self.tensors_range[node.output[0]] = TensorData(lowest=np.float32(0.0), highest=np.float32(1.0)) diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 174bf5fd1509c..43105550139de 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -296,6 +296,26 @@ def get_largest_node_name_suffix(self, node_name_prefix): return suffix + def get_largest_initializer_name_suffix(self, initializer_name_prefix): + """ + Gets the largest initializer name integer suffix for all initializer names that begin + with `initializer_name_prefix`. This can be used to create unique initializer names. + + Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if + `initializer_name_prefix` is 'my_weight_'. + """ + suffix = -1 + + for initializer in self.model.graph.initializer: + if initializer.name.startswith(initializer_name_prefix): + try: + index = int(initializer.name[len(initializer_name_prefix) :]) + suffix = max(index, suffix) + except ValueError: + continue + + return suffix + def find_nodes_by_initializer(self, graph, initializer): """ Find all nodes with given initializer as an input. diff --git a/onnxruntime/python/tools/quantization/operators/pad.py b/onnxruntime/python/tools/quantization/operators/pad.py index 5f3c1231e62d6..b3e9ddb5e6278 100644 --- a/onnxruntime/python/tools/quantization/operators/pad.py +++ b/onnxruntime/python/tools/quantization/operators/pad.py @@ -1,3 +1,12 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from __future__ import annotations + +from typing import Any + +import numpy as np import onnx from ..quant_utils import ( @@ -8,6 +17,7 @@ quantize_nparray, ) from .base_operator import QuantOperatorBase +from .qdq_base_operator import QDQOperatorBase class QPad(QuantOperatorBase): @@ -98,3 +108,65 @@ def quantize(self): node.input[0] = quantized_input_value.q_name node.output[0] = quantized_output_value.q_name self.quantizer.new_nodes += [node] + + +class QDQPad(QDQOperatorBase): + def __init__(self, onnx_quantizer, onnx_node): + super().__init__(onnx_quantizer, onnx_node) + + def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None: + """ + Returns the Pad's constant padding value. Returns `None` if the padding value is + not constant (i.e., comes from a dynamic input). + """ + const_val = None + onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0]) + if onnx_tensor_type is None: + return None + + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type) + if self.quantizer.opset_version < 11: + const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype) + elif len(self.node.input) >= 3 and self.node.input[2]: + const_val = self.quantizer.model.get_constant_value(self.node.input[2]) + else: + const_val = np.array(0, dtype=np_dtype) + + return const_val + + def _should_quantize_output_same_as_input(self) -> bool: + """ + Returns true if Pad's output should use the same quantization parameters as input[0] + """ + attrs_dict = {} + for attribute in self.node.attribute: + kv = attribute_to_kwarg(attribute) + attrs_dict.update(kv) + + pad_mode = attrs_dict.get("mode", b"constant") + if pad_mode in (b"reflect", b"edge", b"wrap"): + # These modes pad the output with a value that already exists in the input. + # So, we can quantize the output the same as the input. + return True + + # For 'constant' mode, if padding with 0, we can also quantize the output the same as the input + # because our quantization floating-point range always includes 0. + if pad_mode == b"constant": + pad_val = self._get_pad_const_val(attrs_dict) + if pad_val is not None and pad_val.dtype in (np.float32, np.float16): + return float(pad_val.item()) == 0 + + return False + + def quantize(self): + assert self.node.op_type == "Pad" + + for input_name in self.node.input: + if input_name: + self.quantizer.quantize_activation_tensor(input_name) + + if not self.disable_qdq_for_node_output: + if self._should_quantize_output_same_as_input(): + self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name) + else: + self.quantizer.quantize_activation_tensor(self.node.output[0]) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index b71f332252850..048c7f3296503 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -20,6 +20,7 @@ from .calibrate import TensorData from .quant_utils import ( DEQUANT_OP_NAME, + ONNX_TYPE_TO_NP_TYPE, QUANT_OP_NAME, QuantizedValue, QuantizedValueType, @@ -30,12 +31,14 @@ add_quant_input_suffix, add_quant_output_suffix, add_quant_suffix, + compute_data_quant_params, compute_scale_zp, compute_scale_zp_float8, find_by_name, get_qmin_qmax_for_qType, ms_domain, normalize_axis, + quantize_onnx_initializer, tensor_proto_to_array, ) from .registry import CreateQDQQuantizer @@ -86,6 +89,18 @@ class QDQTensorQuantParams: converted: QuantizationParams | None # Converted type consumed by some (or all/none) consumer nodes. converted_recv_nodes: set[str] | None # The name of nodes that consume the converted type. + def get_for_consumer(self, consumer_node_name) -> QuantizationParams: + if self.converted is None: # Quantized value is not converted, return original + return self.original + + if self.converted_recv_nodes is None: # All consumers receive the converted value + return self.converted + + # Check if consumer node name is in the list of nodes that + # receive the converted quantization value. If not, return the original value generated + # by the tensor's producer. + return self.converted if (consumer_node_name in self.converted_recv_nodes) else self.original + # Holds scale and zero_point initializer TensorProtos. @dataclass @@ -153,8 +168,8 @@ def __init__( op_types_to_quantize, extra_options, ) - self.tensors_to_quantize = {} - self.bias_to_quantize = {} + self.tensors_to_quantize: dict[str, QDQTensorQuantInfo] = {} + self.bias_to_quantize: dict[str, QDQBiasQuantInfo] = {} self.nodes_to_remove = [] @@ -191,6 +206,9 @@ def __init__( # Used in the QDQRemovableActivation class. self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False) + # Let user disable adjustment of weight scales for bias inputs that are quantized to int32. + self.qdq_disable_weight_adjust_for_int32_bias = extra_options.get("QDQDisableWeightAdjustForInt32Bias", False) + # The ONNX spec did not support 16-bit Q/DQ ops before opset 21. # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types # are 16-bit or 4-bit integers. @@ -213,6 +231,7 @@ def __init__( self.qdq_op_domain = ms_domain self.quantization_params = self.calc_graph_quant_params() + self.initializer_quant_params: dict[str, QuantizationParams] = {} # Map of all original value names to quantized value names self.quantized_value_map = {} @@ -328,6 +347,18 @@ def quantize_weight_tensor_per_channel(self, tensor_name, axis): else: logging.warning(f"only support per-channel quantization on weight. Tensor: {tensor_name} is not quantized.") + def _dup_initializer(self, initializer: onnx.TensorProto) -> onnx.TensorProto: + """ + Duplicates an existing initializer and adds it to the model. Returns the new initializer. + """ + name_suffix: int = self.model.get_largest_initializer_name_suffix(initializer.name) + 1 + new_initializer_name = f"{initializer.name}{name_suffix}" + new_initializer = onnx.TensorProto() + new_initializer.CopyFrom(initializer) + new_initializer.name = new_initializer_name + self.model.add_initializer(new_initializer) + return new_initializer + def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, beta=1.0): """ Adds a bias tensor to the list of bias tensors to quantize. Called by op quantizers that @@ -353,15 +384,160 @@ def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, be self.quantize_weight_tensor(bias_name) return - weight = find_by_name(bias_name, self.model.initializer()) - if weight is not None: - if weight.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16): - if bias_name not in self.bias_to_quantize: - self.bias_to_quantize[bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta) - else: - logging.warning(f"Bias {bias_name} has already been marked for quantization") - else: - logging.warning(f"Expected {bias_name} to be a weight") + bias_initializer = find_by_name(bias_name, self.model.initializer()) + if bias_initializer is None: + logging.warning(f"Expected bias '{bias_name}' to be an initializer") + return + + if bias_initializer.data_type not in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16): + logging.info(f"Expected bias '{bias_name}' to be an floating-point initializer") + return + + actual_bias_name = bias_name + if bias_name in self.bias_to_quantize: + # This bias input is consumed by two different nodes. We need to duplicate the bias so that + # each node has its own bias input. This is necessary because the bias's scale is computed + # from the node's other input scales. + new_bias_initializer = self._dup_initializer(bias_initializer) + actual_bias_name = new_bias_initializer.name + + # Replace this node's bias input + self.model.replace_input_of_nodes(bias_name, actual_bias_name, {node_name}) + logging.info(f"Created a copy of bias input '{bias_name}' called '{actual_bias_name}'") + + # Add this to our list of biases to quantize. + self.bias_to_quantize[actual_bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta) + + def _adjust_weight_scale_for_int32_bias( + self, + input_scale: np.ndarray, + weight_scale: np.ndarray, + weight_name: str, + bias_tp: onnx.TensorProto, + is_per_channel: bool, + ) -> tuple[bool, np.ndarray | None]: + """ + Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. + A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to + be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be + increased to prevent this from happening. + + Although the adjustment method and amount differs, the idea to adjust the weight's scale came from the following + reference: + https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/optimize/quantization_utils.cc#L252 + + :param input_scale: The input's scale. + :param weight_scale: The weight scale to potentially adjust. + :param weight_name: The weight initializer's name. Used for logging. + :param bias_tp: The bias ONNX initializer. + :param is_per_channel: True if the bias and weight are quantized per-channel. + :return: A tuple with a bool indicating if the weight's scale was adjusted and the new weight scale. + """ + if not weight_scale.size: + return False, None + + bias_float_data = tensor_proto_to_array(bias_tp) + + int32_info = np.iinfo(np.int32) + multiplicative_epsilon = 1.0001 + qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min + 1, dtype=np.float64) + weight_scale_dtype = weight_scale.dtype + updated_an_elem = False + + if not is_per_channel: + rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) + rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) + absmax = np.maximum(np.abs(rmin), np.abs(rmax)) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * absmax) / qrange + + input_scale_fp64 = np.array(input_scale.item(), dtype=np.float64) + weight_scale_fp64 = np.array(weight_scale.item(), dtype=np.float64) + bias_candidate_scale = input_scale_fp64 * weight_scale_fp64 + + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + ratio = bias_smallest_valid_scale / bias_candidate_scale + logging.info( + f"Increasing scale for weight `{weight_name}` by the ratio {ratio} to " + f"ensure bias input `{bias_tp.name}` has a valid scale." + ) + new_scale = weight_scale_fp64 * ratio + weight_scale = new_scale.astype(weight_scale_dtype) + updated_an_elem = True + elif weight_scale.shape and len(weight_scale.shape) == 1: + # per-channel case + num_elems = weight_scale.shape[0] + + for i in range(num_elems): + bias_rmax = np.abs(bias_float_data[i]) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * bias_rmax) / qrange + + input_scale_fp64 = np.array(input_scale.item(), dtype=np.float64) + weight_scale_fp64 = np.array(weight_scale[i].item(), dtype=np.float64) + bias_candidate_scale = input_scale_fp64 * weight_scale_fp64 + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + ratio = bias_smallest_valid_scale / bias_candidate_scale + logging.info( + f"Increased scale[{i}] for weight `{weight_name}` by ratio {ratio} " + f"to ensure bias input `{bias_tp.name}` has a valid scale." + ) + new_scale = weight_scale_fp64 * ratio + weight_scale[i] = new_scale.astype(weight_scale_dtype) + updated_an_elem = True + + return updated_an_elem, weight_scale + + def _adjust_weight_quant_params_for_bias_tensors(self): + """ + Iterates through all bias inputs that should be quantized to int32. If the intended + bias scale (equal to input_scale * weight_scale) is too small, this function will increase + the associated weight's scale to ensure the bias does not overflow the int32 range when quantized. + """ + + if self.qdq_disable_weight_adjust_for_int32_bias: + # User passed an extra_option to disable this adjustment. + return + + for bias_name, bias_info in self.bias_to_quantize.items(): + if ( + bias_info.input_name not in self.quantization_params + or bias_info.input_name not in self.tensors_to_quantize + or bias_info.weight_name not in self.initializer_quant_params + ): + continue + + # Get the associated input's scale. + input_qparams = self.quantization_params[bias_info.input_name].get_for_consumer(bias_info.node_name) + input_info = self.tensors_to_quantize[bias_info.input_name] + input_scale = np.asarray( + input_qparams["scale"], dtype=onnx.helper.tensor_dtype_to_np_dtype(input_info.data_type) + ) + + weight_quant_params = self.initializer_quant_params[bias_info.weight_name] + weight_quant_type = weight_quant_params["quant_type"] + if weight_quant_type not in (onnx.TensorProto.INT8, onnx.TensorProto.INT16): + continue + + weight_zero_point: np.ndarray = weight_quant_params["zero_point"] + if weight_zero_point.any(): + # Skip if zero_point(s) are not all zero (i.e., symmetric quant) + continue + + weight_scale: np.ndarray = weight_quant_params["scale"] + is_per_channel = weight_quant_params.get("axis", None) is not None + + # Get adjusted weight scales. + did_update_weight_scale, new_weight_scale = self._adjust_weight_scale_for_int32_bias( + input_scale, + weight_scale, + bias_info.weight_name, + find_by_name(bias_name, self.model.initializer()), + is_per_channel, + ) + + if did_update_weight_scale: + weight_quant_params["scale"] = new_weight_scale def remove_node(self, node): self.nodes_to_remove.append(node) @@ -380,6 +556,8 @@ def quantize_model(self): self.tensor_to_its_receiving_nodes[tensor_name] = [] self.tensor_to_its_receiving_nodes[tensor_name].append(node) + self.initializer_quant_params = self._calc_initializer_quant_params() + self._adjust_weight_quant_params_for_bias_tensors() self._quantize_normal_tensors() self._quantize_sharing_param_tensors() if self.quantize_bias: @@ -475,38 +653,26 @@ def _create_qdq_nodes( ) self.model.add_nodes([qlinear_node, dequant_node]) - def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): + def _add_qdq_nodes_for_initializer(self, weight_proto: onnx.TensorProto): + """ + Adds Q/DQ nodes for an initializer. If `self.add_qdq_pair_to_weight` is true, creates + the sequence (weight_f32 -> Q -> DQ -> ). Otherwise, this function quantizes the initializer + and adds the sequence (weight_quant -> DQ ->). + """ weight_name = weight_proto.name - if axis is not None: - if self.opset_version < 13: - raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.") - - qtype = self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType - if qtype == onnx.onnx_pb.TensorProto.UINT8: - qtype = onnx_proto.TensorProto.INT8 - - q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel( - weight_name, - # Quantization type is forced to be TensorProto.INT8. - # when the expected value would be (see below) - # self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType. - # QLinearConv expects to have a unique value for all channels. - # This code does not enforce that but it is necessarily the case when the - # quantization is symmetric (as for INT8). - qtype, - axis, - keep_float_weight=self.add_qdq_pair_to_weight, - ) - else: - q_weight_name, zp_name, scale_name = self.quantize_initializer( - weight_proto, - self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType, - keep_float_weight=self.add_qdq_pair_to_weight, - ) + if weight_name in self.quantized_value_map: + return + quant_params: QuantizationParams = self.initializer_quant_params[weight_name] + axis: int = quant_params.get("axis") + scale_zp_initializers = self._make_scale_zp_initializers(weight_name, quant_params) + q_weight_name: str | None = None weight_dequant_output = add_dequant_output_suffix(weight_name) self.model.replace_input_of_all_nodes(weight_name, weight_dequant_output) + if self.add_qdq_pair_to_weight: + # Don't actually quantize the weight. Instead, keep floating-point weight and create the node + # sequence (weight_f32 -> Q -> DQ -> weight_dequant) weight_quant_output = add_quant_output_suffix(weight_name) self._create_qdq_nodes( @@ -516,14 +682,26 @@ def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): weight_quant_output, weight_dequant_output, add_dequant_suffix(weight_name), - scale_name, - zp_name, + scale_zp_initializers.scale.name, + scale_zp_initializers.zero_point.name, axis, ) else: + # Quantize the weight and create the node sequence: + # (weight_quantized -> DQ -> weight_dequant) + quant_weight = quantize_onnx_initializer( + weight_proto, + quant_params["quant_type"], + quant_params["zero_point"], + quant_params["scale"], + axis, + ) + self.model.add_initializer(quant_weight) + + q_weight_name = quant_weight.name dequant_node = onnx.helper.make_node( DEQUANT_OP_NAME, - [q_weight_name, scale_name, zp_name], + [quant_weight.name, scale_zp_initializers.scale.name, scale_zp_initializers.zero_point.name], [weight_dequant_output], add_dequant_suffix(weight_name), axis=axis, @@ -531,6 +709,17 @@ def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): ) self.model.add_node(dequant_node) + # Log entry for this quantized weight + quantized_value = QuantizedValue( + weight_name, + q_weight_name, + scale_zp_initializers.scale.name, + scale_zp_initializers.zero_point.name, + QuantizedValueType.Initializer, + axis=axis, + ) + self.quantized_value_map[weight_name] = QDQTensorQuantizedValue(quantized_value, None, None) + def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name, data_type=None): if ( self.dedicated_qdq_pair @@ -767,7 +956,7 @@ def _quantize_normal_tensors(self): # Quantize the input initializer = find_by_name(tensor_name, self.model.initializer()) if initializer: - self._add_qdq_pair_for_initializer(initializer, tensor_info.tensor_type, tensor_info.axis) + self._add_qdq_nodes_for_initializer(initializer) else: tensor_qparam_initializers = self._make_tensor_scale_zp_initializers(tensor_name) if not tensor_qparam_initializers: @@ -909,45 +1098,6 @@ def _quantize_bias_tensors(self): def is_tensor_quantized(self, tensor_name: str): return tensor_name in self.tensors_to_quantize or tensor_name in self.bias_to_quantize - def quantize_initializer( - self, - weight: onnx.TensorProto, - qType: onnx.TensorProto.DataType, - reduce_range: bool = False, - keep_float_weight: bool = False, - ) -> tuple[str, str, str]: - """ - :param weight: TensorProto initializer - :param qType: type to quantize to - :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to qunatize scale and zero point. - If keep_float_weight is False, quantize the weight, or don't quantize the weight. - :return: quantized weight name, zero point name, scale name - """ - # Find if this input is already quantized - if weight.name in self.quantized_value_map: - quantized_value = self.quantized_value_map[weight.name].original - return ( - quantized_value.q_name, - quantized_value.zp_name, - quantized_value.scale_name, - ) - - q_weight_name, zp_name, scale_name = self.quantize_initializer_impl( - weight, qType, reduce_range, keep_float_weight - ) - - # Log entry for this quantized weight - quantized_value = QuantizedValue( - weight.name, - q_weight_name, - scale_name, - zp_name, - QuantizedValueType.Initializer, - None, - ) - self.quantized_value_map[weight.name] = QDQTensorQuantizedValue(quantized_value, None, None) - return q_weight_name, zp_name, scale_name - def is_tensor_per_channel( self, tensor_name: str, @@ -997,38 +1147,6 @@ def is_tensor_per_channel( return True, axis - def quantize_weight_per_channel( - self, - weight_name: str, - weight_qType: onnx.TensorProto.DataType, - channel_axis: int, - reduce_range: bool = True, - keep_float_weight: bool = False, - ) -> tuple[str, str, str]: - # Find if this input is already quantized - if weight_name in self.quantized_value_map: - quantized_value = self.quantized_value_map[weight_name].original - return ( - quantized_value.q_name, - quantized_value.zp_name, - quantized_value.scale_name, - ) - - q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel_impl( - weight_name, weight_qType, channel_axis, reduce_range, keep_float_weight - ) - quantized_value = QuantizedValue( - weight_name, - q_weight_name, - scale_name, - zp_name, - QuantizedValueType.Initializer, - None, - ) - self.quantized_value_map[weight_name] = QDQTensorQuantizedValue(quantized_value, None, None) - - return q_weight_name, zp_name, scale_name - def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str: """ Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale @@ -1040,15 +1158,15 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s # get scale for weight weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name - weight_initializer = find_by_name(weight_scale_name, self.model.initializer()) - weight_scale = tensor_proto_to_array(weight_initializer) + weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer()) + weight_scale = tensor_proto_to_array(weight_scale_initializer) # get scale for input input_scale_name = ( self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name ) - inputscale_initializer = find_by_name(input_scale_name, self.model.initializer()) - input_scale = tensor_proto_to_array(inputscale_initializer) + input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) + input_scale = tensor_proto_to_array(input_scale_initializer) ( quantized_bias_name, @@ -1074,7 +1192,7 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s return quantized_bias_name def _make_scale_zp_initializers( - self, param_name: str, params: QuantizationParams, init_name_suffix: str = "" + self, param_name: str, quant_params: QuantizationParams, init_name_suffix: str = "" ) -> QDQScaleZpInitializers: """ Creates and returns scale and zero-point initializers for the given quantization params. The initializers are @@ -1082,31 +1200,31 @@ def _make_scale_zp_initializers( - {param_name}_zero_point{init_name_suffix} - {param_name}_scale{init_name_suffix} """ - zero_point_values = np.array([params["zero_point"]]) - if not hasattr(params["scale"], "dtype") or params["scale"].dtype not in (np.float32, np.float16): - raise ValueError(f"Unexpected type {type(params['scale'])} and param_name={param_name!r}") - scale_values = np.array([params["scale"]]) - assert scale_values.dtype != np.float64 - zero_point_type = params.data.get("quant_type", self.activation_qType) - - zero_point_shape = [] + zero_point = quant_params["zero_point"] + scale = quant_params["scale"] + zero_point_type = quant_params["quant_type"] + axis: int | None = quant_params.get("axis") + assert (axis is not None and len(scale.shape) == 1) or ( + axis is None and len(scale.shape) == 0 + ), "Wrong scale/zp shapes" + assert len(scale.shape) == len(zero_point.shape), "Scale and zero-point must have the same rank" + zero_point_name = param_name + "_zero_point" + init_name_suffix - scale_shape = [] scale_name = param_name + "_scale" + init_name_suffix # Add initializers to model init_zp = onnx.helper.make_tensor( - zero_point_name, zero_point_type, zero_point_shape, zero_point_values.ravel().tolist() + zero_point_name, zero_point_type, zero_point.shape, zero_point.ravel().tolist() ) self.model.add_initializer(init_zp) - if scale_values.dtype == np.float32: + if scale.dtype == np.float32: scale_type = onnx_proto.TensorProto.FLOAT - elif scale_values.dtype == np.float16: + elif scale.dtype == np.float16: scale_type = onnx_proto.TensorProto.FLOAT16 else: - raise ValueError(f"Unexpected dtype={scale_values.dtype} for param_name={param_name!r}") - init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale_shape, scale_values.reshape((-1,)).tolist()) + raise ValueError(f"Unexpected dtype={scale.dtype} for param_name={param_name!r}") + init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale.shape, scale.ravel().tolist()) self.model.add_initializer(init_scale) return QDQScaleZpInitializers(init_scale, init_zp) @@ -1155,7 +1273,7 @@ def calc_quant_params(self, tensor_data: TensorData, quant_overrides: dict[str, qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric) zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range) - return QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type) + return QuantizationParams(zero_point=zero.squeeze(), scale=scale.squeeze(), quant_type=quant_type) def calc_graph_quant_params(self) -> dict[str, QDQTensorQuantParams]: """ @@ -1185,3 +1303,127 @@ def calc_graph_quant_params(self) -> dict[str, QDQTensorQuantParams]: quantization_params[tensor_name] = QDQTensorQuantParams(original, converted, converted_recv_nodes) return quantization_params + + def _calc_initializer_quant_params(self) -> dict[str, QuantizationParams]: + """ + Returns quantization parameters (scale/zero_point/quant_type) for all initializers. + """ + + quantization_params: dict[str, QuantizationParams] = {} + for tensor_name, tensor_info in self.tensors_to_quantize.items(): + initializer = find_by_name(tensor_name, self.model.initializer()) + if not initializer: + continue + + initializer_data = tensor_proto_to_array(initializer) + initializer_rank = len(initializer_data.shape) + + # initializers for elementwise ops use the quant_type for activations. + is_weight = tensor_info.tensor_type is QDQQuantTensorType.WEIGHT + quant_type = self.weight_qType if is_weight else self.activation_qType + + # Try to get scale/zp directly from user's overrides and avoid computation. + if self.tensor_quant_overrides.overrides_scale_zp(tensor_name): + overrides = self.tensor_quant_overrides[tensor_name] + if "quant_type" in overrides[0]: + quant_type = overrides[0]["quant_type"].tensor_type + + zp_dtype = ONNX_TYPE_TO_NP_TYPE[quant_type] + is_per_channel = "axis" in overrides[0] + if not is_per_channel: + quantization_params[tensor_name] = QuantizationParams( + zero_point=np.array(overrides[0]["zero_point"], dtype=zp_dtype), + scale=np.array(overrides[0]["scale"], initializer_data.dtype), + quant_type=quant_type, + ) + else: + zero_points_list = [] + scales_list = [] + for chan_overrides in overrides: + zero_points_list.append(np.array(chan_overrides["zero_point"], zp_dtype)) + scales_list.append(np.array(chan_overrides["scale"], dtype=initializer_data.dtype)) + + channel_axis = overrides[0]["axis"] + is_axis_valid, norm_channel_axis = normalize_axis(channel_axis, initializer_rank) + if not is_axis_valid: + raise ValueError( + f"Weight {initializer.name} has a per-channel axis with value {channel_axis} that is " + f"out-of-bounds for rank {initializer_rank}" + ) + + quantization_params[tensor_name] = QuantizationParams( + zero_point=np.array(zero_points_list), + scale=np.array(scales_list), + quant_type=quant_type, + axis=norm_channel_axis, + ) + + continue + + # Compute scale/zp normally. User's overrides may still override parameters + # used to compute the scale/zp (e.g., rmin, rmax, symmetric, etc.) + overrides = self.tensor_quant_overrides.get(tensor_name, [{}]) + if "quant_type" in overrides[0]: + quant_type = overrides[0]["quant_type"].tensor_type + + channel_axis = overrides[0].get("axis", tensor_info.axis) + is_per_channel = channel_axis is not None + + # Note: always quantize per-channel initializers as symmetric because QLinear* ops require the + # same zero-point in every channel, which is necessarily the case for symmetric quantization. + is_symmetric_default = is_per_channel or ( + self.is_weight_symmetric(quant_type) if is_weight else self.is_activation_symmetric + ) + is_symmetric = overrides[0].get("symmetric", is_symmetric_default) + reduce_range = overrides[0].get("reduce_range", self.reduce_range) + zero_point: np.ndarray | None = None + scale: np.ndarray | None = None + + if not is_per_channel: + zero_point, scale = compute_data_quant_params( + initializer_data.flatten(), + quant_type, + is_symmetric, + reduce_range=reduce_range, + min_real_range=self.min_real_range, + rmin_override=overrides[0].get("rmin"), + rmax_override=overrides[0].get("rmax"), + ) + else: + is_axis_valid, norm_channel_axis = normalize_axis(channel_axis, initializer_rank) + if not is_axis_valid: + raise ValueError( + f"Weight {initializer.name} has a per-channel axis with value {channel_axis} that is " + f"out-of-bounds for rank {initializer_rank}" + ) + + channel_axis = norm_channel_axis + channel_count = initializer_data.shape[channel_axis] + zero_points_list = [] + scales_list = [] + for i in range(channel_count): + per_channel_data = initializer_data.take(i, channel_axis) + channel_overrides = overrides[i] if overrides and i < len(overrides) else {} + channel_zero_point, channel_scale = compute_data_quant_params( + per_channel_data.ravel(), + quant_type, + is_symmetric, + reduce_range=reduce_range, + min_real_range=self.min_real_range, + rmin_override=channel_overrides.get("rmin"), + rmax_override=channel_overrides.get("rmax"), + ) + zero_points_list.append(channel_zero_point) + scales_list.append(channel_scale) + + zero_point = np.asarray(zero_points_list) + scale = np.asarray(scales_list) + + quantization_params[tensor_name] = QuantizationParams( + zero_point=zero_point, + scale=scale, + quant_type=quant_type, + axis=channel_axis, + ) + + return quantization_params diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 9228ad33130f2..2bf675745d093 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -33,6 +33,12 @@ int4 = None uint4 = None +try: + from onnx.reference.op_run import to_array_extended +except ImportError: + # old version of onnx. + to_array_extended = None + __producer__ = "onnx.quantize" __version__ = "0.1.0" @@ -43,6 +49,7 @@ DEQUANT_OP_NAME = "DequantizeLinear" DEQUANT_OUTPUT_SUFFIX = "_DequantizeLinear_Output" TENSOR_NAME_QUANT_SUFFIX = "_quantized" +MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB FLOAT8_DISTRIBUTIONS = {} @@ -156,7 +163,9 @@ def from_string(format): } ONNX_INT_TYPE_SYMMETRIC_RANGE = { + onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(254, dtype=numpy.uint8)), onnx_proto.TensorProto.INT8: (numpy.array(-127, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)), + onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(65534, dtype=numpy.uint16)), onnx_proto.TensorProto.INT16: (numpy.array(-32767, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)), } @@ -229,7 +238,7 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None): # which matches the python reference ONNX implementation of QuantizeLinear. # This data can be packed into 4-bit elements by using pack_bytes_to_4bit(). dtype = ONNX_TYPE_TO_NP_TYPE[qType] - (qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True) + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False) cliplow = max(qmin, low) if low is not None else qmin cliphigh = min(qmax, high) if high is not None else qmax @@ -269,7 +278,7 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=Non # Ensure a minimum float-point range if specified. if min_real_range is not None: - rmax = max(rmax, rmin + min_real_range) + rmax = max(rmax, rmin + numpy.asarray(min_real_range, dtype=rmin.dtype)) if symmetric: absmax = numpy.maximum(numpy.abs(rmin), numpy.abs(rmax)) @@ -338,13 +347,75 @@ def compute_scale_zp_float8(element_type, std): return [zero, scale] +def compute_data_quant_params( + data: numpy.ndarray, + quant_type: onnx.TensorProto.DataType, + symmetric: bool, + reduce_range: bool = False, + min_real_range: float | None = None, + rmin_override: float | None = None, + rmax_override: float | None = None, +) -> tuple[numpy.ndarray, numpy.ndarray]: + """ + Returns the zero_point and scale for the given data. + + :param data: The data for which to compute quantization parameters. + :param quant_type: The quantization data type. + :param symmetric: whether symmetric quantization is used or not. + :parameter reduce_range: True if the quantization range should be reduced. Defaults to False. + :parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None. + :parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data). + :parameter rmax_override: The value of rmax to use if not None. Otherwise, uses max(data). + :return: zero point and scale + """ + if not isinstance(data, numpy.ndarray): + raise TypeError(f"Weight must be given as an array not {type(data)}.") + if rmin_override is not None: + rmin = rmin_override + else: + rmin = data.min() if len(data) else 0.0 + + if rmax_override is not None: + rmax = rmax_override + else: + rmax = data.max() if len(data) else 0.0 + + rmin = numpy.array(rmin, dtype=data.dtype) + rmax = numpy.array(rmax, dtype=data.dtype) + scale = numpy.array(1.0, dtype=data.dtype) + + if quant_type == TensorProto.FLOAT8E4M3FN: + if reduce_range: + raise RuntimeError("Unsupported option reduce_range=True for float 8.") + std = numpy.std(data) + zero_point, scale = compute_scale_zp_float8(quant_type, std) + return _check_type(zero_point, scale, zero_point_index=0) + + if quant_type in ( + TensorProto.INT8, + TensorProto.UINT8, + TensorProto.INT16, + TensorProto.UINT16, + TensorProto.INT4, + TensorProto.UINT4, + ): + qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range, symmetric=symmetric) + if len(data): + zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range) + else: + zero_point = numpy.array(0, dtype=qmin.dtype) + return _check_type(zero_point, scale, zero_point_index=0) + + raise ValueError(f"Unexpected value for quant_type={quant_type}.") + + def quantize_data( data, qType, symmetric, reduce_range=False, min_real_range=None, rmin_override=None, rmax_override=None -): +) -> tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: """ :param data: data to quantize - :param qType: data type to quantize to. Supported types UINT8 and INT8 - :param symmetric: whether symmetric quantization is used or not. This is applied to INT8. + :param qType: data type to quantize to. + :param symmetric: whether symmetric quantization is used or not. :parameter reduce_range: True if the quantization range should be reduced. Defaults to False. :parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None. :parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data). @@ -366,28 +437,16 @@ def quantize_data( - *S*: scale - *z*: zero point """ - if not isinstance(data, numpy.ndarray): - raise TypeError(f"Weight must be given as an array not {type(data)}.") - if rmin_override is not None: - rmin = rmin_override - else: - rmin = data.min() if len(data) else 0.0 - - if rmax_override is not None: - rmax = rmax_override - else: - rmax = data.max() if len(data) else 0.0 - - rmin = numpy.array(rmin, dtype=data.dtype) - rmax = numpy.array(rmax, dtype=data.dtype) - zero_point = 0 - scale = numpy.array(1.0, dtype=data.dtype) - + zero_point, scale = compute_data_quant_params( + data, + qType, + symmetric, + reduce_range, + min_real_range, + rmin_override, + rmax_override, + ) if qType == TensorProto.FLOAT8E4M3FN: - if reduce_range: - raise RuntimeError("Unsupported option reduce_range=True for float 8.") - std = numpy.std(data) - zero_point, scale = compute_scale_zp_float8(qType, std) quantized_data = quantize_nparray(qType, data, scale, zero_point) if any((quantized_data.astype(numpy.uint8).ravel() & 127) == 127): np_data = numpy.asarray(data) @@ -395,7 +454,7 @@ def quantize_data( f"One of the quantized value is NaN data in [{np_data.min()}, {np_data.max()}], " f"quantized_data in [{quantized_data.min()}, {quantized_data.max()}]." ) - return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2) + return zero_point, scale, quantized_data if qType in ( TensorProto.INT8, @@ -405,15 +464,91 @@ def quantize_data( TensorProto.INT4, TensorProto.UINT4, ): - if len(data): - qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric) - zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range) quantized_data = quantize_nparray(qType, data, scale, zero_point) - return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2) + return zero_point, scale, quantized_data raise ValueError(f"Unexpected value for qType={qType}.") +def quantize_onnx_initializer( + weight: onnx.TensorProto, + quant_type: onnx.TensorProto.DataType, + zero_point: numpy.ndarray, + scale: numpy.ndarray, + axis: int | None = None, + quant_weight_name: str | None = None, +) -> onnx.TensorProto: + """ + Returns a quantized version of the given ONNX initializer. + + :param weight: The ONNX initializer to quantize. + :param quant_type: The final quantized data type. + :param zero_point: The zero-point value to use for quantization. + :param scale: The scale value to use for quantization. + :param axis: The quantization axis if quantizing per-channel. Defaults to None. + :param quant_weight_name: The name of the quantized initializer. + If not specified, the quantized name is generated. + :return: The quantized ONNX initializer. + """ + weight_data = tensor_proto_to_array(weight) + q_weight_data: numpy.ndarray | None = None + + if axis is None: # Per-tensor quantization + q_weight_data = quantize_nparray(quant_type, weight_data.ravel(), scale, zero_point) + else: # Per-channel quantization + channel_count = weight_data.shape[axis] + channel_dims = list(weight_data.shape) # deep copy + channel_dims[axis] = 1 # only one per channel for reshape + quantized_channel_data_list = [] + + for i in range(channel_count): + channel_data = weight_data.take(i, axis) + channel_scale = scale[i] + channel_zero_point = zero_point[i] + quantized_channel_data = quantize_nparray( + quant_type, channel_data.ravel(), channel_scale, channel_zero_point + ) + quantized_channel_data_list.append(numpy.asarray(quantized_channel_data).reshape(channel_dims)) + + q_weight_data = numpy.concatenate(quantized_channel_data_list, axis) + + q_weight_name = quant_weight_name if quant_weight_name else f"{weight.name}{TENSOR_NAME_QUANT_SUFFIX}" + + if quant_type == onnx.TensorProto.FLOAT8E4M3FN: + q_weight_initializer = onnx.TensorProto() + q_weight_initializer.data_type = quant_type + q_weight_initializer.dims.extend(weight.dims) + q_weight_initializer.name = q_weight_name + # Do not remove .flatten().copy() numpy is not clear about data persistence. + q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes() + if to_array_extended is not None: + # This test should not be needed but it helped catch some issues + # with data persistence and tobytes. + check = to_array_extended(q_weight_initializer) + if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes(): + raise RuntimeError( + f"The initializer of shape {weight_data.shape} could not be created, expecting " + f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}" + f"\nraw={str(q_weight_initializer)[:200]}." + ) + elif quant_type in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): + if q_weight_data.dtype not in (numpy.int8, numpy.uint8): + raise RuntimeError(f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values.") + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. + packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 + q_weight_initializer = onnx.helper.make_tensor(q_weight_name, quant_type, weight.dims, packed_data, raw=True) + else: + quant_np_dtype = onnx.helper.tensor_dtype_to_np_dtype(quant_type) + q_weight_data = numpy.asarray(q_weight_data, dtype=quant_np_dtype).reshape(weight.dims) + q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name) + + return q_weight_initializer + + def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802 """ Return qmin and qmax, the minimum and maximum value representable by the given qType diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 745344dc01fcb..4ffd8b9872982 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -3,10 +3,13 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +from __future__ import annotations + +import copy import logging import tempfile from pathlib import Path -from typing import Union +from typing import Any, Callable import onnx @@ -14,6 +17,7 @@ from .onnx_quantizer import ONNXQuantizer from .qdq_quantizer import QDQQuantizer from .quant_utils import ( + MODEL_SIZE_THRESHOLD, QuantFormat, QuantizationMode, QuantType, @@ -22,6 +26,7 @@ save_and_reload_model_with_shape_infer, ) from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry +from .tensor_quant_overrides import TensorQuantOverridesHelper class QuantConfig: @@ -192,6 +197,9 @@ def __init__( removed if activations are asymmetrically quantized. Keeping these activations is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model. + QDQDisableWeightAdjustForInt32Bias = True/False: + Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias + has a scale (input_scale * weight_scale) that is too small. execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc. Raises: ValueError: Raise ValueError if execution provider is unknown @@ -213,6 +221,167 @@ def __init__( self.extra_options = extra_options or {} +def get_qdq_config( + model_input: str | Path | onnx.ModelProto, + calibration_data_reader: CalibrationDataReader, + calibrate_method=CalibrationMethod.MinMax, + calibrate_args: dict[str, Any] | None = None, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + activation_symmetric: bool = False, + weight_symmetric: bool | None = None, + per_channel: bool = False, + reduce_range: bool = False, + keep_removable_activations: bool = False, + min_real_range: float | None = None, + tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None, + nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None, + extra_options: dict | None = None, +) -> StaticQuantConfig: + """ + Returns a configuration suitable that quantizes the entire model to integer precision. + + Params: + model_input: Path to the input model file or ModelProto. + calibration_data_reader: Calibration data reader. + calibrate_methode: The calibration method. Defaults to MinMax. + activation_type: The default activation quantization type. Defaults to QUInt8. + weight_type: The default weight quantization type. Defaults to QInt8. + activation_symmetric: True if activations should be quantized symmetrically (i.e, rmax == -rmin) by default. + Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16, + the zero-point values are 127 and 32,767, respectively. + weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default. + Defaults to None. If set to None, weight_symmetric is assumed true if a weight's quant type is a signed int. + per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel. + Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators + and their quantization axes. + reduce_range: quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false. + May improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode. + keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not + be removed, and will be explicitly represented in the QDQ model. If false, these activations + are automatically removed if activations are asymmetrically quantized. Keeping these activations + is necessary if optimizations or EP transformations will later remove + QuantizeLinear/DequantizeLinear operators from the model. + min_real_range: Default is None. If set to a floating-point value, the calculation of the quantization parameters + (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin) + is less than the specified minimum range, rmax will be set to rmin + min_real_range. + tensor_quant_overrides: tensor-level quantization overrides. Defaults to None. + The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list + contains a single dictionary. For per-channel quantization, the list contains either a dictionary for + each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis' + key must be present in the first dictionary for per-channel quantization. + + Each dictionary contains optional overrides with the following keys and values. + 'quant_type' = QuantType : The tensor's quantization data type. + 'axis' = Int : The per-channel axis. Must be present for per-channel weights. + 'scale' = Float : The scale value to use. Must also specify `zero_point` if set. + 'zero_point' = Int : The zero-point value to use. Must also specify `scale` is set. + 'symmetric' = Bool : If the tensor should use symmetric quantization. Invalid if also + set `scale` or `zero_point`. + 'reduce_range' = Bool : If the quantization range should be reduced. Invalid if also + set `scale` or `zero_point`. Only valid for initializers. + 'rmax' = Float : Override the maximum real tensor value in calibration data. + Invalid if also set `scale` or `zero_point`. + 'rmin' = Float : Override the minimum real tensor value in calibration data. + Invalid if also set `scale` or `zero_point`. + 'convert' = Dict : A nested dictionary with the same keys for an activation + tensor that should be converted to another quantization type. + 'convert["recv_nodes"] = Set : Set of node names that consume the converted activation, + other nodes get the original type. If not specified, + assume all consumer nodes get the converted type. + nodes_to_exclude: List of nodes names to exclude from quantization. Alternatively, can provide a function that + accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the give onnx.NodeProto + should be excluded from quantization. + extra_options: Additional options specified as string key/value pairs. Refer to the documentation for + `quantize_static` for valid keys and values. + + Returns: + A StaticQuantConfig object + """ + q16_types = {QuantType.QInt16, QuantType.QUInt16} + q4_types = {QuantType.QInt4, QuantType.QUInt4} + op_types_to_exclude = {"Cast", "DequantizeLinear", "QuantizeLinear"} + + model = ( + model_input + if isinstance(model_input, onnx.ModelProto) + else onnx.load_model(model_input, load_external_data=False) + ) + + op_types = set() + model_has_external_data = False + overrides_helper = TensorQuantOverridesHelper( + copy.deepcopy(tensor_quant_overrides) if tensor_quant_overrides else {} + ) + + # check if the model has external data. + for initializer in model.graph.initializer: + if onnx.external_data_helper.uses_external_data(initializer): + model_has_external_data = True + + final_nodes_to_exclude = [] + if nodes_to_exclude is not None and isinstance(nodes_to_exclude, list): + final_nodes_to_exclude.extend(nodes_to_exclude) + + # Iterate through nodes to get all operator types in the model and + # call user's function to filter out nodes from quantization. + for node in model.graph.node: + op_types.add(node.op_type) + if nodes_to_exclude is not None and callable(nodes_to_exclude): + if nodes_to_exclude(model, node): + final_nodes_to_exclude.append(node.name) + + final_extra_options = { + "MinimumRealRange": min_real_range, + "QDQKeepRemovableActivations": keep_removable_activations, + "ActivationSymmetric": activation_symmetric, + "WeightSymmetric": weight_symmetric, + "ForceQuantizeNoInputCheck": True, + "TensorQuantOverrides": overrides_helper.get_dict(), + } + + # Pass along known calibration options + if calibrate_args: + calib_extra_options_keys = [ + ("symmetric", "CalibTensorRangeSymmetric"), + ("moving_average", "CalibMovingAverage"), + ("averaging_constant", "CalibMovingAverageConstant"), + ("max_intermediate_outputs", "CalibMaxIntermediateOutputs"), + ("percentile", "CalibPercentile"), + ] + calib_extra_options = { + key: calibrate_args.get(name) for (name, key) in calib_extra_options_keys if name in calibrate_args + } + final_extra_options.update(calib_extra_options) + + # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain + # on Q/DQ operators if using 16-bit or 4-bit quantization. + onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx") + if onnx_opset.version < 21: + opset21_types = q16_types.union(q4_types) + overrides_have_opset21_types = any(t in opset21_types for t in overrides_helper.get_quant_types()) + if activation_type in opset21_types or weight_type in opset21_types or overrides_have_opset21_types: + final_extra_options["UseQDQContribOps"] = True + + # Allow user's extra_options to override our final_extra_options. + if extra_options: + final_extra_options.update(extra_options) + + return StaticQuantConfig( + calibration_data_reader, + calibrate_method=calibrate_method, + quant_format=QuantFormat.QDQ, + activation_type=activation_type, + weight_type=weight_type, + op_types_to_quantize=list(op_types.difference(op_types_to_exclude)), + nodes_to_exclude=final_nodes_to_exclude, + per_channel=per_channel, + reduce_range=reduce_range, + use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD), + extra_options=final_extra_options, + ) + + class DynamicQuantConfig(QuantConfig): def __init__( self, @@ -290,8 +459,8 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua def quantize_static( - model_input: Union[str, Path, onnx.ModelProto], - model_output: Union[str, Path], + model_input: str | Path | onnx.ModelProto, + model_output: str | Path, calibration_data_reader: CalibrationDataReader, quant_format=QuantFormat.QDQ, op_types_to_quantize=None, @@ -438,6 +607,9 @@ def quantize_static( removed if activations are asymmetrically quantized. Keeping these activations is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model. + QDQDisableWeightAdjustForInt32Bias = True/False: + Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias + has a scale (input_scale * weight_scale) that is too small. """ if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN: if calibrate_method != CalibrationMethod.Distribution: @@ -473,6 +645,7 @@ def quantize_static( ("CalibMovingAverage", "moving_average"), ("CalibMovingAverageConstant", "averaging_constant"), ("CalibMaxIntermediateOutputs", "max_intermediate_outputs"), + ("CalibPercentile", "percentile"), ] calib_extra_options = { key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options @@ -590,8 +763,8 @@ def inc_dataloader(): def quantize_dynamic( - model_input: Union[str, Path, onnx.ModelProto], - model_output: Union[str, Path], + model_input: str | Path | onnx.ModelProto, + model_output: str | Path, op_types_to_quantize=None, per_channel=False, reduce_range=False, @@ -690,8 +863,8 @@ def quantize_dynamic( def quantize( - model_input: Union[str, Path, onnx.ModelProto], - model_output: Union[str, Path], + model_input: str | Path | onnx.ModelProto, + model_output: str | Path, quant_config: QuantConfig, ): """Quantize a model with QuantConfig. diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index 160b056e1de17..fbeae39c39d21 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -14,7 +14,7 @@ from .operators.matmul import MatMulInteger, QDQMatMul, QLinearMatMul from .operators.maxpool import QDQMaxPool, QMaxPool from .operators.norm import QDQNormalization -from .operators.pad import QPad +from .operators.pad import QDQPad, QPad from .operators.pooling import QLinearPool from .operators.qdq_base_operator import QDQOperatorBase from .operators.resize import QDQResize, QResize @@ -76,6 +76,8 @@ "Resize": QDQResize, "MaxPool": QDQMaxPool, "AveragePool": QDQDirect8BitOp, + "Slice": QDQDirect8BitOp, + "Pad": QDQPad, "MatMul": QDQMatMul, "Split": QDQSplit, "Gather": QDQGather, diff --git a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py index 219d929d22fce..fbd0cc17f5d81 100644 --- a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py +++ b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py @@ -78,6 +78,10 @@ def has_per_channel_overrides(self, tensor_name: str) -> bool: overrides_list = self.overrides.get(tensor_name) return overrides_list and "axis" in overrides_list[0] + def overrides_scale_zp(self, tensor_name: str) -> bool: + overrides_list = self.overrides.get(tensor_name) + return overrides_list and ("scale" in overrides_list[0]) and ("zero_point" in overrides_list[0]) + def get_per_tensor_overrides( self, tensor_name: str, diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc index edf9064bb43c9..8e892807c6e05 100644 --- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc @@ -186,6 +186,32 @@ static void RunTest( } } +TEST(SkipLayerNormTest, SkipLayerNormPrePack) { + OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain); + test.AddAttribute("epsilon", 1e-05f); + + int batch_size = 1; + int sequence_length = 2; + int hidden_size = 2; + std::vector input_skip_output_dims = {batch_size, sequence_length, hidden_size}; + std::vector gamma_beta_bias_dims = {hidden_size}; + test.AddInput("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f})); + test.AddInput("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f})); + test.AddInput("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true); + test.AddInput("beta", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true); + test.AddOutput("output", input_skip_output_dims, ToFloat16({ + 1.f, + 1.f, + 1.f, + 1.f, + })); + + // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, + kNnapiExecutionProvider, kQnnExecutionProvider}); +} + TEST(SkipLayerNormTest, SkipLayerNormNullInput) { int batch_size = 1; int sequence_length = 0; diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc index e3f09e92593df..019d619f9be49 100644 --- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc @@ -131,11 +131,15 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt32_Axis0) { ExpectedEPNodeAssignment::All); } +// disabled for QNN 2.28.0.241029 failed for accuracy validation +// qdq@QNN_EP val: 3.6094117164611816 (err: 1.3094117641448975, err/output_range: 22.19342041015625%) +// qdq@CPU_EP val: 2.2905881404876709 (err: 0.0094118118286132812, err/output_range: 0.15952222049236298%) +// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 22.033897399902344% // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all // nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP. // // Static int32 indices with axis = 1 -TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis1) { +TEST_F(QnnHTPBackendTests, DISABLED_GatherOp_IndicesStaticInt32_Axis1) { RunQDQGatherOpTest(TestInputDef({3, 3}, false, {1.0f, 1.2f, 1.9f, 2.3f, 3.4f, 3.9f, 4.5f, 5.7f, 5.9f}), TestInputDef({1, 2}, true, {0, 2}), {utils::MakeAttribute("axis", static_cast(1))}, diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 018720fd8b71f..05731976c453f 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -229,8 +229,15 @@ TEST_F(QnnHTPBackendTests, UnaryOp_Tanh) { ExpectedEPNodeAssignment::All); } +// disabled for QNN 2.28.0.241029 backendValidateOpConfig failed +// QnnDsp [4294967295] has incorrect Value -32768, expected equal to 0. +// QnnDsp validateNativeOps node_token_6:qti.aisw:Tanh htp op validator failed 3110 +// QnnDsp registered validator failed => 3110 +// QnnDsp QnnBackend_validateOpConfig failed 3110 +// QnnDsp Wake up free backend (id: 1)'s thread(s) +// QnnDsp Failed to validate op node_token_6 with error 0xc26 // Tests accuracy of 16-bit QDQ Tanh. -TEST_F(QnnHTPBackendTests, UnaryOp_Tanh_U16) { +TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Tanh_U16) { RunQDQOpTest("Tanh", {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, {}, diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index cf7fc292ea86b..82193d08684c6 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -1,3 +1,10 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +from __future__ import annotations + import uuid from pathlib import Path @@ -661,3 +668,29 @@ def generate_random_initializer(initializer_name, tensor_shape, tensor_dtype, me tensor = np.random.normal(mean, dev, tensor_shape).astype(tensor_dtype) init = onnx.numpy_helper.from_array(tensor, initializer_name) return init + + +def get_tensor_consumers_and_producers( + model: onnx.ModelProto, +) -> tuple[dict[str, list[onnx.NodeProto]], dict[str, onnx.NodeProto]]: + """ + Returns a tuple containing the following python dictionaries: + - consumers: maps a tensor name to the list of nodes that have that tensor as an input. + - producers: maps a tensor name to the node that generates this tensor as an output. + """ + consumers: dict[str, list[onnx.NodeProto]] = {} + producers: dict[str, onnx.NodeProto] = {} + for node in model.graph.node: + # Iterate through node's inputs to build the consumers dictionary. + for input_name in node.input: + if input_name: + if input_name not in consumers: + consumers[input_name] = [] + + consumers[input_name].append(node) + + # Iterate through node's outputs to build the producers dictionary. + for output_name in node.output: + producers[output_name] = node + + return (consumers, producers) diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py new file mode 100644 index 0000000000000..58d00272475cd --- /dev/null +++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np +import onnx +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count + +from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, get_qdq_config, quantize + + +class TestGetQDQConfig(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.int_qdq_config_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_add_model( + self, + shape: list[int], + tensor_type: onnx.TensorProto.DataType, + weight: onnx.TensorProto | None = None, + opset: int = 21, + ) -> onnx.ModelProto: + """ + Returns an onnx.ModelProto with a single Add operator. The second input can be optionally made + a static weight. + """ + graph_inputs = [onnx.helper.make_tensor_value_info("input_0", tensor_type, shape)] + graph_outputs = [onnx.helper.make_tensor_value_info("output_0", tensor_type, shape)] + initializers = [] + add_input_names = ["input_0"] + + if weight is not None: + initializers.append(weight) + add_input_names.append(weight.name) + else: + graph_inputs.append(onnx.helper.make_tensor_value_info("input_1", tensor_type, shape)) + add_input_names.append("input_1") + + add_node = onnx.helper.make_node("Add", add_input_names, ["output_0"], name="Add0") + + graph = onnx.helper.make_graph( + [add_node], + "AddGraph", + graph_inputs, + graph_outputs, + initializer=initializers, + ) + opset_imports = [onnx.helper.make_opsetid("", opset)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_basic_args(self): + """ + Test that get_qdq_config() returns a config that sets the basic args. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight, opset=21) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + qdq_config = get_qdq_config( + float_model, + data_reader, + calibrate_method=CalibrationMethod.Percentile, + calibrate_args={"percentile": 99.98}, # Converted to extra_options + activation_type=QuantType.QUInt16, + weight_type=QuantType.QInt16, + per_channel=True, + reduce_range=True, + nodes_to_exclude=["Mul"], + # Other options converted to extra_options: + min_real_range=0.0001, + keep_removable_activations=True, + activation_symmetric=True, + weight_symmetric=True, + ) + self.assertEqual(qdq_config.calibrate_method, CalibrationMethod.Percentile) + self.assertEqual(qdq_config.activation_type, QuantType.QUInt16) + self.assertEqual(qdq_config.weight_type, QuantType.QInt16) + self.assertTrue(qdq_config.per_channel) + self.assertTrue(qdq_config.reduce_range) + self.assertEqual(set(qdq_config.nodes_to_exclude), {"Mul"}) + self.assertEqual(set(qdq_config.op_types_to_quantize), {"Add"}) + + # Check that calibration args are translated to extra_options. + self.assertEqual(qdq_config.extra_options["CalibPercentile"], 99.98) + + # Check that other args are also translated to extra_options. + self.assertEqual(qdq_config.extra_options["MinimumRealRange"], 0.0001) + self.assertTrue(qdq_config.extra_options["QDQKeepRemovableActivations"]) + self.assertTrue(qdq_config.extra_options["ActivationSymmetric"]) + self.assertTrue(qdq_config.extra_options["WeightSymmetric"]) + + # The following options should always be set to specific values. + self.assertTrue(qdq_config.extra_options["ForceQuantizeNoInputCheck"]) + self.assertEqual(qdq_config.quant_format, QuantFormat.QDQ) + + # Should use onnx domain Q/DQ ops because onnx opset >= 21. + self.assertFalse(qdq_config.extra_options.get("UseQDQContribOps", False)) + + def test_exclude_nodes_callable(self): + """ + Test passing a function/callable to exclude nodes from quantization. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight, opset=21) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # Local function that excludes all "Add" nodes. + def should_exclude_node_(model: onnx.ModelProto, node: onnx.NodeProto) -> bool: + return node.op_type == "Add" + + qdq_config = get_qdq_config( + float_model, + data_reader, + nodes_to_exclude=should_exclude_node_, + ) + + expected_excluded_nodes = set([node.name for node in float_model.graph.node if node.op_type == "Add"]) + self.assertTrue(bool(expected_excluded_nodes)) + self.assertEqual(set(qdq_config.nodes_to_exclude), expected_excluded_nodes) + + def test_external_data(self): + """ + Test that get_qdq_config() returns a config that enables external data + if the input model has external data. + """ + + # Create model with a weight large enough (> 1024 bytes) to be stored externally. + shape = [1, 32, 32] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + large_weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, large_weight) + float_model_path = os.path.join(self._tmp_dir_path, "add_ext_data_int_qdq_config.onnx") + + onnx.save_model( + float_model, + float_model_path, + save_as_external_data=True, + all_tensors_to_one_file=True, + location="add_ext_data_int_qdq_config.bin", + ) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(0, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # Create a quantization config and check that it sets boolean to use external data + qdq_config = get_qdq_config( + float_model_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QInt8 + ) + self.assertEqual(set(qdq_config.op_types_to_quantize), {"Add"}) + self.assertTrue(qdq_config.use_external_data_format) + + # Quantize the model and check computational correctness against float model. + qdq_model_path = os.path.join(self._tmp_dir_path, "add_ext_data_int_qdq_config.qdq.onnx") + quantize(float_model_path, qdq_model_path, qdq_config) + + expected_op_counts = {"DequantizeLinear": 3, "QuantizeLinear": 2, "Add": 1} + check_op_type_count(self, qdq_model_path, **expected_op_counts) + + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + # The quantized weight should still be stored in an external file. + qdq_model = onnx.load_model(qdq_model_path, load_external_data=False) + weight_quantized = next( + ( + initializer + for initializer in qdq_model.graph.initializer + if initializer.name == f"{large_weight.name}_quantized" + ), + None, + ) + self.assertIsNotNone(weight_quantized) + self.assertEqual(weight_quantized.data_location, onnx.TensorProto.EXTERNAL) + + def test_use_qdq_contrib_ops_for_int16_opset19(self): + """ + Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for + use of int16 in opset < 21. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight, opset=19) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + qdq_config = get_qdq_config( + float_model, + data_reader, + activation_type=QuantType.QUInt16, + weight_type=QuantType.QInt8, + ) + + self.assertEqual(qdq_config.activation_type, QuantType.QUInt16) + self.assertTrue(qdq_config.extra_options["UseQDQContribOps"]) + + def test_use_qdq_contrib_ops_for_int4_opset19(self): + """ + Test that get_qdq_config() returns a config that forces 'com.microsoft' Q/DQ ops for + use of int4 in opset < 21. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight, opset=19) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # Use int4 in tensor quantization overrides. This should still force use of 'com.microsoft' Q/DQ ops. + qdq_config = get_qdq_config( + float_model, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + tensor_quant_overrides={"weight": [{"quant_type": QuantType.QInt4}]}, + ) + + self.assertEqual(qdq_config.extra_options["TensorQuantOverrides"]["weight"][0]["quant_type"], QuantType.QInt4) + self.assertTrue(qdq_config.extra_options["UseQDQContribOps"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index 291bf42405d58..755c7fae5e3e8 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -4,14 +4,23 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +from __future__ import annotations import itertools +import os +import tempfile import unittest import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type +from op_test_utils import ( + TestDataFeeds, + check_model_correctness, + check_op_type_count, + check_qtype_by_node_type, + get_tensor_consumers_and_producers, +) from onnxruntime.quantization import QuantFormat, QuantType, quantize_dynamic, quantize_static @@ -519,5 +528,160 @@ def test_pad_with_empty_string_input_name(self): self.assertNotEqual(name, "_quantized") +class TestQDQPad(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.pad_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_pad_model( + self, + mode: str, + constant_value: float | None = None, + opset: int = 21, + float_type: onnx.TensorProto.DataType = onnx.TensorProto.FLOAT, + ) -> onnx.ModelProto: + num_pads_start = 1 + input_0 = onnx.helper.make_tensor_value_info("input_0", float_type, (3, 2)) + output_0 = onnx.helper.make_tensor_value_info("output_0", float_type, (3, 2 + num_pads_start)) + + initializers = [] + pad_input_names = ["input_0"] + attrs = {"mode": mode} + + pads_data = np.array([0, num_pads_start, 0, 0], dtype=np.int64) # Pad one val at beginning of axis 1. + if opset >= 11: + initializers.append(onnx.numpy_helper.from_array(pads_data, "pads")) + pad_input_names.append("pads") + else: + attrs["pads"] = pads_data.tolist() + + if mode == "constant" and constant_value is not None: + if opset >= 11: + initializers.append(onnx.helper.make_tensor("constant_value", float_type, [], [constant_value])) + pad_input_names.append("constant_value") + else: + attrs["value"] = float(constant_value) + + pad_node = onnx.helper.make_node("Pad", pad_input_names, ["output_0"], name="Pad0", **attrs) + + graph = onnx.helper.make_graph( + [pad_node], + "PadFloat", + [input_0], + [output_0], + initializer=initializers, + ) + opset_imports = [onnx.helper.make_opsetid("", opset)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_qdq_pad_qparams(self): + """ + Test that QDQ Pad has equal scale/zero-point for its input and output for certain configurations. + """ + test_configs = [ + # Opset 21 + ("constant", None, 21, onnx.TensorProto.FLOAT), + ("constant", None, 21, onnx.TensorProto.FLOAT16), + ("constant", 0, 21, onnx.TensorProto.FLOAT), + ("constant", 0, 21, onnx.TensorProto.FLOAT16), + ("constant", 10.0, 21, onnx.TensorProto.FLOAT), + ("constant", 10.0, 21, onnx.TensorProto.FLOAT16), + ("reflect", None, 21, onnx.TensorProto.FLOAT), + ("reflect", None, 21, onnx.TensorProto.FLOAT16), + ("edge", None, 21, onnx.TensorProto.FLOAT), + ("edge", None, 21, onnx.TensorProto.FLOAT16), + ("wrap", None, 21, onnx.TensorProto.FLOAT), + ("wrap", None, 21, onnx.TensorProto.FLOAT16), + # Model with opset 10 will use pad of opset 2, which uses attributes instead of inputs. + # Opset 10 Q/DQ ops don't support float16. + ("constant", None, 10, onnx.TensorProto.FLOAT), + ("constant", 0, 10, onnx.TensorProto.FLOAT), + ("constant", 10.0, 10, onnx.TensorProto.FLOAT), + ("reflect", None, 10, onnx.TensorProto.FLOAT), + ("edge", None, 10, onnx.TensorProto.FLOAT), + ] + + for pad_mode, constant_value, opset, float_type in test_configs: + with self.subTest(pad_mode=pad_mode, constant_value=constant_value, opset=opset, float_type=float_type): + label = f"_{pad_mode}_{constant_value}_opset{opset}_{onnx.TensorProto.DataType.Name(float_type)}" + float_model_path = os.path.join(self._tmp_dir_path, f"pad{label}.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"pad{label}.qdq.onnx") + + float_model = self.build_pad_model(pad_mode, constant_value, opset=opset, float_type=float_type) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(float_type) + input_data_list = [ + {"input_0": np.array([[1.0, 1.2], [2.3, 3.4], [4.5, 5.7]], dtype=np_dtype)}, + {"input_0": np.array([[2.3, 3.4], [4.5, 5.7], [1.0, 1.2]], dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + ) + + expected_op_counts = {"DequantizeLinear": 2, "QuantizeLinear": 2, "Pad": 1} + if constant_value is not None and opset >= 11: + expected_op_counts["DequantizeLinear"] += 1 # The constant padding value is quantized. + check_op_type_count(self, qdq_model_path, **expected_op_counts) + + if pad_mode != "reflect": + # Do not check model correctness for 'reflect' mode because ONNX Runtime implementation does + # not match the ONNX reference implementation. See the following issue: + # https://github.com/microsoft/onnxruntime/issues/20801 + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + qdq_model = onnx.load_model(qdq_model_path) + quant_output_same_as_input = False + + if pad_mode in ("reflect", "edge", "wrap"): + quant_output_same_as_input = True + + if pad_mode == "constant" and constant_value in (None, 0): + quant_output_same_as_input = True + + pad_node = next((node for node in qdq_model.graph.node if node.op_type == "Pad"), None) + self.assertNotEqual(pad_node, None) + self.assertEqual(pad_node.op_type, "Pad") + + # Get the parent and child nodes of the Pad and check that they are DQ/Q. + consumers, producers = get_tensor_consumers_and_producers(qdq_model) + input_dq_node = producers.get(pad_node.input[0], None) + self.assertNotEqual(input_dq_node, None) + self.assertEqual(input_dq_node.op_type, "DequantizeLinear") + + output_q_node = consumers.get(pad_node.output[0], [None])[0] + self.assertNotEqual(output_q_node, None) + self.assertEqual(output_q_node.op_type, "QuantizeLinear") + + # Check that the Pad's input DQ uses the same scale/zp as the Pad's output Q. + if quant_output_same_as_input: + self.assertEqual(input_dq_node.input[1], output_q_node.input[1]) # Same scale + self.assertEqual(input_dq_node.input[2], output_q_node.input[2]) # Same zero-point + else: + self.assertNotEqual(input_dq_node.input[1], output_q_node.input[1]) + self.assertNotEqual(input_dq_node.input[2], output_q_node.input[2]) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_slice.py b/onnxruntime/test/python/quantization/test_op_slice.py new file mode 100644 index 0000000000000..bfb9fc6b46bbd --- /dev/null +++ b/onnxruntime/test/python/quantization/test_op_slice.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np +import onnx +from op_test_utils import ( + TestDataFeeds, + check_model_correctness, + check_op_type_count, + get_tensor_consumers_and_producers, +) + +from onnxruntime.quantization import QuantFormat, QuantType, quantize_static + + +class TestQDQSlice(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.slice_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_slice_model( + self, + input_shape: list[int], + input_tensor_type: onnx.TensorProto.DataType, + starts: list[int], + ends: list[int], + axes: list[int] | None = None, + steps: list[int] | None = None, + ) -> onnx.ModelProto: + """ + Returns an onnx.ModelProto with a single Slice operator. + """ + input_0 = onnx.helper.make_tensor_value_info("input_0", input_tensor_type, input_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", input_tensor_type, None) + + initializers = [ + onnx.numpy_helper.from_array(np.array(starts, dtype=np.int64), "starts"), + onnx.numpy_helper.from_array(np.array(ends, dtype=np.int64), "ends"), + ] + slice_input_names = ["input_0", "starts", "ends"] + + if axes: + initializers.append(onnx.numpy_helper.from_array(np.array(axes, dtype=np.int64), "axes")) + slice_input_names.append("axes") + + if steps: + if not axes: + slice_input_names.append("") # Empty axes input. + initializers.append(onnx.numpy_helper.from_array(np.array(steps, dtype=np.int64), "steps")) + slice_input_names.append("steps") + + slice_node = onnx.helper.make_node("Slice", slice_input_names, ["output_0"], name="Slice0") + + graph = onnx.helper.make_graph( + [slice_node], + "SliceGraph", + [input_0], + [output_0], + initializer=initializers, + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_qdq_slice_qparams(self): + """ + Test that QDQ Slice has equal scale/zero-point for its input and output. + """ + test_configs = [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16] + + for onnx_tensor_type in test_configs: + with self.subTest(onnx_tensor_type=onnx_tensor_type): + label = f"{onnx.TensorProto.DataType.Name(onnx_tensor_type)}" + float_model_path = os.path.join(self._tmp_dir_path, f"slice.{label}.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"slice.{label}.qdq.onnx") + + input_shape = [2, 4] + float_model = self.build_slice_model( + input_shape=input_shape, + input_tensor_type=onnx_tensor_type, + starts=[1, 0], + ends=[2, 3], + axes=None, + steps=[1, 2], + ) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type) + input_data_list = [ + {"input_0": np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], dtype=np_dtype)}, + {"input_0": np.array([[-1.0, -2.0, -3.0, -4.0], [-5.0, -6.0, -7.0, -8.0]], dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + extra_options={"ForceQuantizeNoInputCheck": True}, + ) + expected_op_counts = {"DequantizeLinear": 2, "QuantizeLinear": 2, "Slice": 1} + check_op_type_count(self, qdq_model_path, **expected_op_counts) + + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + qdq_model = onnx.load_model(qdq_model_path) + + slice_node = next((node for node in qdq_model.graph.node if node.op_type == "Slice"), None) + self.assertNotEqual(slice_node, None) + self.assertEqual(slice_node.op_type, "Slice") + + # Get the parent and child nodes of the Slice and check that they are DQ/Q. + consumers, producers = get_tensor_consumers_and_producers(qdq_model) + input_dq_node = producers.get(slice_node.input[0], None) + self.assertNotEqual(input_dq_node, None) + self.assertEqual(input_dq_node.op_type, "DequantizeLinear") + + output_q_node = consumers.get(slice_node.output[0], [None])[0] + self.assertNotEqual(output_q_node, None) + self.assertEqual(output_q_node.op_type, "QuantizeLinear") + + # Check that the Slice's input DQ uses the same scale/zp as the Slice's output Q. + self.assertEqual(input_dq_node.input[1], output_q_node.input[1]) + self.assertEqual(input_dq_node.input[2], output_q_node.input[2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_softmax.py b/onnxruntime/test/python/quantization/test_op_softmax.py index 3416198450137..e5bc6288c91e2 100644 --- a/onnxruntime/test/python/quantization/test_op_softmax.py +++ b/onnxruntime/test/python/quantization/test_op_softmax.py @@ -213,6 +213,40 @@ def test_quantize_softmax(self): self.quantize_softmax_test_qop(QuantType.QUInt8, QuantType.QUInt8) self.quantize_softmax_test_qdq(QuantType.QUInt8, QuantType.QUInt8) + def test_bug_fix_exclude_softmax(self): + """ + Test fix to bug that happens when softmax is excluded from quantization, but + the quantization tool still tries to assign it a tensor range of [0.0, 1.0]. + """ + np.random.seed(1) + model_fp32_path = "softmax_fp32.onnx" + model_qdq_path = "softmax_bug_exclude_softmax.qdq.onnx" + self.construct_model_conv_softmax( + model_fp32_path, + [1, 2, 26, 42], + [3, 2, 3, 3], + [1, 3, 24, 40], + {"axis": -2}, + [1, 3, 24, 40], + add_ms_domain_opset=False, + ) + data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]}) + data_reader.rewind() + + # Bug would cause an exception during quantization. + quantize_static( + model_fp32_path, + model_qdq_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + nodes_to_exclude=["Softmax"], + ) + + qdq_model = onnx.load(Path(model_qdq_path)) + self.assertIn("Softmax", {node.op_type for node in qdq_model.graph.node}) + def test_quantize_softmax_s8s8(self): self.quantize_softmax_test_qop( QuantType.QInt8, diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index b99c11abf6d2c..24039fe7398a8 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -1726,5 +1726,204 @@ def test_json_serialization(self): write_calibration_table(new_calibrate_tensors_range) +class TestAdjustWeightScaleForInt32Bias(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.adj_int32_bias_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_conv_test_model( + self, + input0_shape: list[int], + weight_shape: list[int], + onnx_float_type: onnx.TensorProto.DataType, + ): + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_float_type) + input_0 = onnx.helper.make_tensor_value_info("input_0", onnx_float_type, input0_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", onnx_float_type, None) + + tiny_value = 1e-7 if np_float_type == np.float32 else 0.007782 + # weight_scale = 2*tiny_value / 255.0 = 7.84313725490196e-10 + + weight_data = np.full(weight_shape, tiny_value, dtype=np_float_type) + with np.nditer(weight_data, op_flags=["readwrite"]) as it: + for i, x in enumerate(it): + if i % 2 == 0: + x[...] = -x + + weight = onnx.numpy_helper.from_array(weight_data, "weight") + + # if we set input_scale to 0.05, then normally bias_scale would be + # (input_scale * weight_scale) => (0.05 * 7.84314e-10) => 3.9215686274509805e-11 + # + # If we quantize the f32 bias with this bias_scale, we get + # [5.0/bias_scale, 4.0/bias_scale] = [127500000000, 102000000000]. These quantized bias values exceed the + # range of int32. + # + # The ORT quantization tool will clamp these out-of-bounds values to int32::max(), + # which can be very inaccurate. + bias_shape = [weight_shape[0]] + bias_data = np.ones(bias_shape, dtype=np_float_type) + with np.nditer(bias_data, op_flags=["readwrite"]) as it: + for i, x in enumerate(it): + if i % 2 == 0: + x[...] = 5.0 if np_float_type == np.float32 else 1400 + else: + x[...] = -4.5 if np_float_type == np.float32 else -1200 + + bias = onnx.numpy_helper.from_array(bias_data, "bias") + + conv_node = onnx.helper.make_node("Conv", ["input_0", "weight", "bias"], ["output_0"], name="Conv0") + graph = onnx.helper.make_graph( + [conv_node], + "Convfloat", + [input_0], + [output_0], + initializer=[weight, bias], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_adjust_weight_scale_for_int32_bias(self): + """ + Test adjustment of weight input's scale to ensure int32 bias's scale is not too small. + """ + test_configs = [ + (onnx.TensorProto.FLOAT, True), + (onnx.TensorProto.FLOAT, False), + (onnx.TensorProto.FLOAT16, True), + (onnx.TensorProto.FLOAT16, False), + ] + + for float_type, per_channel in test_configs: + with self.subTest(float_type=float_type, per_channel=per_channel): + label = f"_f{float_type}_perchannel{per_channel}" + float_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.qdq.onnx") + + # Create float model with a Conv that has tiny weight values. + # This tiny weight scale would normally create a very small bias scale that will saturate + # bias's int32 range. But, the qdq_quantizer adjusts the weight's scale to ensure this doesn't happen. + input0_shape = [1, 2, 4, 4] + weight_shape = [2, 2, 2, 2] + float_model = self.build_conv_test_model(input0_shape, weight_shape, float_type) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(float_type) + input0_rmin = 0.0 + input0_scale = 0.05 if float_type == onnx.TensorProto.FLOAT else 0.01 + input0_rmax = (input0_scale * 255.0) + input0_rmin + input_data_list = [ + {"input_0": np.full(input0_shape, input0_rmin, dtype=np_float_type)}, + {"input_0": np.full(input0_shape, (input0_rmax - input0_rmin) / 2.0, dtype=np_float_type)}, + {"input_0": np.full(input0_shape, input0_rmax, dtype=np_float_type)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + per_channel=per_channel, + ) + + # Check correctness + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + def build_model_convs_share_bias( + self, + input0_shape: list[int], + weight_shape: list[int], + onnx_float_type: onnx.TensorProto.DataType, + ): + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_float_type) + input_0 = onnx.helper.make_tensor_value_info("input_0", onnx_float_type, input0_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", onnx_float_type, None) + output_1 = onnx.helper.make_tensor_value_info("output_1", onnx_float_type, None) + + weight_0_data = np.ones(weight_shape, dtype=np_float_type) + weight_0 = onnx.numpy_helper.from_array(weight_0_data, "weight_0") + + weight_1_data = np.full(weight_shape, 0.5, dtype=np_float_type) + weight_1 = onnx.numpy_helper.from_array(weight_1_data, "weight_1") + + bias_shape = [weight_shape[0]] + bias_data = np.ones(bias_shape, dtype=np_float_type) + bias_shared = onnx.numpy_helper.from_array(bias_data, "bias_shared") + + conv_0_node = onnx.helper.make_node("Conv", ["input_0", "weight_0", "bias_shared"], ["output_0"], name="Conv0") + conv_1_node = onnx.helper.make_node("Conv", ["input_0", "weight_1", "bias_shared"], ["output_1"], name="Conv1") + graph = onnx.helper.make_graph( + [conv_0_node, conv_1_node], + "ConvWithSharedBiasToDup", + [input_0], + [output_0, output_1], + initializer=[weight_0, weight_1, bias_shared], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_dup_shared_bias(self): + """ + Test duplicating a bias that is shared by two nodes that want to quantize their bias to int32. + """ + float_model_path = os.path.join(self._tmp_dir_path, "convs_share_bias.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, "convs_share_bias.qdq.onnx") + + # Create float model with a Convs that share a bias input. The QDQ quantizer should add a + # duplicate bias so that each node has its own. + input0_shape = [1, 2, 4, 4] + weight_shape = [2, 2, 2, 2] + float_model = self.build_model_convs_share_bias(input0_shape, weight_shape, onnx.TensorProto.FLOAT) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + input0_rmin = 0.0 + input0_scale = 0.05 + input0_rmax = (input0_scale * 255.0) + input0_rmin + input_data_list = [ + {"input_0": np.full(input0_shape, input0_rmin, dtype=np.float32)}, + {"input_0": np.full(input0_shape, (input0_rmax - input0_rmin) / 2.0, dtype=np.float32)}, + {"input_0": np.full(input0_shape, input0_rmax, dtype=np.float32)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + ) + + qdq_model = onnx.load_model(qdq_model_path) + bias_names = set() + + for node in qdq_model.graph.node: + if node.op_type == "DequantizeLinear" and node.input[0].startswith("bias_shared"): + bias_names.add(node.input[0]) + + self.assertEqual(len(bias_names), 2) + + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 96d841654adbd..b23d53f2a04e8 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -145,7 +145,7 @@ def test_quantize_data_4bit(self): for onnx_type, symmetric in subtest_configs: with self.subTest(onnx_type=onnx_type, symmetric=symmetric): - _, _, zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) + zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) is_signed = onnx_type == onnx.TensorProto.INT4 np_int_type = numpy.int8 if is_signed else numpy.uint8 qmin = numpy.array(-8 if is_signed else 0, dtype=np_int_type) diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 21a772c5f56c7..41dae04f1c6ff 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -36,7 +36,7 @@ def setUp(self): self.bias = np.array([0.0, 1.0], dtype=np.float32) self.default_act_qtype = onnx.TensorProto.UINT8 self.default_wgt_qtype = onnx.TensorProto.UINT8 - self.default_wgt_qtype_per_channel = onnx.TensorProto.INT8 + self.default_wgt_qtype_per_channel = onnx.TensorProto.UINT8 self.default_bias_qtype = onnx.TensorProto.INT32 self.default_zp_scales = { @@ -49,7 +49,8 @@ def setUp(self): self.default_zp_scales_per_channel = { "INP": (0, np.float32(0.0235294122248888)), "SIG_OUT": (0, np.float32(0.003911871928721666)), - "WGT": ([0, 0], [np.float32(0.015748031437397003), np.float32(0.011811023578047752)]), + # per-channel weights are always symmetric (ie. zp = (qmin + qmax) / 2) + "WGT": ([127, 127], [np.float32(0.015748031437397003), np.float32(0.011811023578047752)]), "BIAS": ([0, 0], [np.float32(0.00006160428165458143), np.float32(0.00004620321124093607)]), "OUT": (0, np.float32(0.005075461231172085)), } @@ -420,12 +421,17 @@ def test_qdq_overrides_per_channel2(self): self.assertEqual(wgt_zp.data_type, quant_type.tensor_type) for index, (zp, scale) in enumerate(zip(wgt_zp.int32_data, wgt_sc.float_data)): - wgt_qmin, wgt_qmax = get_qmin_qmax_for_qType(wgt_zp.data_type, reduce_range=reduce_range) + wgt_qmin, wgt_qmax = get_qmin_qmax_for_qType( + wgt_zp.data_type, + symmetric=True, # per-channel is always symmetric + reduce_range=reduce_range, + ) expected_zp, expected_scale = compute_scale_zp( np.array(rmin_vals[index], dtype=np.float32), np.array(rmax_vals[index], dtype=np.float32), wgt_qmin, wgt_qmax, + symmetric=True, # per-channel is always symmetric ) self.assertEqual(zp, expected_zp) self.assertEqual(scale, np.float32(expected_scale)) diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index 9362a8b0ee18c..20252220da8f9 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index b12360d2710d0..c1e469509b9bd 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 41f6b6a8d6d80..03859b1548fd2 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index de17db216da9c..0a18343eee33d 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -69,7 +69,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.27.0.240926 + default: 2.28.0.241029 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index fd3f31da4ab7e..f2c0561368a9e 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-linux.yml b/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-linux.yml index ca7e3f6148e26..d14952e544e5e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-linux.yml @@ -45,7 +45,8 @@ steps: for file in $(find $jar_file_directory -type f); do echo "Adding checksum of sha256 to file: $file" - sha256sum $file | awk '{print $1}' >$file.sha256 + sha256_value=$(sha256sum $file | awk '{print $1}') + echo $sha256_value" *"$(basename "$file") >$file.sha256 echo "Added checksum of sha256 to file: $file" done diff --git a/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-win.yml b/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-win.yml index 182a2ebe3b4c9..5681b3568bae1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jar-maven-signing-win.yml @@ -15,6 +15,7 @@ steps: displayName: 'Sign jar files: GnuPG and sha256' inputs: targetType: 'inline' + pwsh: true workingDirectory: '$(Build.SourcesDirectory)' script: | $jar_file_directory = '${{ parameters.JarFileDirectory }}' @@ -53,15 +54,22 @@ steps: Write-Host "GnuPG signed to file: "$file_path } + $PSDefaultParameterValues['Out-File:Encoding'] = 'utf8NoBOM' + $sha256sum_exe_path = "C:\Program Files\Git\usr\bin\sha256sum.exe" $targeting_asc_files = Get-ChildItem $jar_file_directory -Recurse -Force -File -Name + $original_location = Get-Location + Set-Location $jar_file_directory foreach ($file in $targeting_asc_files) { - $file_path = Join-Path $jar_file_directory -ChildPath $file - Write-Host "Adding checksum of sha256 to file: "$file_path - $file_path_sha256 = $file_path + ".sha256" - CertUtil -hashfile $file_path SHA256 - CertUtil -hashfile $file_path SHA256 | find /v `"hash`" | Out-File -FilePath $file_path_sha256 - Write-Host "Added checksum of sha256 to file: "$file_path + Write-Host "Adding checksum of sha256 to file: "$file + $file_path_sha256 = $file + ".sha256" + & $sha256sum_exe_path $file 1>$file_path_sha256 + if ($lastExitCode -ne 0) { + Write-Host -Object "sha256sum command failed. Exitcode: $exitCode" + exit $lastExitCode + } + Write-Host "Added checksum of sha256 to file: "$file } + Set-Location $original_location Write-Host "GnuPG and sha256 signing to files completed." Write-Host "Deleting GnuPG key files." diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index f749f32456b25..97ca94e7ab516 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.27.0.240926' + default: '2.28.0.241029' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index c56d81aefbec1..6b318664d1b12 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.27.0.240926' + default: '2.28.0.241029' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml index e663afb49dd99..d2ce7c84aa40d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml @@ -26,7 +26,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 jobs: - job: Linux_py_qnn_Wheels_x64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 10d7ce04747d9..2a59e9de9908f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -73,7 +73,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.27.0.240926 + default: 2.28.0.241029 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index f47108a2a48cd..6adc35568b034 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 5839ee273c1fe..0a58874d1d478 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 9e01f4116b602..1114477c84454 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 30280c6e22c7e..24abf7f6d0872 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,6 +1,6 @@ parameters: - QnnSdk: '2.27.0.240926' - build_config: 'RelWithDebInfo' + QnnSdk: '2.28.0.241029' + build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' @@ -44,7 +44,7 @@ stages: inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '--use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' - + - task: VSBuild@1 displayName: 'Build onnxruntime' inputs: diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 8f971612dbc6d..59a8dac9b1988 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index fdb6998f53d15..6645c9b1f78f3 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.27.0.240926 + default: 2.28.0.241029 jobs: - job: 'build'