From ff3740146a829e845d79266acf233b202843d3fd Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Wed, 17 Jul 2024 23:11:15 +0800 Subject: [PATCH] 3.X API installation update (#1935) Signed-off-by: chensuyue --- .azure-pipelines/scripts/install_nc.sh | 4 - .../scripts/ut/3x/coverage.3x_ort | 15 - .azure-pipelines/scripts/ut/3x/run_3x_ort.sh | 35 - .azure-pipelines/ut-3x-ort.yml | 109 -- .github/checkgroup.yml | 13 - README.md | 10 +- docs/source/installation_guide.md | 57 +- neural_compressor/onnxrt/__init__.py | 56 - .../onnxrt/algorithms/__init__.py | 22 - .../onnxrt/algorithms/layer_wise/__init__.py | 17 - .../onnxrt/algorithms/layer_wise/core.py | 289 ----- .../onnxrt/algorithms/smoother/__init__.py | 17 - .../onnxrt/algorithms/smoother/calibrator.py | 237 ---- .../onnxrt/algorithms/smoother/core.py | 668 ---------- .../onnxrt/algorithms/weight_only/__init__.py | 13 - .../onnxrt/algorithms/weight_only/awq.py | 437 ------- .../onnxrt/algorithms/weight_only/gptq.py | 451 ------- .../onnxrt/algorithms/weight_only/rtn.py | 222 ---- .../onnxrt/algorithms/weight_only/utility.py | 335 ----- .../onnxrt/quantization/__init__.py | 50 - .../onnxrt/quantization/algorithm_entry.py | 152 --- .../onnxrt/quantization/autotune.py | 116 -- .../onnxrt/quantization/calibrate.py | 35 - .../onnxrt/quantization/config.py | 614 ---------- .../onnxrt/quantization/quantize.py | 67 - neural_compressor/onnxrt/utils/__init__.py | 24 - neural_compressor/onnxrt/utils/onnx_model.py | 1082 ----------------- neural_compressor/onnxrt/utils/utility.py | 288 ----- requirements_ort.txt | 9 - setup.py | 52 +- .../layer_wise/test_layer_wise.py | 155 --- .../quantization/weight_only/test_awq.py | 219 ---- .../quantization/weight_only/test_gptq.py | 222 ---- .../quantization/weight_only/test_rtn.py | 193 --- test/3x/onnxrt/requirements.txt | 2 - test/3x/onnxrt/test_autotune.py | 304 ----- test/3x/onnxrt/test_config.py | 251 ---- test/3x/onnxrt/test_smooth_quant.py | 127 -- 38 files changed, 43 insertions(+), 6926 deletions(-) delete mode 100644 .azure-pipelines/scripts/ut/3x/coverage.3x_ort delete mode 100644 .azure-pipelines/scripts/ut/3x/run_3x_ort.sh delete mode 100644 .azure-pipelines/ut-3x-ort.yml delete mode 100644 neural_compressor/onnxrt/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/layer_wise/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/layer_wise/core.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/calibrator.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/core.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/awq.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/gptq.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/rtn.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/utility.py delete mode 100644 neural_compressor/onnxrt/quantization/__init__.py delete mode 100644 neural_compressor/onnxrt/quantization/algorithm_entry.py delete mode 100644 neural_compressor/onnxrt/quantization/autotune.py delete mode 100644 neural_compressor/onnxrt/quantization/calibrate.py delete mode 100644 neural_compressor/onnxrt/quantization/config.py delete mode 100644 neural_compressor/onnxrt/quantization/quantize.py delete mode 100644 
neural_compressor/onnxrt/utils/__init__.py delete mode 100644 neural_compressor/onnxrt/utils/onnx_model.py delete mode 100644 neural_compressor/onnxrt/utils/utility.py delete mode 100644 requirements_ort.txt delete mode 100644 test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_awq.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_gptq.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_rtn.py delete mode 100644 test/3x/onnxrt/requirements.txt delete mode 100644 test/3x/onnxrt/test_autotune.py delete mode 100644 test/3x/onnxrt/test_config.py delete mode 100644 test/3x/onnxrt/test_smooth_quant.py diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh index 2cb175138b4..55b323c56c2 100644 --- a/.azure-pipelines/scripts/install_nc.sh +++ b/.azure-pipelines/scripts/install_nc.sh @@ -10,10 +10,6 @@ elif [[ $1 = *"3x_tf"* ]]; then python -m pip install --no-cache-dir -r requirements_tf.txt python setup.py tf bdist_wheel pip install dist/neural_compressor*.whl --force-reinstall -elif [[ $1 = *"3x_ort" ]]; then - python -m pip install --no-cache-dir -r requirements_ort.txt - python setup.py ort bdist_wheel - pip install dist/neural_compressor*.whl --force-reinstall else python -m pip install --no-cache-dir -r requirements.txt python setup.py bdist_wheel diff --git a/.azure-pipelines/scripts/ut/3x/coverage.3x_ort b/.azure-pipelines/scripts/ut/3x/coverage.3x_ort deleted file mode 100644 index 1404dccbaee..00000000000 --- a/.azure-pipelines/scripts/ut/3x/coverage.3x_ort +++ /dev/null @@ -1,15 +0,0 @@ -[run] -branch = True - -[report] -include = - */neural_compressor/common/* - */neural_compressor/onnxrt/* -exclude_lines = - pragma: no cover - raise NotImplementedError - raise TypeError - if self.device == "gpu": - if device == "gpu": - except ImportError: - except Exception as e: \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh b/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh deleted file mode 100644 index 5f8550ea742..00000000000 --- a/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -python -c "import neural_compressor as nc" -test_case="run 3x ONNXRT" -echo "${test_case}" - -# install requirements -echo "set up UT env..." -pip install -r /neural-compressor/test/3x/onnxrt/requirements.txt -pip install pytest-cov -pip install pytest-html -pip list - -export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_ort -inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])') -cd /neural-compressor/test/3x || exit 1 -rm -rf torch -rm -rf tensorflow - -LOG_DIR=/neural-compressor/log_dir -mkdir -p ${LOG_DIR} -ut_log_name=${LOG_DIR}/ut_3x_ort.log -pytest --cov="${inc_path}" -vs --disable-warnings --html=report.html --self-contained-html . 2>&1 | tee -a ${ut_log_name} - -cp report.html ${LOG_DIR}/ - -if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." - echo "Please search for '== FAILURES ==' or '== ERRORS =='" - exit 1 -fi - -# if ut pass, collect the coverage file into artifacts -cp .coverage ${LOG_DIR}/.coverage - -echo "UT finished successfully! 
" \ No newline at end of file diff --git a/.azure-pipelines/ut-3x-ort.yml b/.azure-pipelines/ut-3x-ort.yml deleted file mode 100644 index 42636df2314..00000000000 --- a/.azure-pipelines/ut-3x-ort.yml +++ /dev/null @@ -1,109 +0,0 @@ -trigger: none - -pr: - autoCancel: true - drafts: false - branches: - include: - - master - paths: - include: - - neural_compressor/common - - neural_compressor/onnxrt - - test/3x/onnxrt - - test/3x/common - - setup.py - - requirements_ort.txt - - .azure-pipelines/scripts/ut/3x/run_3x_ort.sh - -pool: ICX-16C - -variables: - IMAGE_NAME: "neural-compressor" - IMAGE_TAG: "py310" - UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir - DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir - ARTIFACT_NAME: "UT_coverage_report_3x_ort" - REPO: $(Build.Repository.Uri) - -stages: - - stage: ONNXRT - displayName: Unit Test 3x ONNXRT - dependsOn: [] - jobs: - - job: - displayName: Unit Test 3x ONNXRT - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: "commonDockerConfig" - utScriptFileName: "3x/run_3x_ort" - uploadPath: $(UPLOAD_PATH) - utArtifact: "ut_3x" - - - - stage: ONNXRT_baseline - displayName: Unit Test 3x ONNXRT baseline - dependsOn: [] - jobs: - - job: - displayName: Unit Test 3x ONNXRT baseline - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: "gitCloneDockerConfig" - utScriptFileName: "3x/run_3x_ort" - uploadPath: $(UPLOAD_PATH) - utArtifact: "ut_3x_baseline" - repo: $(REPO) - - - stage: Coverage - displayName: "Coverage Compare" - pool: - vmImage: "ubuntu-latest" - dependsOn: [ONNXRT, ONNXRT_baseline] - jobs: - - job: CollectDatafiles - steps: - - script: | - if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then - docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . - fi - docker images | grep -i ${IMAGE_NAME} - if [[ $? 
-ne 0 ]]; then - echo "NO Such Repo" - exit 1 - fi - displayName: "Build develop docker image" - - - task: DownloadPipelineArtifact@2 - inputs: - artifact: - patterns: '*_coverage/.coverage' - path: $(DOWNLOAD_PATH) - - - script: | - echo "--- create container ---" - docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash - echo "--- docker ps ---" - docker ps - echo "--- collect logs ---" - docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \ - && bash install_nc.sh 3x_ort \ - && bash ut/3x/collect_log_3x.sh 3x_ort" - displayName: "Collect UT Coverage" - - - task: PublishPipelineArtifact@1 - condition: succeededOrFailed() - inputs: - targetPath: $(UPLOAD_PATH) - artifact: $(ARTIFACT_NAME) - publishLocation: "pipeline" - - - task: Bash@3 - condition: always() - inputs: - targetType: "inline" - script: | - docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" - displayName: "Docker clean up" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 4c6691da86a..c1e6e147fab 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -140,16 +140,3 @@ subprojects: - "UT-3x-Torch (Coverage Compare CollectDatafiles)" - "UT-3x-Torch (Unit Test 3x Torch Unit Test 3x Torch)" - "UT-3x-Torch (Unit Test 3x Torch baseline Unit Test 3x Torch baseline)" - - - id: "Unit Tests 3x-ONNXRT workflow" - paths: - - "neural_compressor/common/**" - - "neural_compressor/onnxrt/**" - - "test/3x/onnxrt/**" - - "setup.py" - - "requirements_ort.txt" - checks: - - "UT-3x-ONNXRT" - - "UT-3x-ONNXRT (Coverage Compare CollectDatafiles)" - - "UT-3x-ONNXRT (Unit Test 3x ONNXRT Unit Test 3x ONNXRT)" - - "UT-3x-ONNXRT (Unit Test 3x ONNXRT baseline Unit Test 3x ONNXRT baseline)" diff --git a/README.md b/README.md index 31772f4d025..7e5b65bf351 100644 --- a/README.md +++ b/README.md @@ -19,21 +19,25 @@ Intel® Neural Compressor aims to provide popular model compression techniques s as well as Intel extensions such as [Intel Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow) and [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch). 
In particular, the tool provides the key features, typical examples, and open collaborations as below: -* Support a wide range of Intel hardware such as [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing +* Support a wide range of Intel hardware such as [Intel Gaudi AI Accelerators](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), [Intel Core Ultra Processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; +support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing; support NVidia GPU for some WOQ algorithms like AutoRound and HQQ. 
* Validate popular LLMs such as [LLama2](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies * Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst) ## What's New +* [2024/07] Starting from the 3.0 release, the framework extension API is recommended for quantization. * [2024/07] Performance optimizations and usability improvements on [client-side](https://github.com/intel/neural-compressor/blob/master/docs/3x/client_quant.md). -* [2024/03] A new SOTA approach [AutoRound](https://github.com/intel/auto-round) Weight-Only Quantization on [Intel Gaudi2 AI accelerator](https://habana.ai/products/gaudi2/) is available for LLMs. ## Installation ### Install from pypi ```Shell -pip install neural-compressor +# Install 2.X API + Framework extension API + PyTorch dependency +pip install neural-compressor[pt] +# Install 2.X API + Framework extension API + TensorFlow dependency +pip install neural-compressor[tf] ``` > **Note**: > Further installation methods can be found under [Installation Guide](https://github.com/intel/neural-compressor/blob/master/docs/source/installation_guide.md). check out our [FAQ](https://github.com/intel/neural-compressor/blob/master/docs/source/faq.md) for more details. 
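A quick way to verify the new install options introduced above is a short install-and-import smoke test. The sketch below is not part of this patch; it assumes the `[pt]` extra exposes the PyTorch framework extension API under the `neural_compressor.torch` namespace (and `[tf]` under `neural_compressor.tensorflow`), so adjust the module names if the packaging differs.

```Shell
# Hedged post-install smoke test (module paths are assumptions, not defined by this patch)
pip install neural-compressor[pt]
python -c "import neural_compressor; print(neural_compressor.__version__)"   # 2.X API importable
python -c "import neural_compressor.torch"                                   # PyTorch framework extension API
# TensorFlow flavor: pip install neural-compressor[tf] && python -c "import neural_compressor.tensorflow"
```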
diff --git a/docs/source/installation_guide.md b/docs/source/installation_guide.md index a0e7ad5e47c..f4497806c58 100644 --- a/docs/source/installation_guide.md +++ b/docs/source/installation_guide.md @@ -29,28 +29,28 @@ The following prerequisites and requirements must be satisfied for a successful ### Install from Binary - Install from Pypi - ```Shell - # install stable basic version from pypi - pip install neural-compressor - ``` - ```Shell - # [Experimental] install stable basic + PyTorch framework extension API from pypi - pip install neural-compressor[pt] - ``` - ```Shell - # [Experimental] install stable basic + TensorFlow framework extension API from pypi - pip install neural-compressor[tf] - ``` - -- Install from test Pypi - ```Shell - # install nightly version - git clone https://github.com/intel/neural-compressor.git - cd neural-compressor - pip install -r requirements.txt - # install nightly basic version from pypi - pip install -i https://test.pypi.org/simple/ neural-compressor - ``` +```Shell +# Install 2.X API + Framework extension API + PyTorch dependency +pip install neural-compressor[pt] +``` +```Shell +# Install 2.X API + Framework extension API + TensorFlow dependency +pip install neural-compressor[tf] +``` +```Shell +# Install 2.X API + Framework extension API +# With this install command, some dependencies of the framework extension API are not installed; +# you can install them separately with `pip install -r requirements_pt.txt` or `pip install -r requirements_tf.txt`. +pip install neural-compressor +``` +```Shell +# Framework extension API + PyTorch dependency +pip install neural-compressor-pt +``` +```Shell +# Framework extension API + TensorFlow dependency +pip install neural-compressor-tf +``` ### Install from Source @@ -76,15 +76,20 @@ The AI Kit is distributed through many common channels, including from Intel's w ## System Requirements ### Validated Hardware Environment + +#### Intel® Neural Compressor supports HPUs based on heterogeneous architecture with two compute engines (MME and TPC): +* Intel Gaudi AI Accelerators (Gaudi2) + #### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64): -* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) -* Intel Xeon CPU Max Series (formerly Sapphire Rapids HBM) +* Intel Xeon Scalable processor (Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) +* Intel Xeon CPU Max Series (Sapphire Rapids HBM) +* Intel Core Ultra Processors (Meteor Lake) #### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture: -* Intel Data Center GPU Flex Series (formerly Arctic Sound-M) -* Intel Data Center GPU Max Series (formerly Ponte Vecchio) +* Intel Data Center GPU Flex Series (Arctic Sound-M) +* Intel Data Center GPU Max Series (Ponte Vecchio) #### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime: diff --git a/neural_compressor/onnxrt/__init__.py b/neural_compressor/onnxrt/__init__.py deleted file mode 100644 index e26d2897dd5..00000000000 --- a/neural_compressor/onnxrt/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from onnxruntime.quantization.calibrate import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantType, QuantFormat -from neural_compressor.onnxrt.utils.utility import register_algo -from neural_compressor.onnxrt.quantization import ( - rtn_quantize_entry, - RTNConfig, - get_default_rtn_config, - gptq_quantize_entry, - GPTQConfig, - get_default_gptq_config, - awq_quantize_entry, - AWQConfig, - get_default_awq_config, - smooth_quant_entry, - SmoohQuantConfig, - get_default_sq_config, - CalibrationDataReader, - autotune, - get_all_config_set, -) - -__all__ = [ - "register_algo", - "rtn_quantize_entry", - "RTNConfig", - "get_default_rtn_config", - "gptq_quantize_entry", - "GPTQConfig", - "get_default_gptq_config", - "awq_quantize_entry", - "AWQConfig", - "get_default_awq_config", - "smooth_quant_entry", - "SmoohQuantConfig", - "get_default_sq_config", - "CalibrationDataReader", - "QuantType", - "QuantFormat", - "CalibrationMethod", - "autotune", - "get_all_config_set", -] diff --git a/neural_compressor/onnxrt/algorithms/__init__.py b/neural_compressor/onnxrt/algorithms/__init__.py deleted file mode 100644 index c1d38b1844c..00000000000 --- a/neural_compressor/onnxrt/algorithms/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from neural_compressor.onnxrt.algorithms.smoother import Smoother -from neural_compressor.onnxrt.algorithms.weight_only.rtn import apply_rtn_on_model -from neural_compressor.onnxrt.algorithms.weight_only.gptq import apply_gptq_on_model -from neural_compressor.onnxrt.algorithms.weight_only.awq import apply_awq_on_model -from neural_compressor.onnxrt.algorithms.layer_wise import layer_wise_quant - -__all__ = ["Smoother", "apply_rtn_on_model", "apply_gptq_on_model", "apply_awq_on_model", "layer_wise_quant"] diff --git a/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py b/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py deleted file mode 100644 index 86c5371fbb3..00000000000 --- a/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from neural_compressor.onnxrt.algorithms.layer_wise.core import layer_wise_quant - -__all__ = ["layer_wise_quant"] diff --git a/neural_compressor/onnxrt/algorithms/layer_wise/core.py b/neural_compressor/onnxrt/algorithms/layer_wise/core.py deleted file mode 100644 index a3eacb6ebc9..00000000000 --- a/neural_compressor/onnxrt/algorithms/layer_wise/core.py +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from copy import deepcopy -from pathlib import Path -from typing import Callable, List, Union - -import onnx -import onnxruntime as ort -import transformers - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import check_model_with_infer_shapes - -logger = Logger().get_logger() - -__all__ = [ - "layer_wise_quant", -] - - -def layer_wise_quant( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - quant_func: Callable, - weight_config: dict, - data_reader: CalibrationDataReader = None, - *args, - **kwargs -) -> ONNXModel: - """Quantize model layer by layer to save memory. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_func (Callable): quantization algo function. - weight_config (dict): quantization config. - data_reader (CalibrationDataReader, optional): data_reader for calibration. Defaults to None. 
- - Returns: - _type_: _description_ - """ - # check whether model shape is inferred - if not check_model_with_infer_shapes(model): - logger.error( - "Before applying layer-wise quantization, please make sure to " - "run symbolic shape inference on your model like follows:\n" - "import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer\n" - "model = onnx.load(your_model_path)\n" - "out = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)\n" - "onnx.save(out, infer_shape_model_path)\n" - ) - raise ValueError("Fail to run layer-wise quantization.") - - if not isinstance(model, ONNXModel): - model = ONNXModel(model, ignore_warning=True, load_external_data=False) - - origin_model = deepcopy(model) - - providers = kwargs.get("providers", ["CPUExecutionProvider"]) - - # get and check split nodes - split_nodes = origin_model.find_split_nodes() - if len(split_nodes) == 0: - logger.error( - "Can't find split nodes for layer-wise quantization. " - "We recommend applying graph optimization for your model like follows: \n" - "import onnxruntime as ort \n" - "sess_options = ort.SessionOptions() \n" - "sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED " - "# or ORT_ENABLE_BASIC \n" - "sess_options.optimized_model_filepath = 'optimized_model_path' \n" - "ort.InferenceSession(infer_shape_model_path, sess_options)" - ) - raise ValueError("Fail to run layer-wise quantization.") - logger.info( - "Will split model into {} parts to do layer-wise quantization".format( - len([node.name for node in split_nodes]) + 1 - ) - ) - logger.debug( - "Will split model with these nodes for layer-wise quantization: {}".format([node.name for node in split_nodes]) - ) - - split_idx = 1 - model_to_split = [origin_model] - quantized_model_merged = None - - require_data_reader = data_reader is not None - if require_data_reader: - lwq_data_reader = [data_reader] - - while len(model_to_split) != 0: - # prepare model, node and data_reader for current split - split_model = model_to_split.pop(0) - split_node = split_nodes.pop(0) - if require_data_reader: - current_data_reader = lwq_data_reader.pop(0) - - # if no remaining split nodes, it means this is the last split, and the two split models will be saved. 
- save_both_split_models = True if len(split_nodes) == 0 else False - - # split model with given split node - split_model_part_1, split_model_part_2 = split_model.split_model_with_node( - split_node.name, model.model_path, save_both_split_models - ) - if not save_both_split_models: - # append split_model_part_2 to do next split - model_to_split.append(split_model_part_2) - - logger.info("Quantize split model {}".format(split_idx)) - if require_data_reader: - # process data_reader for current split and next split - current_data_reader = _filter_data_reader_for_current_split_model( - split_model_part_1.model, current_data_reader - ) - next_data_reader = _prepare_data_reader_for_next_split_model( - split_model_part_1.model_path, current_data_reader, providers - ) - lwq_data_reader.append(next_data_reader) - - # perform quantization - split_model_part_1_quantized = quant_func( - split_model_part_1, - weight_config=weight_config, - data_reader=current_data_reader, - return_modelproto=False, - **kwargs - ) - else: - # perform quantization - split_model_part_1_quantized = quant_func( - split_model_part_1, weight_config=weight_config, return_modelproto=False, **kwargs - ) - - # check split model is valid - try: - ort.InferenceSession(split_model_part_1_quantized.model.SerializeToString(), providers=providers) - except Exception as e: - logger.error( - "Layer-wise quantized model {} can't be inferred correctly. " - "Please check the raise exception".format(split_idx) - ) - raise e - - # merge split quantized model - if quantized_model_merged is None: - quantized_model_merged = split_model_part_1_quantized - quantized_model_merged.write_external_data_to_new_location(overwrite=True) - else: - quantized_model_merged.merge_split_models(split_model_part_1_quantized) - - split_idx += 1 - # if this is the last split, quantize the last split model - if save_both_split_models: - logger.info("Quantize split model {}".format(split_idx)) - - # quantize split model - if require_data_reader: - # process data_reader for current split - current_data_reader = lwq_data_reader.pop(0) - current_data_reader = _filter_data_reader_for_current_split_model( - split_model_part_2.model, current_data_reader - ) - - # perform quantization - split_model_part_2_quantized = quant_func( - split_model_part_2, - weight_config=weight_config, - data_reader=current_data_reader, - return_modelproto=False, - **kwargs - ) - else: - # perform quantization - split_model_part_2_quantized = quant_func( - split_model_part_2, weight_config=weight_config, return_modelproto=False, **kwargs - ) - - # check split model is valid - try: - ort.InferenceSession(split_model_part_2_quantized.model.SerializeToString(), providers=providers) - except Exception as e: - logger.error( - "Layer-wise quantized model {} can't be inferred correctly. 
" - "Please check the raise exception".format(split_idx) - ) - raise e - - # merge split quantized model - if quantized_model_merged is None: - quantized_model_merged = split_model_part_2_quantized - quantized_model_merged.write_external_data_to_new_location(overwrite=True) - else: - quantized_model_merged.merge_split_models(split_model_part_2_quantized) - - # reload external data to prevent external data file path errors - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(quantized_model_merged.model, os.path.dirname(quantized_model_merged.model_path)) - - return quantized_model_merged - - -class DataReader(CalibrationDataReader): - """Data reader for layer-wise quantization.""" - - def __init__(self, data_list): - self.data_list = data_list - self.iter_next = iter(self.data_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.data_list) - - -def _filter_data_reader_for_current_split_model(model: onnx.ModelProto, data_reader: CalibrationDataReader): - """Filter data reader to remove data that is not in model input. - - Args: - model (onnx.ModelProto): onnx model. - data_reader (CalibrationDataReader): data reader. - - Returns: - CalibrationDataReader: filtered data reader. - """ - filter_inputs = [] - input_names = [input.name for input in model.graph.input] - while True: - inputs = data_reader.get_next() - if not inputs: - break - filter_input = { - input_name: input_tensor for input_name, input_tensor in inputs.items() if input_name in input_names - } - filter_inputs.append(filter_input) - return DataReader(filter_inputs) - - -def _prepare_data_reader_for_next_split_model( - model_path: str, - data_reader: CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], -): - """Prepare data reader for next split model. - - Get data output of current split model and save for next split model. - - Args: - model (str): path to onnx model. - data_reader (CalibrationDataReader): data reader - providers (List[str], optional): providers to use. Defaults to ["CPUExecutionProvider"]. - - Returns: - CalibrationDataReader: data reader for next split model. - """ - data_reader = deepcopy(data_reader) - - data_reader_for_next_split_model = [] - session = ort.InferenceSession(model_path, providers=providers) - output_names = [output.name for output in session.get_outputs()] - while True: - inputs = data_reader.get_next() - if not inputs: - break - out = session.run(None, inputs) - inputs.update({name: value for name, value in zip(output_names, out)}) - data_reader_for_next_split_model.append(inputs) - return DataReader(data_reader_for_next_split_model) diff --git a/neural_compressor/onnxrt/algorithms/smoother/__init__.py b/neural_compressor/onnxrt/algorithms/smoother/__init__.py deleted file mode 100644 index 2e76dc06aee..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from neural_compressor.onnxrt.algorithms.smoother.core import Smoother - -__all__ = ["Smoother"] diff --git a/neural_compressor/onnxrt/algorithms/smoother/calibrator.py b/neural_compressor/onnxrt/algorithms/smoother/calibrator.py deleted file mode 100644 index ddf009ea829..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/calibrator.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Calibration for smooth quant.""" - -import sys -import tempfile -from importlib.util import find_spec -from pathlib import Path -from typing import List - -import numpy as np -import onnx -import onnx.numpy_helper as numpy_helper -import onnxruntime - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel - -logger = Logger().get_logger() - -__all__ = ["Calibrator"] - - -class Calibrator: - """Dump information for smooth quant.""" - - def __init__( - self, - model: ONNXModel, - dataloader: CalibrationDataReader, - iterations: List[int] = [], - providers: List[str] = ["CPUExecutionProvider"], - **kwargs, - ): - """Initialize a Calibrator to dump information. - - Args: - model (ONNXModel): ONNXModel object. - dataloader (CalibrationDataReader): user implemented object to read in and preprocess calibration dataset. - iterations (List[int], optional): tensor of which iteration will be collected. Defaults to []. - providers (List[str], optional): execution provider for onnxruntime. Defaults to ["CPUExecutionProvider"]. - """ - self.model_wrapper = model - self.dataloader = dataloader - self.augmented_model = None - self.iterations = iterations - self.providers = providers - - def _check_is_group_conv(self, node): - """Check the op is group wised or not(depthwise conv is excluded,return false). - - Args: - node: The op node - - Returns: - Bool: group wised True, otherwise False, depthwise False - """ - name_to_indices = {} - for index, i in enumerate(self.model_wrapper.initializer()): - name_to_indices[i.name] = index - - if node.op_type == "Conv": - group = 1 - for attr in node.attribute: - if hasattr(attr, "name"): - if attr.name == "group": - group = attr.i - break - # currently only normal conv and depthwise conv are supported - if group > 1: # group conv, need to check depthwise or not - weight_name = node.input[1] - weight_shape = numpy_helper.to_array( - self.model_wrapper.initializer()[name_to_indices[weight_name]] - ).shape - input_channel = weight_shape[1] - if input_channel != 1: # TODO: need to double check - return True - return False - - def _get_input_tensor_of_ops(self, op_types: List[str] = ["MatMul", "Gemm", "Conv", "FusedConv"]): - """Traverse the graph and get all the data tensors flowing into layers of {op_types}. - - Group conv is excluded. 
- # TODO: the tensors could be set/filtered in configuration. - - Args: - op_types (List[str], optional): The op types whose input tensor will be dumped. - Defaults to ["MatMul", "Gemm", "Conv", "FusedConv"]. - - Returns: - dict: A dict of dumped tensor to node info - """ - tensors_to_node = {} - initializers = {i.name: i for i in self.model_wrapper.initializer()} - - for node in self.model_wrapper.nodes(): - if len(op_types) == 0 or node.op_type in op_types: - if node.op_type in ["Conv", "FusedConv"] and self._check_is_group_conv(node): - continue - # also need to check whether the layer has weight - if len(node.input) >= 2 and node.input[1] in initializers.keys(): - tensors_to_node.setdefault(node.input[0], []).append([node.name, node.input, node.output]) - return tensors_to_node - - def _get_max_per_channel(self, datas, percentile): - """Get the max values per input channel. - - Args: - datas: The tensors - percentile: percentile of calibration to remove outliers - - Returns: - The max values per input channel - """ - permute_datas = [] - for data in datas: - if len(data.shape) == 3: # TODO: mammul batchsize*seq*inchannel, conv:batchsize*inchannle*f*f - tensor = np.abs(np.reshape(data, (-1, data.shape[-1]))) - permute_datas.append(tensor) - elif len(data.shape) == 4: - tensor = np.swapaxes(data, 1, -1) - tensor = np.abs(np.reshape(tensor, (-1, tensor.shape[-1]))) - permute_datas.append(tensor) - elif len(data.shape) == 2: - permute_datas.append(np.abs(data)) - else: - assert False, "not supported" - permute_datas = np.stack(permute_datas, axis=0) - permute_datas = permute_datas.reshape(-1, permute_datas.shape[-1]) - max_per_channels = np.percentile(permute_datas, percentile, axis=0) - max_per_channels = max_per_channels.astype(np.single) - return max_per_channels - - def get_intermediate_outputs(self): - so = onnxruntime.SessionOptions() - if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover - from onnxruntime_extensions import get_library_path - - so.register_custom_ops_library(get_library_path()) - - providers = self.providers if "TensorrtExecutionProvider" not in self.providers else ["CUDAExecutionProvider"] - if self.model_wrapper.is_large_model: # pragma: no cover - with tempfile.TemporaryDirectory(prefix="ort.calib.") as tmp_dir: - onnx.save_model( - self.model_wrapper.model, - Path(tmp_dir).joinpath("augment.onnx").as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - session = onnxruntime.InferenceSession( - Path(tmp_dir).joinpath("augment.onnx").as_posix(), so, providers=providers - ) - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model_wrapper.model, Path(tmp_dir).as_posix()) - else: - session = onnxruntime.InferenceSession( - self.model_wrapper.model.SerializeToString(), so, providers=providers - ) - node_output_names = [output.name for output in session.get_outputs()] - output_dicts = {} - input_name_to_nodes = self.model_wrapper.input_name_to_nodes() - output_name_to_node = self.model_wrapper.output_name_to_node() - name_to_node = {} - for data_name in node_output_names: - node = None - if data_name in output_name_to_node: - node = output_name_to_node[data_name] - elif data_name in input_name_to_nodes: - node = input_name_to_nodes[data_name][0] - assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) - name_to_node[data_name] = node.name - - def _collect_data(ort_inputs): - for 
output_idx, output in enumerate(session.run(None, ort_inputs)): - output_dicts.setdefault(node_output_names[output_idx], []).append(output) - - idx = 0 - while True: - inputs = self.dataloader.get_next() - if not inputs: - break - if self.iterations != []: - if idx > max(self.iterations): - break - if idx in self.iterations: - _collect_data(inputs) - else: - _collect_data(inputs) - idx += 1 - return output_dicts - - def calib_smooth(self, op_types, percentile: float = 99.999): - """Smooth model calibration. - - Mainly get the max info per channel of input tensors. - - Args: - op_types (_type_): The op types whose input tensor will be dumped. - percentile (float, optional): Percentile of calibration to remove outliers. - Defaults to 99.999. - - Returns: - max_vals_per_channel: max values per channel of input tensors - shape_infos: The shape information of input tensors - """ - logger.info("Start smooth model calibration.") - # add the input tensors of {op_types} to outputs of the model - tensors_to_node = self._get_input_tensor_of_ops(op_types) - self.model_wrapper.add_tensors_to_outputs(tensors_to_node.keys()) - output_dicts = self.get_intermediate_outputs() - - # remove the input tensors of {op_types} to outputs of the model - self.model_wrapper.remove_tensors_from_outputs(tensors_to_node.keys()) - max_vals_per_channel = {} - shape_infos = {} - - for key, val in tensors_to_node.items(): - max_val_per_channel = self._get_max_per_channel(output_dicts[key], percentile=percentile) - max_vals_per_channel[key] = max_val_per_channel - shape_infos[key] = output_dicts[key][0].shape - for item in val: - shape_infos[item[1][1]] = self.model_wrapper.get_initializer(item[1][1]).dims - return max_vals_per_channel, shape_infos, tensors_to_node diff --git a/neural_compressor/onnxrt/algorithms/smoother/core.py b/neural_compressor/onnxrt/algorithms/smoother/core.py deleted file mode 100644 index 50227bf3994..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/core.py +++ /dev/null @@ -1,668 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Smoother for onnxrt.""" - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -from onnx import helper, numpy_helper -from onnx import onnx_pb as onnx_proto - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.algorithms.smoother.calibrator import Calibrator -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - get_qrange_for_qType, - is_B_transposed, - quantize_data, - simple_progress_bar, -) - -logger = Logger().get_logger() - -__all__ = ["Smoother"] - -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - -def _get_quant_dequant_output(model, input_data, output_data, providers): - """Get loss between fp32 output and QDQ output. - - Args: - model (object): model - input_data (numpy.ndarray): fp32 input - output_data (numpy.ndarray): fp32 output - providers (list): execution provider - """ - import onnxruntime as ort - - input_data = _quant_dequant_data(input_data, 2, "asym") - sess = ort.InferenceSession(model.SerializeToString(), providers=providers) - preds = sess.run(None, {model.graph.input[0].name: input_data}) - loss = np.sum(np.abs(output_data - preds) ** 2) - return loss - - -def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): - """Build a model with the specific node. - - Args: - node (object): node - inits (list): initializer inputs of this node - input_data (numpy.ndarray): fp32 input - output_data (numpy.ndarray): fp32 output - opset (object): opset of the model - ir_version (object): ir_version of the model - """ - from onnx import helper - - input = helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) - graph = helper.make_graph([node], "sub_graph", [input], [output], inits) - model = helper.make_model(graph, opset_imports=opset) - model.ir_version = ir_version - return model - - -def _quant_dequant_data(data, qType=3, scheme="sym"): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - scheme (str): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = quantize_data( - data.flatten().tolist(), get_qrange_for_qType(qType, False), qType, scheme - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - -class Smoother: - """Fake input channel quantization. - - For more details please refer to: - [1] SmoothQuant: Accurate and Efficient - Post-Training Quantization for Large Language Models - [2] SPIQ: Data-Free Per-Channel Static Input Quantization - We only support inplace mode which means the model weights will be changed, - you can call recover function to recover the weights if needed. 
- """ - - def __init__( - self, - model: Union[onnx.ModelProto, ONNXModel, Path, str], - dataloader: CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], - ): - """Initialize the attributes of class.""" - self.model = model if isinstance(model, ONNXModel) else ONNXModel(model, load_external_data=True) - self.value_infos = {vi.name: vi for vi in self.model.model.graph.value_info} - self.value_infos.update({ot.name: ot for ot in self.model.model.graph.output}) - self.value_infos.update({it.name: it for it in self.model.model.graph.input}) - self.dataloader = dataloader - self.providers = providers - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_added_value_info = [] - self.new_init_tensors = [] # scales_tensor - self.scales_per_op = True - self.replace_input = [] - self.ops_to_absorb = [] - self.max_vals_per_channel = None - self.shape_info = None - self.tensors_to_node = None - self._build_absorb_function() - - def transform( - self, - alpha: Union[float, str] = 0.5, - folding: bool = True, - percentile: float = 99.999, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - scales_per_op: bool = True, - calib_iter: int = 100, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - *args, - **kwargs - ): - """The main entry of smooth quant. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - percentile (float, optional): percentile of calibration to remove outliers. - Defaults to 99.999. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy - False, ops with the same input will share a scale, mainly for performance. - Defaults to True. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - auto_alpha_args (_type_, optional): alpha args for auto smooth. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. 
- - Returns: - onnx.ModelProto: A FP32 model with the same architecture as the orig model - but with different weight which will be benefit to quantization - """ - self.scales_per_op = scales_per_op - self.clean() - if isinstance(alpha, float) and (alpha < 0 or alpha > 1): - logger.warning("alpha should be a float value in [0, 1] or 'auto' ") - if alpha < 0: - alpha = 0 - logger.warning("reset alpha to 0 ") - elif alpha > 1.0: - alpha = 1.0 - logger.warning("reset alpha to 1.0 ") - - self._dump_op_info(percentile, op_types, calib_iter) - - if alpha == "auto": - alpha = self._auto_tune_alpha(calib_iter, **auto_alpha_args) - - scales = self._get_smooth_scales(alpha) - self._insert_smooth_mul_op(scales) - self._adjust_weights(scales) - - self.model.add_nodes(self.new_added_mul_nodes) - self.model.model.graph.value_info.extend(self.new_added_value_info) - self.model.add_initializers(self.new_init_tensors) - for node, old_input_name, new_input_name in self.replace_input: - self.model.replace_node_input(node, old_input_name, new_input_name) - - self.model.update() - if folding: - self._fold_scale(scales) - self.model.topological_sort() - self.model.remove_unused_nodes() - return self.model.model - - def _dump_op_info(self, percentile, op_types, iterations): - """Dump op info for smooth quant. - - Args: - percentile (float): percentile of calibration to remove outliers - op_types (list): the op type to be smooth quantized - iterations (int): iterations - """ - calibrator = Calibrator( - self.model, - self.dataloader, - iterations=list(range(0, iterations)), - backend=self.providers, - ) - - self.max_vals_per_channel, self.shape_info, self.tensors_to_node = calibrator.calib_smooth(op_types, percentile) - for node in self.model.nodes(): - for out in node.output: - if ( - out in self.tensors_to_node - and node.op_type in self.could_absorb_optype - and self.model.get_initializer(node.input[1]) is not None - ): - self.ops_to_absorb.append(node.name) - - def recover(self): - """Recover the model weights.""" - for tensor_name, nodes in self.tensors_to_node.items(): - for node_info in nodes: - key = node_info[0] if self.scales_per_op else tensor_name - if key not in self.tensor_scales_info: - continue - input = node_info[1][1] - weight = numpy_helper.to_array( - self.model.get_initializer(input), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - scale = self.tensor_scales_info[key] - new_weight = weight * scale - self.model.set_initializer(input, new_weight) - - for node, old_input_name, new_input_name in self.replace_input: - self.model.replace_node_input(node, new_input_name, old_input_name) - - for value_info in self.new_added_value_info: - self.model.model.graph.value_info.remove(value_info) - - self.model.remove_nodes(self.new_added_mul_nodes) - self.model.remove_initializers(self.new_init_tensors) - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_init_tensors = [] - self.new_added_value_info = [] - self.replace_input = [] - - def clean(self): - """Clean data collected from calibration.""" - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_init_tensors = [] - self.new_added_value_info = [] - self.replace_input = [] - - def _build_absorb_function(self): - """Build function mapping for scale folding.""" - from onnx import numpy_helper - - def norm(node, scale): # pragma: no cover - for idx in [1, 2]: - tensor = self.model.get_initializer(node.input[idx]) - new_tensor = ( - numpy_helper.to_array(tensor, 
os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[idx], new_tensor) - self.tensor_scales_info[node.input[idx]] = ( - 1.0 / scale - if node.input[idx] not in self.tensor_scales_info - else self.tensor_scales_info[node.input[idx]] * 1.0 / scale - ) - return True - - def mul(node, scale): # pragma: no cover - if all([self.model.get_initializer(inp) is None for inp in node.input]): - return False - for inp in node.input: - if self.model.get_initializer(inp) is not None: - key = node.input[0].split("_smooth_output")[0] - tensor = self.model.get_initializer(inp) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(inp, new_tensor) - self.tensor_scales_info[key] = ( - 1.0 / scale - if key not in self.tensor_scales_info - else 1.0 / scale * self.tensor_scales_info[key] - ) - return True - - def conv(node, scale): # pragma: no cover - if len(node.input) > 2: - if self.model.get_initializer(node.input[2]) is not None: - tensor = self.model.get_initializer(node.input[2]) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[2], new_tensor) - self.tensor_scales_info[node.input[2]] = 1.0 / scale - scale = scale.reshape(-1, 1, 1, 1) - tensor = self.model.get_initializer(node.input[1]) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[1], new_tensor) - self.tensor_scales_info[node.input[1]] = ( - 1.0 / scale - if node.input[1] not in self.tensor_scales_info - else self.tensor_scales_info[node.input[1]] * 1.0 / scale - ) - return True - - self.could_absorb_optype = { - "LayerNormalization": norm, - "BatchNormalization": norm, - "InstanceNormalization": norm, - "SimplifiedLayerNormalization": mul, - "MatMul": mul, - "Gemm": mul, - "Conv": conv, - "FusedConv": conv, - "Mul": mul, - } - - def _fold_scale(self, scales): - """Absorb the scale to the operator at output channel. - - Args: - scales (dict): scales for smooth quant, {tensor_name: smooth quant scale} - """ - remove_nodes = [] - for node in self.model.nodes(): - if node.op_type == "Mul" and node.name.endswith("_smooth_mul") and node not in remove_nodes: - parent = self.model.get_parent(node, 0) - if parent is None: - continue - if parent.op_type in self.could_absorb_optype and len(self.model.get_children(parent)) == 1: - if node.output[0].split("_smooth_output")[0] in scales: - if self.could_absorb_optype[parent.op_type]( - parent, 1.0 / scales[node.output[0].split("_smooth_output")[0]] - ): - remove_nodes.append(node) - children = [i for i in self.model.nodes() if node.output[0] in i.input] - for child in children: - for idx, inp in enumerate(child.input): - if inp == node.output[0]: - child.input[idx] = node.input[0] - self.model.remove_nodes(remove_nodes) - - def _get_output_loss(self, node_name, scale, calib_iter): - """Get output loss of specific node after inserting QDQ pair. 
- - Args: - node_name (str): node name - scale (float): scale of the specific node - calib_iter (int): iterations - """ - import onnxruntime as ort - - node = [i for i in self.model.nodes() if i.name == node_name] - loss = 0 - if len(node) > 0: - node = node[0] - orig_outputs = self.model.output() - added_tensors = [node.input[0], node.output[0]] - self.model.add_tensors_to_outputs(added_tensors) - - session = ( - ort.InferenceSession(self.model.model_path + "_augment.onnx", providers=self.providers) - if self.model.is_large_model - else ort.InferenceSession(self.model.model.SerializeToString(), providers=self.providers) - ) - base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) - weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) - - self.model.set_initializer(node.input[1], weight_q) - inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] - - model = None - idx = 1 - while True: - inputs = self.dataloader.get_next() - if not inputs: - break - if idx > calib_iter: - break - - outputs = session.run(added_tensors, inputs) - if model is None: - model = _make_sub_graph( - node, - inits, - outputs[0], - outputs[1], - self.model.model.opset_import, - self.model.model.ir_version, - ) - loss += _get_quant_dequant_output(model, outputs[0] * scale, outputs[1], self.providers) - - self.model.remove_tensors_from_outputs([i for i in added_tensors if i not in orig_outputs]) - self.model.set_initializer(node.input[1], weight) - return loss - - def _reshape_scale_for_input(self, tensor, key): - """Reshape the scale for input feature in channel. - - Args: - tensor (str): tensor name - key (str): scale key of this tensor - """ - if len(self.shape_info[tensor]) == 4: - scale = np.reshape(self.tensor_scales_info[key], (1, self.tensor_scales_info[key].shape[1], 1, 1)) - else: - scale = np.reshape(self.tensor_scales_info[key], (1, self.tensor_scales_info[key].shape[0])) - return scale - - def _auto_tune_alpha( - self, - calib_iter, - alpha_min: float = 0.3, - alpha_max: float = 0.7, - alpha_step: float = 0.05, - attn_method: str = "min", - ): - """Perform alpha-tuning to obtain layer-wise optimal alpha values and adjust parameters accordingly. - - Args: - calib_iter (int): iterations - alpha_min (float): min value of alpha search space. - alpha_max (float): max value of alpha search space. - alpha_step (float): step size of alpha search space. - attn_method (str): criterion method used on attention ops; currently min, max and mean are supported. 
- """ - logger.info("auto tuning alpha") - - alpha_space = np.arange(alpha_min, alpha_max, alpha_step).tolist() - - optimal_alphas = {} - if self.model.is_large_model: - onnx.save_model( - self.model.model, - self.model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - location="weights.pb", - convert_attribute=False, - ) - - ## Searching optimal alphas - for tensor_name, node_infos in self.tensors_to_node.items(): - for node_info in node_infos: - loss_alpha = {} - key = node_info[0] if self.scales_per_op else tensor_name - node = self.model.get_node(node_info[0]) - for alpha in alpha_space: - scale = self._get_smooth_scales(alpha, [key]) - self._adjust_weights(scale) - input_scale = ( - self._reshape_scale_for_input(tensor_name, key) - if not (node.op_type == "Gemm" and is_B_transposed(node)) - else self.tensor_scales_info[key] - ) - loss = self._get_output_loss(node_info[0], input_scale, calib_iter) - loss_alpha[alpha] = loss - if key not in optimal_alphas: # Update alpha results - optimal_alphas[key] = alpha - else: - optimal_alphas[key] = ( - alpha - if optimal_alphas[key] in loss_alpha and loss < loss_alpha[optimal_alphas[key]] - else optimal_alphas[key] - ) - self.recover() - logger.info("auto tuning alpha done") - if self.model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model.model, os.path.split(self.model.model_path)[0]) - os.remove(self.model.model_path + "_augment.onnx") - os.remove(os.path.join(os.path.dirname(self.model.model_path), "weights.pb")) - return optimal_alphas - - def _get_smooth_scales(self, alpha, target_list=[]): - """Get the smooth scales for. - - The ops with the same input will share one mul layer. - TODO support individual scales for each layer. 
- - Args: - alpha: smooth alpha in paper - target_list: target objects to get scale, [] means get all scales - - Returns: - the smooth scales for weights, currently one input tensor only have one scale - """ - logger.info("Start smooth scales collection.") - scales = {} - for tensor, nodes in self.tensors_to_node.items(): - # if scales_per_op the key of scales is the node name, otherwise the activation of node - if self.scales_per_op: - for node_info in nodes: - node = self.model.get_node_by_weight(node_info[1][1]) - if len(target_list) > 0 and node_info[0] not in target_list: - continue - weight = numpy_helper.to_array( - self.model.get_initializer(node_info[1][1]), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and is_B_transposed(node) - ): - weight = np.moveaxis(weight, 0, 1) - specific_alpha = alpha[node_info[0]] if isinstance(alpha, dict) else alpha - scales[node_info[0]] = self._get_smooth_scale(weight, specific_alpha, tensor) - else: - if len(target_list) > 0 and tensor not in target_list: - continue - weights_in_channel_max = [] - for node_info in nodes: - node = self.model.get_node_by_weight(node_info[1][1]) - weight = numpy_helper.to_array( - self.model.get_initializer(node_info[1][1]), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and is_B_transposed(node) - ): - weight = np.moveaxis(weight, 0, 1) - weight = weight.reshape(weight.shape[0], -1) - cur_max = np.amax(weight, axis=-1) - weights_in_channel_max.append(cur_max) - weights_stack = np.stack(weights_in_channel_max, axis=-1) - specific_alpha = alpha[tensor] if isinstance(alpha, dict) else alpha - scales[tensor] = self._get_smooth_scale(weights_stack, specific_alpha, tensor) - - return scales - - def _get_smooth_scale(self, weights, specific_alpha, tensor): - """Get smooth scale for specific weight. - - Args: - weights (numpy.ndarray): weight data - specific_alpha (float): current alpha for this weights - tensor (str): tensor name - """ - weights = np.abs(weights.reshape(weights.shape[0], -1)) - weights_max = np.amax(weights, axis=-1) - input_power = np.power(self.max_vals_per_channel[tensor], specific_alpha) - weight_power = np.power(weights_max, 1 - specific_alpha) - weight_power = np.clip(weight_power, a_min=1e-5, a_max=None) - scale = np.clip(input_power / weight_power, a_min=1e-5, a_max=None) - return scale - - def _insert_smooth_mul_op(self, scales): - """Insert the Mul after inupt. - - The ops with the same input will share one mul layer. 
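# Illustrative sketch of the rewrite performed by _insert_smooth_mul_op (above) together with
# _adjust_weights (further down): a Mul divides the activation by the smooth scale while the
# same scale is folded into the weight initializer, so the float output is unchanged and only
# the activation range shrinks. Shapes and values are made up for the illustration.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8)) * 50.0        # activation with a large dynamic range
w = rng.normal(size=(8, 3)) * 0.1         # weight with a small dynamic range
alpha = 0.5
scale = np.clip(np.abs(x).max(axis=0) ** alpha, 1e-5, None) / \
        np.clip(np.abs(w).max(axis=1) ** (1 - alpha), 1e-5, None)

x_smoothed = x * (1.0 / scale)            # what the inserted "*_smooth_mul" node computes
w_adjusted = w * scale[:, None]           # what gets written back to the weight initializer
assert np.allclose(x @ w, x_smoothed @ w_adjusted)   # same output, smoother activations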
- - Args: - scales (dict): The smooth scales - """ - for key in scales.keys(): - input_name = key if not self.scales_per_op else self.model.get_node(key).input[0] - weight_name = ( - self.tensors_to_node[key][0][1][1] if not self.scales_per_op else self.model.get_node(key).input[1] - ) - scale_factor = 1.0 / scales[key] - if ( - len(self.shape_info[weight_name]) == 3 or len(self.shape_info[weight_name]) == 2 - ): # the last dim is input channel - pass - elif len(self.shape_info[weight_name]) == 4: - scale_factor = np.reshape(scale_factor, (1, -1, 1, 1)) - else: - assert False, "not support" - name = key + "_" + "smooth_scale" - scale_tensor = helper.make_tensor( - name=key + "_" + "smooth_scale", - data_type=onnx_proto.TensorProto.FLOAT, - dims=scale_factor.shape, - vals=scale_factor.flatten().tolist(), - ) - self.new_init_tensors.append(scale_tensor) - mul_output_name = key + "_smooth_output" - mul_node = helper.make_node( - "Mul", - inputs=[input_name, key + "_" + "smooth_scale"], - outputs=[mul_output_name], - name=key + "_smooth_mul", - ) - self.new_added_mul_nodes.append(mul_node) - if input_name in self.value_infos: - value_info = copy.deepcopy(self.value_infos[input_name]) - value_info.name = mul_node.output[0] - self.new_added_value_info.append(value_info) - if self.scales_per_op: - self.replace_input.append([self.model.get_node(key), input_name, mul_output_name]) - else: - for node_info in self.tensors_to_node[key]: - self.replace_input.append([self.model.get_node(node_info[0]), key, mul_output_name]) - - def _adjust_weights(self, scales): - """Adjust the weights with scale. - - Args: - scales (dict): The input scales - """ - for idx, (tensor_name, nodes) in enumerate(self.tensors_to_node.items()): - simple_progress_bar(len(self.tensors_to_node), idx + 1) - for node_info in nodes: - key = node_info[0] if self.scales_per_op else tensor_name - if key not in scales: - continue - input = node_info[1][1] - node = self.model.get_node_by_weight(input) - weight = numpy_helper.to_array( - self.model.get_initializer(input), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if len(weight.shape) == 2: - scale = ( - np.expand_dims(scales[key], axis=0) - if node.op_type == "Gemm" and is_B_transposed(node) - else np.expand_dims(scales[key], axis=-1) - ) - new_weight = weight * scale - elif len(weight.shape) == 4: # TODO need to check conv - node = self.model.get_node_by_weight(input) - if ( - weight.shape[1] == 1 - and "group" in [i.name for i in node.attribute] - and [i for i in node.attribute if i.name == "group"][0].i > 1 - ): - scale = np.reshape(scales[key], (-1, 1, 1, 1)) - else: - scale = np.reshape(scales[key], (1, -1, 1, 1)) - new_weight = weight * scale - else: - assert False, "not support" - self.tensor_scales_info[key] = 1.0 / scale - - new_tensor = numpy_helper.from_array(new_weight, input) - self.model.get_initializer(input).CopyFrom(new_tensor) diff --git a/neural_compressor/onnxrt/algorithms/weight_only/__init__.py b/neural_compressor/onnxrt/algorithms/weight_only/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/onnxrt/algorithms/weight_only/awq.py b/neural_compressor/onnxrt/algorithms/weight_only/awq.py deleted file mode 100644 index 647d0a9d25e..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/awq.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.algorithms.weight_only.rtn import rtn_quantize -from neural_compressor.onnxrt.algorithms.weight_only.utility import pad_tensor, prepare_inputs, qdq_tensor -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import AWQConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ONNXRT116_VERSION, ONNXRT1161_VERSION, dtype_mapping - -logger = Logger().get_logger() - -__all__ = ["apply_awq_on_model", "awq_quantize"] - - -def _get_weight_scale(weight, group_size): - """Get the scale of weight.""" - org_shape = weight.shape - weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight - scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0) - return scale - - -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): - """Apply scale for salient weight.""" - best_scales = {} - new_init_tensors = [] - new_added_mul_nodes = [] - replace_input = [] - updated_nodes = [] - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - - for parent, nodes in absorb_pairs.items(): - if any([node.input[0] not in output_dicts for node in nodes]): - logger.warning( - "Miss input tensors of nodes {} during AWQ, skip it!".format( - ", ".join([node.name for node in nodes if node.input[0] not in output_dicts]) - ) - ) - continue - inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) - inp_scale = np.mean(np.reshape(np.abs(inp), (-1, inp[0].shape[-1])), axis=0) - dtype = None - weight = [] - org_out = [] - for node in nodes: - if node.name in weight_config and weight_config.get(node.name, "fp32") != "fp32": - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - 
scheme = weight_config[node.name]["scheme"] - break - - # search scale - best_error = float("inf") - best_ratio = -1 - best_scale = None - n_grid = 20 - - for ratio in range(n_grid): - ratio = ratio * 1 / n_grid - loss = 0 - for node in nodes: - if weight_config.get(node.name, {}) == "fp32": - continue - - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) - if len(weight.shape) != 2: - continue - - org_out = np.matmul(inp, weight) - org_w_shape = weight.shape - group_size = group_size if group_size != -1 else org_w_shape[0] - - w_scale = _get_weight_scale(weight.T, weight.shape[0]) - scales = np.clip(np.power(inp_scale, ratio) / np.power(w_scale, (1 - ratio)), 1e-4, None) - scales = scales / np.sqrt(np.max(scales) * np.min(scales)) - weight = weight.T * scales - weight = pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T - - if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims( - scales, axis=-1 - ) - else: - q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "int") / np.expand_dims(scales, axis=-1) - - q_weight = np.reshape(q_weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - out = np.matmul(inp, q_weight.T) - loss += np.mean(np.power((org_out - out), 2)) - - is_best = loss < best_error - if is_best: - best_error = loss - best_ratio = ratio - best_scale = scales - - for node in nodes: - weight_config.setdefault(node.name, {}).update({"bits": num_bits}) - weight_config.setdefault(node.name, {}).update({"group_size": group_size}) - weight_config.setdefault(node.name, {}).update({"scheme": scheme}) - - init_share_num = model.get_initializer_share_num(node.input[1]) - weight_tensor = model.get_initializer(node.input[1]) - tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) - dtype = tensor.dtype - tensor = tensor.T * best_scale - tensor = (tensor.T).astype(dtype) - - new_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scaled", - data_type=dtype_mapping[str(dtype)], - dims=tensor.shape, - vals=tensor.tobytes(), - raw=True, - ) - model.add_initializer(new_tensor) - node.input[1] = new_tensor.name - - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - parent = model.get_node(parent) - if parent.name in updated_nodes: - continue - - if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] - ) == len(nodes): - for idx in [1, 2]: - tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(parent.input[idx], new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - elif ( - parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] - and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) - ): # pragma: no cover - for inp in parent.input: - if model.get_initializer(inp) is not 
None: - tensor = onnx.numpy_helper.to_array(model.get_initializer(inp), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(inp, new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( - nodes - ): # pragma: no cover - tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(parent.input[2], new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - else: # pragma: no cover - # insert mul - scale_tensor = onnx.helper.make_tensor( - name=parent.output[0] + "_weight_only_scale", - data_type=dtype_mapping[str(dtype)], - dims=best_scale.shape, - vals=(1.0 / best_scale).flatten().tolist(), - ) - new_init_tensors.append(scale_tensor) - mul_output_name = parent.output[0] + "_weight_only_out" - mul_node = onnx.helper.make_node( - "Mul", - inputs=[nodes[0].input[0], scale_tensor.name], - outputs=[mul_output_name], - name=nodes[0].input[0] + "_weight_only_mul", - ) - new_added_mul_nodes.append(mul_node) - for node in nodes: - replace_input.append([node, node.input[0], mul_node.output[0]]) - updated_nodes.append(parent.name) - output_dicts[mul_node.output[0]] = output_dicts[mul_node.input[0]] / np.reshape(best_scale, (1, -1)) - - model.add_nodes(new_added_mul_nodes) - model.add_initializers(new_init_tensors) - for node, old_input_name, new_input_name in replace_input: - model.replace_node_input(node, old_input_name, new_input_name) - - return model, output_dicts - - -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): - """Apply clip for weight by checking mse.""" - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - ratios = {} - for parent, nodes in absorb_pairs.items(): - if any([node.input[0] not in output_dicts for node in nodes]): - logger.warning( - "Miss input tensors of nodes {} during AWQ, skip it!".format( - ", ".join([node.name for node in nodes if node.input[0] not in output_dicts]) - ) - ) - continue - - inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) - - for node in nodes: - if node.name in weight_config: - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - scheme = weight_config[node.name]["scheme"] - - org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) - org_w_shape = org_weight.shape # ic, oc - group_size = group_size if group_size != -1 else org_w_shape[0] - org_out = np.matmul(inp, org_weight) # n_token, oc - - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - org_weight = pad_tensor(org_weight, group_size, k_blocks) - - org_weight = np.transpose(org_weight) - - best_error = float("inf") - best_ratio = 1 - for i_s in range(10): - ratio = 1 - i_s / 100 - weight = copy.deepcopy(org_weight) - if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # 
MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)) - else: - weight = qdq_tensor(weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1)) - weight = np.reshape(weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - cur_out = np.matmul(inp, weight.T) - loss = np.mean(np.power((org_out - cur_out), 2)) - is_best = loss < best_error - if is_best: - best_error = loss - best_ratio = ratio - ratios[node.input[1]] = best_ratio - return ratios - - -def awq_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - data_reader: CalibrationDataReader, - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], -) -> onnx.ModelProto: - """Quant the model with Activation-aware Weight quantization(AWQ) method. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - data_reader (CalibrationDataReader): data_reader for calibration. - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - } - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - enable_auto_scale (bool, optional): whether to search for best scales based on activation - distribution. Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - - Returns: - onnx.ModelProto: quantized onnx model. 
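# Illustrative sketch of the scale search that _apply_awq_scale performs for each absorbable
# group of MatMuls: walk a 20-point ratio grid, derive per-input-channel scales from activation
# and weight statistics, and keep the candidate minimizing the MSE against the float MatMul
# output (_apply_awq_clip then runs a similar search over clip ratios 1.00 down to 0.91).
# `qdq` below is an illustrative stand-in for qdq_tensor, not the module's implementation.
import numpy as np

def qdq(w, num_bits=4):
    # per-output-column asymmetric round-to-nearest quant-dequant, illustration only
    lo, hi = w.min(axis=0), w.max(axis=0)
    scale = np.where(hi > lo, (hi - lo) / (2 ** num_bits - 1), 1.0)
    zp = np.round(-lo / scale)
    return (np.clip(np.round(w / scale) + zp, 0, 2 ** num_bits - 1) - zp) * scale

def awq_scale_search(inp, weight, n_grid=20):
    # inp: (n_tokens, in_ch), weight: (in_ch, out_ch)
    org_out = inp @ weight
    inp_scale = np.mean(np.abs(inp), axis=0)                         # activation statistic
    w_scale = np.mean(np.abs(weight) / np.abs(weight).max(axis=0, keepdims=True), axis=1)
    best_scale, best_loss = None, float("inf")
    for i in range(n_grid):
        ratio = i / n_grid
        s = np.clip(inp_scale ** ratio / np.clip(w_scale ** (1 - ratio), 1e-4, None), 1e-4, None)
        s = s / np.sqrt(s.max() * s.min())                           # same normalization as above
        out = (inp / s) @ qdq(weight * s[:, None])                   # scale folded into the weight
        loss = np.mean((org_out - out) ** 2)
        if loss < best_loss:
            best_scale, best_loss = s, loss
    return best_scale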
- """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - output_dicts = {} - full_ratio = {} - - if enable_mse_search: - inputs, so = prepare_inputs(model, data_reader, providers) - del data_reader - - org_output = copy.deepcopy(model.model.graph.output) - model.remove_tensors_from_outputs([i.name for i in org_output]) - - output_names = [] - for node in model.nodes(): - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - output_names.append(node.input[0]) - output_names = list(set(output_names)) - model.add_tensors_to_outputs(output_names) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - session = ( - ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) - if not model.is_large_model - else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) - ) - - for input_name in output_names: - parent = model.output_name_to_node()[input_name] - dump_pairs = {parent.name: []} - - for node in model.input_name_to_nodes()[input_name]: - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - dump_pairs[parent.name].append(model.get_node(node.name)) - - if len(dump_pairs[parent.name]) == 0: - continue - - output_dicts = {} - for inp in inputs: - output = session.run([input_name], inp) - output_dicts.setdefault(input_name, []).append(output) - - if enable_auto_scale: - model, output_dicts = _apply_awq_scale( - model, - weight_config, - dump_pairs, - output_dicts, - num_bits, - group_size, - scheme, - ) - if enable_mse_search: - ratios = _apply_awq_clip( - model, - weight_config, - dump_pairs, - output_dicts, - num_bits, - group_size, - scheme, - ) - del output_dicts - del dump_pairs - full_ratio.update(ratios) - - model.remove_tensors_from_outputs(output_names) - model.model.graph.output.MergeFrom(org_output) - model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level, providers) - return model - - -def apply_awq_on_model( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - quant_config: dict, - calibration_data_reader: CalibrationDataReader, -) -> onnx.ModelProto: - """Apply Activation-aware Weight quantization(AWQ) on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): nnx model. - quant_config (dict): quantization config. - calibration_data_reader (CalibrationDataReader): data_reader for calibration. - - Returns: - onnx.ModelProto: quantized onnx model. 
- """ - # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) diff --git a/neural_compressor/onnxrt/algorithms/weight_only/gptq.py b/neural_compressor/onnxrt/algorithms/weight_only/gptq.py deleted file mode 100644 index 5a8985f1b0f..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/gptq.py +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.algorithms.weight_only.utility import ( - make_matmul_weight_only_node, - pad_tensor, - prepare_inputs, - quant_tensor, -) -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import GPTQConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - ONNXRT116_VERSION, - ONNXRT1161_VERSION, - dtype_mapping, - simple_progress_bar, -) - -__all__ = [ - "apply_gptq_on_model", - "gptq_quantize", -] - - -def _gptq( - W: np.array, - H: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - blocksize: int = 128, - percdamp: float = 0.01, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, -): - """Quant the weight with GPTQ method. - - Args: - W (np.array): weight. - H (np.array): Hessian matrix. - num_bits (int, optional): num_bits. Default is 4. - group_size (int, optional): how many elements share one scale/zp. Default is 32. - scheme (str, optional): sym or asym. Defaults to "asym". - blocksize (int, optional): blocksize to quantize weight. - percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. - actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. - mse (bool, optional): whether get scale and zero point with mse error. - perchannel (bool, optional): whether quantize weight per-channel. 
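# Illustrative sketch of how the Hessian handed to _gptq is built up in gptq_quantize's
# calibration loop further down: H is roughly 2/N * sum(x x^T) over calibration tokens,
# maintained as a running average so batches can be streamed. Names are illustrative stand-ins.
import numpy as np

def update_hessian(H, n_seen, batch):
    # batch: (n_tokens, in_ch) activations feeding the MatMul being quantized
    t = batch.shape[0]
    H = H * (n_seen / (n_seen + t))          # down-weight what has been accumulated so far
    x = np.sqrt(2.0 / (n_seen + t)) * batch
    return H + x.T @ x, n_seen + t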
- - Returns: - Q: fake quantized weight - """ - Qs = [] - maxq = 2**num_bits - 1 - grid = 100 - maxshrink = 0.8 - norm = 2.4 - - def find_params(weight): - org_shape = weight.shape - # find zp, scale - if not perchannel: - weight = np.expand_dims(weight.flatten(), axis=1) - tmp = np.zeros(weight.shape[1]) - xmin = np.minimum(np.min(weight, axis=0), tmp) - xmax = np.maximum(np.max(weight, axis=0), tmp) - if scheme == "sym": - xmax = np.maximum(np.abs(xmin), xmax) - tmp = xmin < 0 - if np.any(tmp): - xmin[tmp] = -xmax[tmp] - tmp = (xmin == 0) & (xmax == 0) - xmin[tmp] = -1 - xmax[tmp] = +1 - - scale = (xmax - xmin) / maxq - if scheme == "sym": - zero = np.ones(scale.shape) * (maxq + 1) / 2 - else: - zero = np.round(-xmin / scale) - if mse: - best = np.ones([weight.shape[1]]) * float("inf") - for i in range(int(maxshrink * grid)): - p = 1 - i / grid - xmin1 = p * xmin - xmax1 = p * xmax - scale1 = (xmax1 - xmin1) / maxq - zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero - q = np.clip(np.round(weight / scale1) + zero1, 0, maxq) - q -= weight - q = np.power(np.abs(q), norm) - err = np.sum(q, 0) - tmp = err < best - if np.any(tmp): - best[tmp] = err[tmp] - scale[tmp] = scale1[tmp] - zero[tmp] = zero1[tmp] - if not perchannel: - tmp = org_shape[1] - scale = np.repeat(scale, tmp) - zero = np.repeat(zero, tmp) - shape = [-1] + [1] * (len(org_shape) - 1) - scale = np.reshape(scale, shape) - zero = np.reshape(zero, shape) - return scale, zero - - scales = [] - zps = [] - shape = W.shape - scale, zp = find_params(W) - dead = np.diag(H) == 0 - H[dead, dead] = 1 - W[dead, :] = 0 # such channel makes no contribution to quantization computation - - # rearrange considering the diag's value - if actorder: - perm = np.argsort(np.diag(H))[::-1] - W = W[perm, :] - H = H[perm, :][:, perm] - Losses = np.zeros(W.shape) - Q = np.zeros(W.shape) - damp = percdamp * np.mean(np.diag(H)) - diag = np.arange(shape[0]) - H[diag, diag] += damp # add a average value of - H = np.linalg.cholesky(np.linalg.inv(H)).T - Hinv = H - for i1 in range(0, shape[0], blocksize): - i2 = min(i1 + blocksize, shape[0]) - count = i2 - i1 - - W1 = copy.deepcopy(W[i1:i2, :]) - Q1 = np.zeros(W1.shape) - Err1 = np.zeros(W1.shape) - Losses1 = np.zeros(W1.shape) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): # within a block, channel wise - w = W1[i, :] - d = Hinv1[i, i] - - if group_size != -1: - if (i1 + i) % group_size == 0: - scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :]) - - q = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten() - Q1[i, :] = q - Losses1[i, :] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0)) - Err1[i, :] = err1 - - Q[i1:i2, :] = Q1 - Losses[i1:i2, :] = Losses1 / 2 - - W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1) - - if actorder: - invperm = np.argsort(perm) - Q = Q[invperm, :] - - Q = np.reshape(Q, W.shape) - del W - return Q - - -def gptq_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - data_reader: CalibrationDataReader, - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - return_modelproto: bool = True, -): - """Quant the model with GPTQ method. 
- - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - data_reader (CalibrationDataReader): data_reader for calibration. - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. - Default to True - - Returns: - onnx.ModelProto: quantized onnx model - """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - - inputs, so = prepare_inputs(model, data_reader, providers) - del data_reader - org_output = copy.deepcopy(model.model.graph.output) - model.remove_tensors_from_outputs([i.name for i in org_output]) - output_names = [] - for node in model.nodes(): - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - output_names.append(node.input[0]) - output_names = list(set(output_names)) - model.add_tensors_to_outputs(output_names) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - session = ( - ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) - if not model.is_large_model - else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) - ) - - for idx, input_name in enumerate(output_names): - simple_progress_bar(len(output_names), idx + 1) - node_list = [] - weights = [] - - for node in model.input_name_to_nodes()[input_name]: - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - weight = onnx.numpy_helper.to_array( - 
model.get_initializer(model.get_node(node.name).input[1]), base_dir - ).copy() - if len(weight.shape) != 2: - continue - - weights.append(weight) - node_list.append(model.get_node(node.name)) - - if len(weights) == 0: - continue - - Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] - nsamples = 0 - for data in inputs: - inp = session.run([input_name], data)[0] - tmp = inp.shape[0] - inp = np.reshape(inp, (-1, inp.shape[-1])) - Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] - nsamples += tmp - inp = np.sqrt(2 / nsamples) * inp - Hs = [i + np.matmul(inp.T, inp) for i in Hs] - - for ( - node, - weight, - H, - ) in zip(node_list, weights, Hs): - if node.name in weight_config: - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - scheme = weight_config[node.name]["scheme"] - accuracy_level = weight_config[(node.name, node.op_type)].accuracy_level - group_size = group_size if group_size != -1 else weight.shape[0] - dtype = weight.dtype - - q_weight = _gptq( - weight, - H, - num_bits=num_bits, - group_size=group_size, - scheme=scheme, - blocksize=blocksize, - percdamp=percdamp, - actorder=actorder, - mse=mse, - perchannel=perchannel, - ) - - weight_tensor = model.get_initializer(node.input[1]) - init_share_num = model.get_initializer_share_num(node.input[1]) - - satisfy_MatMulNBits_condition = Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - org_shape = weight.shape - k_blocks = (org_shape[0] + group_size - 1) // group_size - q_weight = pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint") - q_matmul_node, new_inits = make_matmul_weight_only_node( - node=node, - weight_shape=org_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight.astype("uint8"), - scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - model.remove_node(node) - model.add_node(q_matmul_node) - else: - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=dtype_mapping[str(dtype)], - dims=q_weight.shape, - vals=q_weight.astype(dtype).tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - model.remove_tensors_from_outputs(output_names) - model.model.graph.output.MergeFrom(org_output) - - model.topological_sort() - - # reload external data to prevent external data file path errors - if model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(model.model, os.path.split(model.model_path)[0]) - - if return_modelproto: - return model.model - else: - return model - - -def apply_gptq_on_model( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - 
quant_config: dict, - calibration_data_reader: CalibrationDataReader, -) -> onnx.ModelProto: - """Apply GPTQ on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_config (dict): quantization config. - calibration_data_reader (CalibrationDataReader): data_reader for calibration. - - Returns: - onnx.ModelProto: quantized onnx model. - """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - - if layer_wise: - from neural_compressor.onnxrt.algorithms import layer_wise_quant - - quantized_model = layer_wise_quant( - model, - quant_func=gptq_quantize, - weight_config=quant_config, - data_reader=calibration_data_reader, - **quant_kwargs - ) - else: - quantized_model = gptq_quantize( - model, data_reader=calibration_data_reader, weight_config=quant_config, **quant_kwargs - ) - - if isinstance(quantized_model, ONNXModel): - quantized_model = quantized_model.model - return quantized_model diff --git a/neural_compressor/onnxrt/algorithms/weight_only/rtn.py b/neural_compressor/onnxrt/algorithms/weight_only/rtn.py deleted file mode 100644 index c4ee941bf17..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/rtn.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.algorithms.weight_only.utility import ( - make_matmul_weight_only_node, - pad_tensor, - qdq_tensor, - quant_tensor, -) -from neural_compressor.onnxrt.quantization.config import RTNConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - ONNXRT116_VERSION, - ONNXRT1161_VERSION, - dtype_mapping, - simple_progress_bar, -) - -__all__ = ["apply_rtn_on_model", "rtn_quantize"] - - -def rtn_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - ratios: dict = {}, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - return_modelproto: bool = True, -): - """Quantize the model with round to nearst method. 
- - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - } - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. - Default to True - Returns: - onnx.ModelProto: quantized onnx model. - """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - new_nodes = [] - remove_nodes = [] - total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]]) - curr_id = 0 - for node in model.nodes(): - if node.op_type in ["MatMul"]: - curr_id += 1 - simple_progress_bar(total_num, curr_id) - - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] # check op_type of node is MatMul - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - weight_tensor = model.get_initializer(node.input[1]) - weight = onnx.numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() - if len(weight.shape) != 2: - continue - - dtype = weight.dtype - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) - - org_w_shape = weight.shape # ic, oc - group_size = group_size if group_size != -1 else org_w_shape[0] - - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - init_share_num = model.get_initializer_share_num(node.input[1]) - - weight = pad_tensor(weight, group_size, k_blocks) - - satisfy_MatMulNBits_condition = Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - q_weight, scale, zp = quant_tensor( - weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) - ) - q_matmul_node, 
new_inits = make_matmul_weight_only_node( - node=node, - weight_shape=org_w_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight.astype("uint8"), - scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - remove_nodes.append(node) - new_nodes.append(q_matmul_node) - else: - q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1)) - q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) - q_weight = np.transpose(q_weight) - q_weight = q_weight[: org_w_shape[0], :].astype(dtype) - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=dtype_mapping[str(dtype)], - dims=weight.shape, - vals=q_weight.tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - model.add_nodes(new_nodes) - model.remove_nodes(remove_nodes) - model.topological_sort() - - # reload external data to prevent external data file path errors - if model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(model.model, os.path.split(model.model_path)[0]) - - if return_modelproto: - return model.model - else: - return model - - -def apply_rtn_on_model(model: Union[onnx.ModelProto, ONNXModel, Path, str], quant_config: dict) -> onnx.ModelProto: - """Apply RTN on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_config (dict): quantization config. - - Returns: - onnx.ModelProto: quantized onnx model. - """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, RTNConfig): - quant_config[op_name_type] = op_config.to_dict() - - if layer_wise: - from neural_compressor.onnxrt.algorithms import layer_wise_quant - - quantized_model = layer_wise_quant(model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs) - else: - quantized_model = rtn_quantize(model, weight_config=quant_config, **quant_kwargs) - - if isinstance(quantized_model, ONNXModel): - quantized_model = quantized_model.model - return quantized_model diff --git a/neural_compressor/onnxrt/algorithms/weight_only/utility.py b/neural_compressor/onnxrt/algorithms/weight_only/utility.py deleted file mode 100644 index f69f8d57fab..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/utility.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import struct -import sys - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.utils.utility import ONNXRT1161_VERSION, dtype_mapping - -__all__ = [ - "make_matmul_weight_only_node", - "prepare_inputs", - "pad_tensor", - "quant_tensor", - "qdq_tensor", -] - - -def _get_blob_size(group_size, has_zp): # pragma: no cover - """Get blob_size. - - Args: - group_size (int): how many elements share one scale/zp - has_zp (bool): whether zero_point is None - """ - if Version(ort.__version__) > ONNXRT1161_VERSION: - blob_size = group_size // 2 - elif has_zp: - blob_size = group_size // 2 + 4 + 1 - else: - blob_size = group_size // 2 + 4 - return blob_size - - -def make_matmul_weight_only_node( - node: onnx.NodeProto, - weight_shape: tuple, - num_bits: int, - group_size: int, - k_blocks: int, - q_weight: np.array, - scale: np.array, - zero_point: np.array, - accuracy_level: int = 0, -): - """Build MatMulFpQ4/MatMulNBits node. - - Args: - node (onnx.NodeProto): original matmul node - weight_shape (tuple): original weight shape - num_bits (int): number of bits used to represent weights. - group_size (int): how many elements share one scale/zp - k_blocks (int): block number - q_weight (np.array): quantized weight - scale (np.array): scale - zero_point (np.array): zero point - accuracy_level (int, optional): accuracy level. - Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel) Defaults to 0. - - Returns: - matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node - new_inits: initializers of the new node - """ - blob_size = _get_blob_size(group_size, zero_point is not None) - packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") - q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) - input_names = [node.input[0], q_weight_name] - new_inits = [] - kwargs = {} - - if Version(ort.__version__) > ONNXRT1161_VERSION: - op_type = "MatMulNBits" - - # pack quantized weight - for i in range(q_weight.shape[0]): - for k in range(0, group_size, 2): - packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4 - packed = np.reshape(packed, (-1, k_blocks, blob_size)) - - # build scale tensor - scale = np.reshape(scale, (-1, k_blocks)) - scale_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scale", - data_type=dtype_mapping[str(scale.dtype)], - dims=scale.shape, - vals=scale.tobytes(), - raw=True, - ) - input_names.append(scale_tensor.name) - new_inits.append(scale_tensor) - - # build zero_point tensor - if zero_point is not None: - if num_bits > 4: - packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") - else: - packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") - for i in range(zero_point.shape[0] // k_blocks): - for j in range(k_blocks): - idx = i * k_blocks + j - zp = zero_point[idx] - packed_zp[idx // 2] = ( - ((packed_zp[idx // 2] & 0x0F) | (zp << 4)) - if (idx & 1) - else ((packed_zp[idx // 2] & 0xF0) | zp) - ) - - zp_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True - ) - input_names.append(zp_tensor.name) - new_inits.append(zp_tensor) - - # set kwargs - kwargs["K"] = weight_shape[0] - kwargs["N"] = weight_shape[1] - kwargs["bits"] = num_bits - 
kwargs["block_size"] = group_size - if accuracy_level > 0: - # require onnxruntime > 1.16.3 - kwargs["accuracy_level"] = accuracy_level - - else: - offset = 5 if zero_point is not None else 4 - op_type = "MatMulFpQ4" - - # pack quantized weight - for i in range(q_weight.shape[0]): - bf = struct.pack("f", scale[i]) - packed[i][0] = bf[0] - packed[i][1] = bf[1] - packed[i][2] = bf[2] - packed[i][3] = bf[3] - - if zero_point is not None: - packed[i][4] = zero_point[i] - - packed[i][offset:] = np.bitwise_or( - q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) - ) - packed = packed.reshape(-1) - - # build shape tensor - shape_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") - ) - new_inits.append(shape_tensor) - input_names.append(shape_tensor.name) - - # set kwargs - kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 - - q_weight_tensor = onnx.helper.make_tensor( - name=q_weight_name, - data_type=2, - dims=packed.shape, - vals=packed.tobytes(), - raw=True, - ) - new_inits.append(q_weight_tensor) - - matmul_weight_only_node = onnx.helper.make_node( - op_type, - inputs=input_names, - outputs=node.output, - name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), - domain="com.microsoft", - **kwargs, - ) - return matmul_weight_only_node, new_inits - - -def prepare_inputs(model, data_reader, providers): - """Prepare inputs for weight only quantization. - - Args: - model (ModelProto or ONNXModel): onnx model. - data_reader (CalibrationDataReader): a calibration data reader. - providers (list): providers to use. - - Returns: - inputs: prepared inputs. - so: session options - """ - from importlib.util import find_spec - - so = ort.SessionOptions() - if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover - from onnxruntime_extensions import get_library_path - - so.register_custom_ops_library(get_library_path()) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - inputs_list = [] - while True: - inputs = data_reader.get_next() - if not inputs: - break - inputs_list.append(inputs) - return inputs_list, so - - -def pad_tensor(weight, group_size, k_blocks): - """Pad tensor rowi so that it can be is divisible by group_size. - - Args: - weight (array): weight - group_size (int): how many elements share one scale/zp - k_blocks (int): the number of block - - Returns: - weight: paded weight - """ - if group_size == -1: - return weight - - org_w_shape = weight.shape - padded_rows = k_blocks * group_size - pad_len = padded_rows - org_w_shape[0] - - if pad_len > 0: - weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") - - return weight - - -def quant_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quantize tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 4. - scheme (str, optional): _quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. 
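# Illustrative worked example of quant_tensor's asymmetric path for num_bits=4: one group is
# mapped to the uint range [0, 15] with scale = (rmax - rmin) / 15 and
# zero_point = round(-rmin / scale). The numbers are made up for the illustration.
import numpy as np

group = np.array([-0.8, -0.1, 0.0, 0.5, 1.2, 2.4, 0.3, -0.4])
maxq, minq = 2 ** 4 - 1, 0                     # uint4 range
rmin, rmax = group.min(), group.max()          # -0.8 and 2.4
scale = (rmax - rmin) / (maxq - minq)          # 3.2 / 15
zero_point = np.round(-rmin / scale)           # round(0.8 / 0.2133...) = 4
q = np.clip(np.round(group / scale) + zero_point, minq, maxq)
dequant = scale * (q - zero_point)             # what qdq_tensor reconstructs from q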
- - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if scheme == "asym" or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif scheme == "sym": - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if scheme == "sym": - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - scale = np.ones(rmax.shape) - scale[max_range > 0] = np.array( - [float(i) / (maxq - minq) for i in (max_range[max_range > 0] * 2.0).flatten().tolist()] - ) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - return np.clip((data / scale + zero_point).round(), minq, maxq), scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - scheme (str, optional): quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) diff --git a/neural_compressor/onnxrt/quantization/__init__.py b/neural_compressor/onnxrt/quantization/__init__.py deleted file mode 100644 index b3ae15f6a19..00000000000 --- a/neural_compressor/onnxrt/quantization/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from neural_compressor.onnxrt.quantization.algorithm_entry import ( - smooth_quant_entry, - rtn_quantize_entry, - gptq_quantize_entry, - awq_quantize_entry, -) -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import ( - RTNConfig, - get_default_rtn_config, - GPTQConfig, - get_default_gptq_config, - AWQConfig, - get_default_awq_config, - SmoohQuantConfig, - get_default_sq_config, -) -from neural_compressor.onnxrt.quantization.autotune import autotune, get_all_config_set - -__all__ = [ - "smooth_quant_entry", - "rtn_quantize_entry", - "gptq_quantize_entry", - "awq_quantize_entry", - "RTNConfig", - "get_default_rtn_config", - "GPTQConfig", - "get_default_gptq_config", - "AWQConfig", - "get_default_awq_config", - "SmoohQuantConfig", - "get_default_sq_config", - "get_all_config_set", - "CalibrationDataReader", - "autotune", -] diff --git a/neural_compressor/onnxrt/quantization/algorithm_entry.py b/neural_compressor/onnxrt/quantization/algorithm_entry.py deleted file mode 100644 index f86e9791605..00000000000 --- a/neural_compressor/onnxrt/quantization/algorithm_entry.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
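[Reviewer note: the public names deleted here are the ones listed in the __all__ above; a short sketch of how that surface was imported before this patch (only the variable name config is illustrative). After this change these imports no longer resolve.]

    from neural_compressor.onnxrt.quantization import (
        RTNConfig,
        get_default_rtn_config,
        autotune,
    )

    config = get_default_rtn_config()   # same as RTNConfig() with its documented defaults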
- -import tempfile -from pathlib import Path -from typing import Union - -import onnx -from onnxruntime.quantization import quantize - -from neural_compressor.common import Logger -from neural_compressor.common.utils import AWQ, GPTQ, RTN, SMOOTH_QUANT -from neural_compressor.onnxrt.algorithms import Smoother -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import AWQConfig, GPTQConfig, RTNConfig, SmoohQuantConfig -from neural_compressor.onnxrt.utils.utility import register_algo - -logger = Logger().get_logger() - -__all__ = [ - "smooth_quant_entry", - "rtn_quantize_entry", - "gptq_quantize_entry", - "awq_quantize_entry", -] - - -###################### SmoothQuant Entry ################################## -@register_algo(name=SMOOTH_QUANT) -def smooth_quant_entry( - model: Union[Path, str], - quant_config: SmoohQuantConfig, - calibration_data_reader: CalibrationDataReader, - *args, - **kwargs -) -> onnx.ModelProto: - """Apply smooth quant.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - # smooth operation - calibration_data_reader.rewind() - smoother = Smoother( - model, - calibration_data_reader, - providers=quant_config.providers, - ) - smoothed_model = smoother.transform(**quant_config.to_dict()) - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # ORT quant API requires str input - onnx.save_model( - smoothed_model, - Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location="smooth.onnx_data", - size_threshold=1024, - convert_attribute=False, - ) - - # quant operation - calibration_data_reader.rewind() - - # exclude Mul operations which are inserted during smooth operation - excluded_nodes = [i.name for i in smoothed_model.graph.node if i.name.endswith("_smooth_mul")] - quant_config.calibration_data_reader = calibration_data_reader - quant_config.nodes_to_exclude.extend(excluded_nodes) - quant_config.convert_to_ort_config() - - quantize( - Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - Path(tmp_dir).joinpath("quant_model.onnx").as_posix(), - quant_config, - ) - model = onnx.load(Path(tmp_dir).joinpath("quant_model.onnx").as_posix()) - - return model - - -###################### RTN Algo Entry ################################## -@register_algo(name=RTN) -def rtn_quantize_entry(model: Union[Path, str], quant_config: RTNConfig, *args, **kwargs) -> onnx.ModelProto: - """The main entry to apply rtn quantization.""" - from neural_compressor.onnxrt.algorithms import apply_rtn_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - model = apply_rtn_on_model(model, configs_mapping) - return model - - -###################### GPTQ Algo Entry ################################## -@register_algo(name=GPTQ) -def gptq_quantize_entry( - model: Union[Path, str], quant_config: GPTQConfig, calibration_data_reader: CalibrationDataReader, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply gptq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, 
CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - from neural_compressor.onnxrt.algorithms import apply_gptq_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = apply_gptq_on_model(model, configs_mapping, calibration_data_reader) - return model - - -###################### AWQ Algo Entry ################################## -@register_algo(name=AWQ) -def awq_quantize_entry( - model: Union[Path, str], quant_config: AWQConfig, calibration_data_reader: CalibrationDataReader, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply awq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - from neural_compressor.onnxrt.algorithms import apply_awq_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = apply_awq_on_model(model, configs_mapping, calibration_data_reader) - return model diff --git a/neural_compressor/onnxrt/quantization/autotune.py b/neural_compressor/onnxrt/quantization/autotune.py deleted file mode 100644 index 7cddcc3a8b3..00000000000 --- a/neural_compressor/onnxrt/quantization/autotune.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from pathlib import Path -from typing import Any, Callable, List, Optional, Tuple, Union - -import onnx - -from neural_compressor.common import logger -from neural_compressor.common.base_config import BaseConfig, get_all_config_set_from_config_registry -from neural_compressor.common.base_tuning import EvaluationFuncWrapper, TuningConfig, init_tuning -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import FRAMEWORK_NAME -from neural_compressor.onnxrt.quantization.quantize import _quantize - -__all__ = [ - "autotune", - "get_all_config_set", -] - - -def get_all_config_set() -> Union[BaseConfig, List[BaseConfig]]: - return get_all_config_set_from_config_registry(fwk_name=FRAMEWORK_NAME) - - -def autotune( - model_input: Union[Path, str], - tune_config: TuningConfig, - eval_fn: Callable, - eval_args: Optional[Tuple[Any]] = None, - calibration_data_reader: CalibrationDataReader = None, -) -> Union[None, onnx.ModelProto]: - """The main entry of auto-tune. 
- - Args: - model_input (Union[Path, str]): onnx model path. - tune_config (TuningConfig): tuning config. - TuningConfig is created with algorithm configs, parameters supported tuning are in their params_list. - Support: - Expand parameters to a list of parameters like TuningConfig(config_set=[RTNConfig(weight_bits=[4, 8])]) - Pass a list of configs like TuningConfig(config_set=[RTNConfig(), GPTQConfig()]) - eval_fn (Callable): evaluate function. - During evaluation, autotune will only pass model path as the input of function. - eval_args (Optional[Tuple[Any]]): evaluate arguments. - Positional arguments for `eval_fn`. - - calibration_data_reader (CalibrationDataReader): dataloader for calibration. - """ - best_quant_model = None - eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) - config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) - try: - baseline: float = eval_func_wrapper.evaluate(model_input) - except Exception as e: - print(e) - if "'str' object has no attribute 'SerializeToString'" in str(e): - logger.warning("Please refine your eval_fn to accept model path (str) as input.") - exit(0) - tuning_monitor.set_baseline(baseline) - tuning_logger.tuning_start() - for trial_index, quant_config in enumerate(config_loader): - if calibration_data_reader is not None: - calibration_data_reader.rewind() - tuning_logger.trial_start(trial_index=trial_index) - tuning_logger.execution_start() - logger.debug("quant config: {}".format(quant_config)) - q_model = _quantize(model_input, quant_config=quant_config, calibration_data_reader=calibration_data_reader) - tuning_logger.execution_end() - tuning_logger.evaluation_start() - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # evaluate API requires str input - onnx.save_model( - q_model, - Path(tmp_dir).joinpath(Path(model_input).name).as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location=Path(model_input).with_suffix(Path(model_input).suffix + "_data").name, - size_threshold=1024, - convert_attribute=False, - ) - # copy config.json to tmp dir for evaluation, LLMs evaluation may need it - if isinstance(model_input, str) and os.path.exists( - Path(model_input).parent.joinpath("config.json").as_posix() - ): - import shutil - - shutil.copyfile( - Path(model_input).parent.joinpath("config.json").as_posix(), - Path(tmp_dir).joinpath("config.json").as_posix(), - ) - eval_result: float = eval_func_wrapper.evaluate(Path(tmp_dir).joinpath(Path(model_input).name).as_posix()) - tuning_logger.evaluation_end() - logger.info("Evaluation result: %.4f", eval_result) - tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) - tuning_logger.trial_end(trial_index) - if tuning_monitor.need_stop(): - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - best_quant_model = _quantize( - model_input, quant_config=best_quant_config, calibration_data_reader=calibration_data_reader - ) - break - tuning_logger.tuning_end() - return best_quant_model diff --git a/neural_compressor/onnxrt/quantization/calibrate.py b/neural_compressor/onnxrt/quantization/calibrate.py deleted file mode 100644 index 1ba32672728..00000000000 --- a/neural_compressor/onnxrt/quantization/calibrate.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc - -from onnxruntime.quantization import CalibrationDataReader as ORTCalibrationDataReader - -__all__ = ["CalibrationDataReader"] - - -class CalibrationDataReader(ORTCalibrationDataReader): - """Get data for calibration. - - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ - - @abc.abstractmethod - def rewind(self): - """Regenerate data.""" - raise NotImplementedError diff --git a/neural_compressor/onnxrt/quantization/config.py b/neural_compressor/onnxrt/quantization/config.py deleted file mode 100644 index 88a0a56171f..00000000000 --- a/neural_compressor/onnxrt/quantization/config.py +++ /dev/null @@ -1,614 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
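[Reviewer note: the abstract CalibrationDataReader removed above only adds rewind() on top of onnxruntime's reader, which drives iteration through get_next(). A minimal subclass might have looked like the sketch below; the class name, the "input_ids" input name, and the shapes are placeholders, not taken from the patch.]

    import numpy as np
    from neural_compressor.onnxrt.quantization import CalibrationDataReader

    class RandomDataReader(CalibrationDataReader):
        def __init__(self, num_samples=8):
            self._data = [
                {"input_ids": np.random.randint(0, 100, size=(1, 32), dtype=np.int64)}
                for _ in range(num_samples)
            ]
            self._iter = iter(self._data)

        def get_next(self):
            # onnxruntime pulls batches until None is returned
            return next(self._iter, None)

        def rewind(self):
            # required by the removed abstract class: make the data iterable again
            self._iter = iter(self._data)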
- -import re -from collections import OrderedDict -from enum import Enum -from pathlib import Path -from typing import Callable, List, NamedTuple, Union - -import numpy as np -import onnx -from onnxruntime.quantization.calibrate import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType -from onnxruntime.quantization.quantize import StaticQuantConfig - -from neural_compressor.common import Logger -from neural_compressor.common.base_config import BaseConfig, register_config, register_supported_configs_for_fwk -from neural_compressor.common.utils import AWQ, DEFAULT_WHITE_LIST, GPTQ, OP_NAME_OR_MODULE_TYPE, RTN, SMOOTH_QUANT -from neural_compressor.onnxrt.utils import PRIORITY_AWQ, PRIORITY_GPTQ, PRIORITY_RTN, PRIORITY_SMOOTH_QUANT - -logger = Logger().get_logger() - -__all__ = [ - "FRAMEWORK_NAME", - "RTNConfig", - "get_default_rtn_config", - "GPTQConfig", - "get_default_gptq_config", - "AWQConfig", - "get_default_awq_config", - "SmoohQuantConfig", - "get_default_sq_config", -] - -FRAMEWORK_NAME = "onnxrt" - - -class _OperatorConfig(NamedTuple): - config: BaseConfig - operators: List[Union[str, Callable]] - valid_func_list: List[Callable] = [] - - -######################## RNT Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN, priority=PRIORITY_RTN) -class RTNConfig(BaseConfig): - """Config class for round-to-nearest weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "providers", - "layer_wise_quant", - ] - name: str = RTN - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init RTN weight-only quantization config. - - Args: - weight_dtype (str, optional): Data type for weights, default is "int". - weight_bits (int, optional): Number of bits used to represent weights, default is 4. - weight_group_size (int, optional): Size of weight groups, default is 32. - weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. - act_dtype (str, optional): Data type for activations, default is "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_rtn_config = RTNConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) - - -def get_default_rtn_config() -> RTNConfig: - """Generate the default rtn config. - - Returns: - the default rtn config. 
- """ - return RTNConfig() - - -######################## GPTQ Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=GPTQ, priority=PRIORITY_GPTQ) -class GPTQConfig(BaseConfig): - """Config class for gptq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "percdamp", - "blocksize", - "actorder", - "mse", - "perchannel", - "providers", - "layer_wise_quant", - ] - name: str = GPTQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init GPTQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.percdamp = percdamp - self.blocksize = blocksize - self.actorder = actorder - self.mse = mse - self.perchannel = perchannel - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_gptq_config = GPTQConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return GPTQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - - -def get_default_gptq_config() -> GPTQConfig: - """Generate the default gptq config. - - Returns: - the default gptq config. 
- """ - return GPTQConfig() - - -######################## AWQ Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=AWQ, priority=PRIORITY_AWQ) -class AWQConfig(BaseConfig): - """Config class for awq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "enable_auto_scale", - "enable_mse_search", - "providers", - ] - name: str = AWQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init AWQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. - Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.enable_auto_scale = enable_auto_scale - self.enable_mse_search = enable_mse_search - self.providers = providers - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_awq_config = AWQConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return AWQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - - -def get_default_awq_config() -> AWQConfig: - """Generate the default awq config. - - Returns: - the default awq config. 
- """ - return AWQConfig() - - -######################## SmoohQuant Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=SMOOTH_QUANT, priority=PRIORITY_SMOOTH_QUANT) -class SmoohQuantConfig(BaseConfig, StaticQuantConfig): - """Smooth quant quantization config.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - # smooth parameters - "alpha", - "folding", - "auto_alpha_args", - "calib_iter", - "scales_per_op", - ] - name: str = SMOOTH_QUANT - - def __init__( - self, - alpha: float = 0.5, - folding: bool = True, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - calib_iter: int = 100, - scales_per_op: bool = True, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - **kwargs, - ): - """Init smooth quant config. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. - False, ops with the same input will share a scale, mainly for performance. Defaults to True. - auto_alpha_args (dict, optional): settings for alpha tuning. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. - providers (list, optional): providers used for inference. - Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- kwargs (dict): kwargs in below link are supported except calibration_data_reader: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 - """ - BaseConfig.__init__(self) - kwargs.update({"calibration_data_reader": None}) - StaticQuantConfig.__init__(self, **kwargs) - self.alpha = alpha - self.folding = folding - self.op_types = op_types - self.calib_iter = calib_iter - self.scales_per_op = scales_per_op - self.auto_alpha_args = auto_alpha_args - self.providers = providers - self.white_list = white_list - self.weight_type = self.weight_type.value if isinstance(self.weight_type, Enum) else self.weight_type - self.activation_type = ( - self.activation_type.value if isinstance(self.activation_type, Enum) else self.activation_type - ) - self.calibrate_method = ( - self.calibrate_method.value if isinstance(self.calibrate_method, Enum) else self.calibrate_method - ) - self.quant_format = self.quant_format.value if isinstance(self.quant_format, Enum) else self.quant_format - self._post_init() - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - smooth_quant_config = SmoohQuantConfig() - operators = ["Gemm", "Conv", "MatMul", "FusedConv"] - supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) - cls.supported_configs = supported_configs - - @staticmethod - def get_model_info(model) -> list: - white_list = ["Gemm", "Conv", "MatMul", "FusedConv"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "SmoohQuantConfig", List["SmoohQuantConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return SmoohQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) - - def convert_to_ort_config(self): - self.activation_type = QuantType(self.activation_type) - self.weight_type = QuantType(self.weight_type) - self.weight_type = QuantType(self.weight_type) - self.calibrate_method = CalibrationMethod(self.calibrate_method) - self.quant_format = QuantFormat(self.quant_format) - - -def get_default_sq_config() -> SmoohQuantConfig: - """Generate the default smooth quant config. - - Returns: - the default smooth quant config. - """ - return SmoohQuantConfig() - - -##################### Algo Configs End ################################### - - -register_supported_configs_for_fwk(fwk_name=FRAMEWORK_NAME) diff --git a/neural_compressor/onnxrt/quantization/quantize.py b/neural_compressor/onnxrt/quantization/quantize.py deleted file mode 100644 index eee9f3162f1..00000000000 --- a/neural_compressor/onnxrt/quantization/quantize.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
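[Reviewer note: as a usage sketch for the config removed above (the class name really is spelled SmoohQuantConfig in this code base), construction looked roughly like this. The values shown are just its documented defaults, the variable names are illustrative, and any extra keyword argument was forwarded to onnxruntime's StaticQuantConfig as shown in __init__ above.]

    from neural_compressor.onnxrt.quantization import SmoohQuantConfig, get_default_sq_config

    sq_config = SmoohQuantConfig(
        alpha=0.5,           # balance between activation and weight quantization difficulty
        folding=True,        # fold the inserted smoothing Mul nodes where possible
        scales_per_op=True,  # per-op scales favor accuracy over a shared scale
    )
    default_config = get_default_sq_config()   # equivalent to SmoohQuantConfig()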
- -from pathlib import Path -from typing import Union - -import onnx - -from neural_compressor.common import Logger -from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry -from neural_compressor.common.utils import log_process -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import FRAMEWORK_NAME -from neural_compressor.onnxrt.utils.utility import algos_mapping - -logger = Logger().get_logger() - - -def _need_apply(quant_config: BaseConfig, algo_name): - return quant_config.name == algo_name if hasattr(quant_config, "name") else False - - -# * only for internal usage now -@log_process() -def _quantize( - model_input: Union[Path, str], - quant_config: BaseConfig, - calibration_data_reader: CalibrationDataReader = None, -) -> onnx.ModelProto: - """The main entry to quantize a model. - - Args: - model_input (Union[Path, str]): Path or str to the model to quantize. - quant_config (BaseConfig): a quantization configuration. - calibration_data_reader (CalibrationDataReader, optional): dataloader for calibration. - Defaults to None. - - Returns: - onnx.ModelProto: The quantized model. - """ - registered_configs = config_registry.get_cls_configs() - if isinstance(quant_config, dict): - quant_config = ComposableConfig.from_dict(quant_config, config_registry=registered_configs[FRAMEWORK_NAME]) - logger.info(f"Parsed a config dict to construct the quantization config: {quant_config}.") - else: - assert isinstance( - quant_config, BaseConfig - ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info(f"Quantize model with config: \n {quant_config} \n") - - # select quantization algo according to config - for algo_name, algo_func in algos_mapping.items(): - if _need_apply(quant_config, algo_name): - logger.info(f"Start to apply {algo_name} on the model.") - q_model = algo_func(model_input, quant_config, calibration_data_reader=calibration_data_reader) - return q_model diff --git a/neural_compressor/onnxrt/utils/__init__.py b/neural_compressor/onnxrt/utils/__init__.py deleted file mode 100644 index 813fc93ab5a..00000000000 --- a/neural_compressor/onnxrt/utils/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
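[Reviewer note: the _quantize() entry removed above selects an algorithm purely by name: each entry function registers itself in algos_mapping (defined in the removed utils/utility.py, not shown in this hunk), and the config's name attribute decides which entry runs. Below is a stripped-down illustration of that pattern, under the assumption that register_algo is a plain dict-registering decorator; the entry body and _DummyConfig are placeholders.]

    algos_mapping = {}

    def register_algo(name):
        # assumed shape of the removed helper: store the entry under its algorithm name
        def decorator(fn):
            algos_mapping[name] = fn
            return fn
        return decorator

    @register_algo(name="RTN")
    def rtn_quantize_entry(model_input, quant_config, **kwargs):
        return model_input        # the real entry applies RTN weight-only quantization

    class _DummyConfig:
        name = "RTN"

    quant_config = _DummyConfig()
    for algo_name, algo_func in algos_mapping.items():
        if getattr(quant_config, "name", None) == algo_name:
            q_model = algo_func("model.onnx", quant_config)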
- -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import PRIORITY_RTN, PRIORITY_GPTQ, PRIORITY_AWQ, PRIORITY_SMOOTH_QUANT - -__all__ = [ - "ONNXModel", - "PRIORITY_RTN", - "PRIORITY_GPTQ", - "PRIORITY_AWQ", - "PRIORITY_SMOOTH_QUANT", -] diff --git a/neural_compressor/onnxrt/utils/onnx_model.py b/neural_compressor/onnxrt/utils/onnx_model.py deleted file mode 100644 index 801416f7f64..00000000000 --- a/neural_compressor/onnxrt/utils/onnx_model.py +++ /dev/null @@ -1,1082 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Class for ONNX model.""" - -import os -import sys -from pathlib import Path - -import onnx -from onnxruntime.quantization.onnx_model import ONNXModel as ORTONNXModel - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - -__all__ = ["ONNXModel"] - - -class ONNXModel(ORTONNXModel): - """Build ONNX model.""" - - def __init__(self, model, **kwargs): - """Initialize an ONNX model. - - Args: - model (str or ModelProto): path to onnx model or loaded ModelProto model object. - """ - self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) - - self._model_path = None if not isinstance(model, str) else model - self.check_is_large_model() - if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False): - logger.warning("Model size > 2GB. 
Please use model path instead of onnx model object to quantize") - - if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True): - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model, os.path.dirname(self._model_path)) - - self._config = None - if isinstance(model, str) and os.path.exists(Path(model).parent.joinpath("config.json").as_posix()): - from transformers import PretrainedConfig - - self._config = PretrainedConfig.from_pretrained(Path(model).parent.as_posix()) - self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() - self._graph_info = {} - self._get_graph_info() - self._q_config = None - - @property - def model_path(self): - """Return model path.""" - return self._model_path - - @model_path.setter - def model_path(self, path): - """Set model path.""" - self._model_path = path - - def check_is_large_model(self): - """Check model > 2GB.""" - from neural_compressor.onnxrt.utils.utility import MAXIMUM_PROTOBUF - - init_size = 0 - for init in self.model.graph.initializer: - # if initializer has external data location, return True - if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: - self._is_large_model = True - return - # if raise error of initializer size > 2GB, return True - try: - init_bytes = init.SerializeToString() - init_size += sys.getsizeof(init_bytes) - except Exception as e: - if "exceeds maximum protobuf size of 2GB" in str(e): - self._is_large_model = True - return - else: # pragma: no cover - raise e - if init_size > MAXIMUM_PROTOBUF: - self._is_large_model = True - return - self._is_large_model = False - - @property - def is_large_model(self): - """Check the onnx model is over 2GB.""" - return self._is_large_model - - def framework(self): - """Return framework.""" - return "onnxruntime" - - def add_initializers(self, tensors): - """Add initializers to model.""" - for tensor in tensors: - self.add_initializer(tensor) - - @property - def q_config(self): - """Return q_config.""" - return self._q_config - - @q_config.setter - def q_config(self, q_config): - """Set q_config.""" - self._q_config = q_config - - @property - def hf_config(self): - """Return huggingface config if model is Transformer-based.""" - return self._config - - def input(self): - """Return input of model.""" - return [i.name for i in self.model.graph.input] - - def output(self): - """Return output of model.""" - return [i.name for i in self.model.graph.output] - - def update(self): - """Update model info.""" - self._graph_info = {} - self._get_graph_info() - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() - - @property - def graph_info(self): - """Return ORT Graph Info object holding information about backend graph.""" - return self._graph_info - - def _get_graph_info(self): - """Update graph info.""" - for node in self.model.graph.node: - self.graph_info.update({node.name: node.op_type}) - - def save(self, root): - """Save ONNX model.""" - if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): - raise ValueError('"root" directory does not exists.') - if self.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model, os.path.split(self._model_path)[0]) - onnx.save_model( - self.model, - root, - 
save_as_external_data=True, - all_tensors_to_one_file=True, - location=root.split("/")[-1] + "_data", - size_threshold=1024, - convert_attribute=False, - ) - else: - onnx.save(self.model, root) - - if self._config is not None: - model_type = "" if not hasattr(self._config, "model_type") else getattr(self._config, "model_type") - setattr(self._config.__class__, "model_type", model_type) - output_config_file = Path(root).parent.joinpath("config.json").as_posix() - self._config.to_json_file(output_config_file, use_diff=False) - - def get_initializer_share_num(self, name): - """Get the number of shares of initializer.""" - num = 0 - if self.get_initializer(name) is None: - return num - - for node in self.nodes(): - if name in node.input: - num += 1 - return num - - def get_node(self, name): - """Get a node by name.""" - for node in self.model.graph.node: - if node.name == name: - return node - return None - - def get_node_by_weight(self, weight_name): - """Get a node by its weight name.""" - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - nodes = self._input_name_to_nodes[weight_name] - if len(nodes) == 1: - return nodes[0] - elif len(nodes) == 0: - raise ValueError("{} is not used by any node in this model.".format(weight_name)) - else: - raise NotImplementedError("Models with shared weights is not supported.") - - def set_initializer(self, tensor, array, raw=False): - """Update initializer.""" - old_tensor = self.get_initializer(tensor) - self.remove_initializer(old_tensor) - dims = old_tensor.dims - data_type = old_tensor.data_type - new_tensor = ( - onnx.helper.make_tensor(tensor, data_type, dims, array.flatten().tolist()) - if not raw - else onnx.helper.make_tensor(tensor, data_type, dims, array.tostring(), raw=raw) - ) - self.add_initializer(new_tensor) - - def get_siblings(self, node): - """Get siblings nodes.""" - siblings = [] - for parent in self.get_parents(node): - for child in self.get_children(parent): - if child.name != node.name: - siblings.append(child) - return siblings - - def get_scale_zero(self, tensor): - """Help function to get scale and zero_point.""" - if not tensor.endswith("_quantized"): - logger.debug("Find {} in the quantized graph is not quantized.".format(tensor)) - return None, None - - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - - def _searcher(tensor_name): - """Search scale and zero point tensor recursively.""" - node = self._input_name_to_nodes[tensor_name][0] - parent = self._output_name_to_node[tensor_name] if tensor_name in self._output_name_to_node else None - direct_int8 = ["Reshape", "Transpose", "Squeeze", "Unsqueeze", "MaxPool", "Pad", "Split"] - if parent is not None and parent.op_type in direct_int8: - fp32_tensor_name = ( - parent.input[0] - .replace("_quantized", "") - .replace("_QuantizeLinear", "") - .replace("_QuantizeInput", "") - ) - elif node.op_type in ["Gather"]: # pragma: no cover - fp32_tensor_name = ( - node.output[0] - .replace("_quantized", "") - .replace("_QuantizeLinear", "") - .replace("_QuantizeInput", "") - ) - else: - fp32_tensor_name = ( - tensor_name.replace("_quantized", "").replace("_QuantizeLinear", "").replace("_QuantizeInput", "") - ) - scale = fp32_tensor_name + "_scale" - scale_tensor = self.get_initializer(scale) - zo = fp32_tensor_name + "_zero_point" - zo_tensor = self.get_initializer(zo) - - if scale_tensor is 
None or zo_tensor is None: - if parent is not None: - scale_tensor, zo_tensor = _searcher(parent.input[0]) - return scale_tensor, zo_tensor - - node = self._input_name_to_nodes[tensor][0] - # TODO check if scale_tensor and zero_point is needed - # for bias of qlinearconv, scale and zero_point is not needed - if (node.op_type == "QLinearConv" and tensor == node.input[-1]) or ( - node.op_type == "QGemm" and tensor == node.input[-3] - ): - return None, None - else: - scale_tensor, zo_tensor = _searcher(tensor) - assert scale_tensor, "missing scale for tensor {}".format(tensor) - assert zo_tensor, "missing zero point for tensor {}".format(tensor) - return scale_tensor, zo_tensor - - def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): - """Replace inputs of all nodes.""" - if len(white_optype) > 0: - for node in self.model.graph.node: - if node.op_type in white_optype: - ONNXModel.replace_node_input(node, old_input_name, new_input_name) - else: - for node in self.model.graph.node: - if node.op_type not in black_optype: - ONNXModel.replace_node_input(node, old_input_name, new_input_name) - - def replace_output_of_all_nodes(self, old_output_name, new_output_name, white_optype=[], black_optype=[]): - """Replace outputs of all nodes.""" - if len(white_optype) > 0: - for node in self.model.graph.node: - if node.op_type in white_optype: - ONNXModel.replace_node_output(node, old_output_name, new_output_name) - else: - for node in self.model.graph.node: - if node.op_type not in black_optype: - ONNXModel.replace_node_output(node, old_output_name, new_output_name) - - def remove_unused_nodes(self): - """Remove unused nodes.""" - unused_nodes = [] - nodes = self.nodes() - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - for node in nodes: - if ( - node.op_type == "Constant" - and node.output[0] not in self.model.graph.output - and node.output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - elif ( - node.op_type == "QuantizeLinear" - and len(self.get_children(node)) == 1 - and self.get_children(node)[0].op_type == "DequantizeLinear" - and node.input[0] not in self._output_name_to_node - and self.get_children(node)[0].output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - unused_nodes.extend(self.get_children(node)) - else: - # remove the node if it does not serve as the input or output of any other nodes - unused = True - for output in node.output: - if output in self._input_name_to_nodes or output in self.output(): - unused = False - break - for input in node.input: - if self.get_initializer(input) is not None: - continue - elif input in self._output_name_to_node or input in self.input(): - unused = False - break - if unused: - unused_nodes.append(node) - self.remove_nodes(unused_nodes) - - ununsed_weights = [] - for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: - ununsed_weights.append(w) - # Remove from graph.input - for graph_input in self.graph().input: - if graph_input.name == w.name: - self.graph().input.remove(graph_input) - - self.remove_initializers(ununsed_weights) - self.update() - - def topological_sort(self, enable_subgraph=False): - """Topological sort the model.""" - import copy - from collections import deque - from functools import reduce - - if not enable_subgraph: - 
input_name_to_nodes = {} - output_name_to_node = {} - for node in self.model.graph.node: - for input_name in node.input: - if len(input_name.strip()) != 0: - if input_name not in input_name_to_nodes: - input_name_to_nodes[input_name] = [node] - else: - input_name_to_nodes[input_name].append(node) - for output_name in node.output: - if len(output_name.strip()) != 0: - output_name_to_node[output_name] = node - else: # pragma: no cover - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - input_name_to_nodes = self._input_name_to_nodes - output_name_to_node = self._output_name_to_node - - all_nodes = {} - q = deque() - wait = deque() - for inp in self.model.graph.input: - q.extend(input_name_to_nodes[inp.name]) - for n in self.model.graph.node: - if all([i not in output_name_to_node and i not in self.input() for i in n.input]): - q.append(n) - - while q: - n = q.popleft() - if not all([output_name_to_node[i].name in all_nodes for i in n.input if i in output_name_to_node]): - if n not in wait: - wait.append(n) - continue - - all_nodes[n.name] = n - for out in n.output: - if out in input_name_to_nodes: - q.extend([i for i in input_name_to_nodes[out] if i.name not in all_nodes and i not in q]) - if len(q) == 0 and len(wait) != 0: - q = copy.deepcopy(wait) - wait.clear() - nodes = [i[1] for i in all_nodes.items()] - assert len(list(set([n.name for n in nodes]))) == len(list(set([n.name for n in self.model.graph.node]))) - self.model.graph.ClearField("node") - self.model.graph.node.extend(nodes) - - def get_nodes_chain(self, start, stop, result_chain=[]): - """Get nodes chain with given start node and stop node.""" - from collections import deque - - from onnx import NodeProto - - from neural_compressor.onnxrt.utils.utility import find_by_name - - # process start node list - start_node = deque() - for node in start: - if isinstance(node, str): - start_node.append(node) - elif isinstance(node, NodeProto): - start_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - # process stop node list - stop_node = [] - for node in stop: - if isinstance(node, str): - stop_node.append(node) - elif isinstance(node, NodeProto): - stop_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - while start_node: - node_name = start_node.popleft() - if node_name in stop_node: - continue - if node_name not in result_chain: - result_chain.append(node_name) - else: - continue - - node = find_by_name(node_name, list(self.model.graph.node)) - for parent in self.get_parents(node): - start_node.append(parent.name) - - return result_chain - - def find_split_node_for_layer_wise_quantization(self): - """Find split node for layer wise quantization.""" - # find split nodes of decoder blocks - # embed -> decoder.0 -(split_node)-> ... 
-(split_node)-> decoder.n -(split_node)-> norm -> head - # after split: embed -> decoder.0, - # decoder.1, - # decoder.2, - # ..., - # decoder.n, - # norm -> head - start_nodes = [] - for node in self.model.graph.node: - start_node, qkv_nodes_list = None, None - if node.op_type == "SkipLayerNormalization": - start_node = node - qkv_nodes_list = [ - self.match_parent_path( - start_node, - ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, 1, 0, 0, 0], - ), - ] - if node.op_type == "Add": - start_node = node - qkv_nodes_list = [ - # match base attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - self.match_parent_path( - start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] - ), - # match gpt attention no past structure - self.match_parent_path( - start_node, - ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0, 0, 0], - output_name_to_node_dict=self._output_name_to_node, - return_indice=[], - ), - # match bart attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [0, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [1, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["MatMul", "Mul", "MatMul", "Mul", "Div", "Add"], - [None, 0, None, 0, None, 0], - ), - self.match_parent_path( - start_node, - ["MatMul", "Mul", "MatMul", "SimplifiedLayerNormalization", "Add"], - [None, 0, None, 0, 0], - ), - ] - if not start_node: - continue - if not any(qkv_nodes_list): - continue - start_nodes.append(start_node) - return start_nodes - - def find_qkv_in_attention(self, find_all=False): - """Find qkv MatMul in Attention. - - Args: - find_all (bool, optional): find all qkv MatMul. 
Defaults to False - - Returns: - qkv (list): qkv MatMul list - """ - qkv = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for node in self.model.graph.node: - if node.op_type == "Attention": - qkv.append([node.name]) - continue - start_node, qkv_nodes_list = None, None - if node.op_type == "SkipLayerNormalization": - start_node = node - qkv_nodes_list = [ - self.match_parent_path( - start_node, - ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, 1, 0, 0, 0], - ), - ] - if node.op_type == "Add": - start_node = node - qkv_nodes_list = [ - # match base attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - self.match_parent_path( - start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] - ), - # match gpt attention no past structure - self.match_parent_path( - start_node, - ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0, 0, 0], - output_name_to_node_dict=self._output_name_to_node, - return_indice=[], - ), - # match bart attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [0, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [1, None, 0, 0, 0, 0], - ), - ] - if not start_node: - continue - if not any(qkv_nodes_list): - continue - qkv_nodes = [qkv for qkv in qkv_nodes_list if qkv is not None][-1] - other_inputs = [] - for input in start_node.input: - if input not in self._output_name_to_node: - continue - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - continue - root_input = other_inputs[0] - children = self._input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") == 3: - qkv.append([child.name for child in children if child.op_type == "MatMul"]) - if not find_all: - break - return qkv - - def find_ffn_matmul(self, attention_index, attention_matmul_list, block_len): - """Find MatMul in FFN. 
- - Args: - attention_index (list): index of Attention - attention_matmul_list (list): list of Attention and MatMul nodes - block_len (int): block length - - Returns: - list: list of MatMul in FFN - """ - ffn_matmul = [] - for idx in range(len(attention_index)): - if idx != len(attention_index) - 1: - index = attention_index[idx + 1] - if index - 2 >= 0: - ffn_matmul.append([attention_matmul_list[index - 2], attention_matmul_list[index - 1]]) - else: - index = attention_index[idx] - if index + block_len - 1 < len(attention_matmul_list): - ffn_matmul.append( - [attention_matmul_list[index + block_len - 2], attention_matmul_list[index + block_len - 1]] - ) - return ffn_matmul - - def export(self, save_path, conf): - """Export Qlinear to QDQ model.""" - from neural_compressor.config import ONNXQlinear2QDQConfig - from neural_compressor.utils.export import onnx_qlinear_to_qdq - - if isinstance(conf, ONNXQlinear2QDQConfig): - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self.model, self._input_name_to_nodes) - self.add_nodes(add_nodes) - self.remove_nodes(remove_nodes) - self.add_initializers(inits) - self.update() - self.remove_unused_nodes() - self.topological_sort() - self.save(save_path) - else: - logger.warning("Unsupported config for export, " "only ONNXQlinear2QDQConfig is supported!") - exit(0) - - def add_tensors_to_outputs(self, tensor_names): - """Add the tensors to the model outputs to gets their values. - - Args: - tensor_names: The names of tensors to be dumped. - """ - added_outputs = [] - for tensor in tensor_names: - if tensor not in self.output(): - added_tensor = onnx.helper.ValueInfoProto() - added_tensor.name = tensor - added_outputs.append(added_tensor) - self.model.graph.output.extend(added_outputs) # pylint: disable=no-member - - def remove_tensors_from_outputs(self, tensor_names): - """Remove the tensors from the model outputs. - - Args: - tensor_names: The names of tensors to be removed. - """ - removed_outputs = [] - for tensor in tensor_names: - if tensor in self.output(): - removed_outputs.append(self.model.graph.output[self.output().index(tensor)]) - for output in removed_outputs: - self.model.graph.output.remove(output) - - def match_first_parent(self, node, parent_op_type, output_name_to_node_dict, exclude=[]): - """Find parent node based on constraints on op_type. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - - Returns: - parent: The matched parent node. None if not found. - index: The input index of matched parent node. None if not found. - """ - for i, input in enumerate(node.input): - if input in output_name_to_node_dict: - parent = output_name_to_node_dict[input] - if parent.op_type == parent_op_type and parent not in exclude: - return parent, i - return None, None - - def match_parent( - self, - node, - parent_op_type, - input_index=None, - output_name_to_node_dict=None, - exclude=[], - return_indice=None, - ): - """Find parent node based on constraints on op_type and index. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - input_index (int or None): only check the parent given input index of current node. 
- output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - return_indice (list): a list to append the input index when input_index is None. - - Returns: - parent: The matched parent node. - """ - assert node is not None - assert input_index is None or input_index >= 0 - - if output_name_to_node_dict is None: - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - output_name_to_node_dict = self._output_name_to_node - - if input_index is None: - parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node_dict, exclude) - if return_indice is not None: - return_indice.append(index) - return parent - - if input_index >= len(node.input): - return None - - parent = self.get_parent(node, input_index, output_name_to_node_dict) - if parent is not None and parent.op_type == parent_op_type and parent not in exclude: - return parent - - return None - - def match_parent_path( - self, - node, - parent_op_types, - parent_input_index, - output_name_to_node_dict=None, - return_indice=None, - ): - """Find a sequence of input edges based on constraints on parent op_type and index. - - Args: - node (str): current node name. - parent_op_types (str): constraint of parent node op_type of each input edge. - parent_input_index (list): constraint of input index of each input edge. - None means no constraint. - output_name_to_node (dict): dictionary with output name as key, and node as value. - return_indice (list): a list to append the input index when there is - no constraint on input index of an edge. - - Returns: - parents: a list of matched parent node. - """ - assert len(parent_input_index) == len(parent_op_types) - - if output_name_to_node_dict is None: - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - output_name_to_node_dict = self._output_name_to_node - - current_node = node - matched_parents = [] - for i, op_type in enumerate(parent_op_types): - matched_parent = self.match_parent( - current_node, - op_type, - parent_input_index[i], - output_name_to_node_dict, - exclude=[], - return_indice=return_indice, - ) - if matched_parent is None: - return None - - matched_parents.append(matched_parent) - current_node = matched_parent - - return matched_parents - - def is_smoothquant_model(self): - """Check the model is smooth quantized or not. - - Returns: - bool: the model is smooth quantized or not. - """ - for init in self.model.graph.initializer: - if "_smooth_scale" in init.name: - return True - return False - - def find_split_nodes(self): - """Find split nodes for layer-wise quantization.""" - split_nodes = self.find_split_node_for_layer_wise_quantization() - return split_nodes - - def split_model_with_node(self, split_node_name, path_of_model_to_split, save_both_split_models=True): - """Split model into two parts at a given node. - - Args: - split_node_name (str): name of the node where the model is split at> - path_of_model_to_split (str): path of model to be split. - save_both_split_models (bool): whether to save the two split models. - False means only save the first split model. - True means save both the two split models. - Default id True. - - Returns: - tuple: the first split model, the second split model - """ - # origin model : ... -> node_1 -> split_node -> node_2 -> ... - # split model 1: ... -> node_1 -> split_node - # split model 2: node_2 -> ... 
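# A minimal usage sketch of the splitting flow above, assuming a placeholder
# path "model.onnx": find_split_nodes() returns the decoder-boundary NodeProtos
# and split_model_with_node() cuts the graph at the first of them so each part
# can be loaded and quantized on its own.
import onnx

from neural_compressor.onnxrt.utils.onnx_model import ONNXModel

wrapped = ONNXModel(onnx.load("model.onnx"))
split_nodes = wrapped.find_split_nodes()      # decoder-block boundary nodes
part_1, part_2 = wrapped.split_model_with_node(
    split_node_name=split_nodes[0].name,      # cut after the first decoder block
    path_of_model_to_split="model.onnx",
    save_both_split_models=True,              # persist both halves next to the model
)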
- - # remove nodes which are not followed by other nodes - unvalid_nodes = [ - i - for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) - ] - while len(unvalid_nodes) > 0: - self.remove_nodes(unvalid_nodes) - self._input_name_to_nodes = self.input_name_to_nodes() - unvalid_nodes = [ - i - for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) - ] - self.topological_sort() - - split_model_part_1 = onnx.ModelProto() - split_model_part_1.CopyFrom(self.model) - split_model_part_1.graph.ClearField("node") - - split_model_part_2 = onnx.ModelProto() - split_model_part_2.CopyFrom(self.model) - split_model_part_2.graph.ClearField("node") - - split_node_output = None - part_idx = 1 - for node in self.model.graph.node: - if part_idx == 1: - split_model_part_1.graph.node.append(node) - elif part_idx == 2: - split_model_part_2.graph.node.append(node) - - if node.name == split_node_name: - split_node_output = node.output - part_idx = 2 - - assert len(split_node_output) == 1, ( - "Only support split at node with 1 output tensor, while " - "current split node {} has {} output tensors".format(split_node_name, len(split_node_output)) - ) - split_tensor_name = split_node_output[0] - - split_tensor_type, split_tensor_shape = self._get_output_type_shape_by_tensor_name(split_tensor_name) - split_tensor = onnx.helper.make_tensor_value_info(split_tensor_name, split_tensor_type, split_tensor_shape) - - split_model_part_1.graph.output.append(split_tensor) - split_model_part_2.graph.input.append(split_tensor) - - split_model_part_1 = ONNXModel(split_model_part_1, ignore_warning=True) - split_model_part_2 = ONNXModel(split_model_part_2, ignore_warning=True) - - # remove unused input & output - split_model_part_1._remove_unused_input_output() - split_model_part_2._remove_unused_input_output() - - insert_output_for_model_1 = [] - insert_input_for_model_2 = [] - for output in split_model_part_1._output_name_to_node.keys(): - if output in split_model_part_2._input_name_to_nodes.keys(): - output_type, output_shape = self._get_output_type_shape_by_tensor_name(output) - output_tensor = onnx.helper.make_tensor_value_info(output, output_type, output_shape) - if output_tensor not in split_model_part_1.model.graph.output: - insert_output_for_model_1.append(output_tensor) - if output_tensor not in split_model_part_2.model.graph.input: - insert_input_for_model_2.append(output_tensor) - - # insert model 1 output - for output in insert_output_for_model_1: - split_model_part_1.model.graph.output.append(output) - - # insert model 2 input - for input in insert_input_for_model_2: - split_model_part_2.model.graph.input.append(input) - - # remove unused init - split_model_part_1.remove_unused_init() - split_model_part_2.remove_unused_init() - - split_model_part_1.update() - split_model_part_2.update() - - dir_of_model_to_split = os.path.dirname(path_of_model_to_split) - - split_model_part_1.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_1_path = os.path.join(dir_of_model_to_split, "split_model_part_1.onnx") - split_model_part_1.model_path = split_model_part_1_path - split_model_part_1._save_split_model(split_model_part_1_path) - split_model_part_1.check_is_large_model() - logger.debug("save split model part 1 to {} for layer wise quantization".format(split_model_part_1_path)) - - if save_both_split_models: - 
split_model_part_2.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_2_path = os.path.join(dir_of_model_to_split, "split_model_part_2.onnx") - split_model_part_2.model_path = split_model_part_2_path - split_model_part_2._save_split_model(split_model_part_2_path) - split_model_part_2.check_is_large_model() - logger.debug("save split model part 2 to {} for layer wise quantization".format(split_model_part_2_path)) - return split_model_part_1, split_model_part_2 - else: - return split_model_part_1, split_model_part_2 - - def _save_split_model(self, save_path): - """Save split model as external data for layer wise quantization. - - Args: - save_path (str): the path to save the split model - """ - if os.path.exists(save_path + "_data"): - os.remove(save_path + "_data") - onnx.save_model( - self.model, - save_path, - save_as_external_data=True, - all_tensors_to_one_file=True, - location=save_path.split("/")[-1] + "_data", - size_threshold=1024, - convert_attribute=False, - ) - - def _get_output_type_shape_by_tensor_name(self, tensor_name): - """Get output type and shape with a tensor name. - - Args: - tensor_name (str): name of a tensor - - Returns: - tuple: output type and shape - """ - elem_type = onnx.TensorProto.FLOAT - shape = None - for output in self.model.graph.value_info: - if output.name == tensor_name: - elem_type = output.type.tensor_type.elem_type - shape = [ - dim.dim_value if dim.HasField("dim_value") else -1 for dim in output.type.tensor_type.shape.dim - ] - break - return elem_type, shape - - def _remove_unused_input_output(self): - """Remove unused input & output for split model.""" - remove_outputs = [] - remove_inputs = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for output in self.model.graph.output: - if output.name not in self._output_name_to_node.keys(): - remove_outputs.append(output) - - for input in self.model.graph.input: - if input.name not in self._input_name_to_nodes.keys(): - remove_inputs.append(input) - - for output in remove_outputs: - self.model.graph.output.remove(output) - for input in remove_inputs: - self.model.graph.input.remove(input) - - def remove_unused_init(self): - """Remove unused init.""" - remov_inits = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for init in self.model.graph.initializer: - if init.name not in self._input_name_to_nodes.keys(): - remov_inits.append(init) - self.remove_initializers(remov_inits) - - def load_model_initializer_by_tensor(self, data_path=None): - """Load model initializer by tensor. - - Args: - data_path (str, optional): the directory of saved initializer. Defaults to None. - """ - from onnx.external_data_helper import load_external_data_for_tensor - - if data_path is None: - data_path = os.path.dirname(self._model_path) - for init in self.model.graph.initializer: - if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: - load_external_data_for_tensor(init, data_path) - - def write_external_data_to_new_location(self, external_data_location="external.data", overwrite=False): - """Write external data of merged quantized model to new location to save memory. - - Args: - external_data_location (str, optional): external data location of merged quantized model. - Defaults to "external.data". - overwrite (bool, optional): if True, remove existed externa data. Defaults to False. 
- """ - from onnx.external_data_helper import convert_model_to_external_data, write_external_data_tensors - - if overwrite and os.path.exists(os.path.join(os.path.dirname(self._model_path), external_data_location)): - os.remove(os.path.join(os.path.dirname(self._model_path), external_data_location)) - self.load_model_initializer_by_tensor() - convert_model_to_external_data(self.model, location=external_data_location) - # TODO : if init is already saved, skip write it - write_external_data_tensors(self.model, filepath=os.path.dirname(self._model_path)) - - def merge_split_models(self, to_merge_model): - """Merge two split model into final model.""" - to_merge_model.write_external_data_to_new_location() - self.add_nodes([node for node in to_merge_model.nodes()]) - self.add_initializers([init for init in to_merge_model.initializer()]) - self.update() - - # add new output - for output in to_merge_model.graph().output: - if output.name not in self.output(): - self.model.graph.output.append(output) - - # remove unused output - remove_output = [] - for output in self.model.graph.output: - if output.name in to_merge_model.input(): - remove_output.append(output) - for output in remove_output: - self.model.graph.output.remove(output) - - # add new input - for input in to_merge_model.graph().input: - if ( - input.name not in self.input() - and input.name not in self.output() - and input.name not in self._output_name_to_node.keys() - ): - self.model.graph.input.append(input) - - def re_org_output(self, origin_output): - """Re-org output of merged model for layer-wise quantization.""" - outputs = {} - tmp_remove = [] - for output in self.model.graph.output: - outputs[output.name] = output - tmp_remove.append(output) - - for output in tmp_remove: - self.model.graph.output.remove(output) - - for out_name in origin_output: - self.model.graph.output.append(outputs[out_name]) diff --git a/neural_compressor/onnxrt/utils/utility.py b/neural_compressor/onnxrt/utils/utility.py deleted file mode 100644 index 21678717229..00000000000 --- a/neural_compressor/onnxrt/utils/utility.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
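# Worked example (illustrative numbers only) of the scale/zero-point arithmetic
# implemented by _calculate_scale_zp() and quantize_data() below, using the
# r = S(q - z) convention from their docstrings:
#   data = [-1.5, 0.0, 3.0]  ->  rmin = -1.5, rmax = 3.0
#   asymmetric uint8 (quantize_range = 255):
#     scale      = (rmax - rmin) / 255        = 4.5 / 255 ~= 0.01765
#     zero_point = round((0 - rmin) / scale)  = round(85.0) = 85
#     q          = round(x / scale) + zero_point  ->  [0, 85, 255]
#   symmetric int8 (quantize_range = 254):
#     scale      = 2 * max(|rmin|, |rmax|) / 254 = 6.0 / 254 ~= 0.02362
#     zero_point = 0,  q(3.0) = round(3.0 / scale) = 127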
- -from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union - -import numpy as np -import onnx -import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer -from packaging.version import Version - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - -__all__ = [ - "ONNXRT116_VERSION", - "ONNXRT1161_VERSION", - "algos_mapping", - "WHITE_MODULE_LIST", - "MAXIMUM_PROTOBUF", - "PRIORITY_RTN", - "PRIORITY_GPTQ", - "PRIORITY_AWQ", - "PRIORITY_SMOOTH_QUANT", - "dtype_mapping", - "find_by_name", - "simple_progress_bar", - "register_algo", - "get_model_info", - "is_B_transposed", - "get_qrange_for_qType", - "quantize_data", - "check_model_with_infer_shapes", -] - -ONNXRT116_VERSION = Version("1.16.0") -ONNXRT1161_VERSION = Version("1.16.1") - -# Dictionary to store a mapping between algorithm names and corresponding algo implementation(function) -algos_mapping: Dict[str, Callable] = {} - -# All constants for onnxrt -WHITE_MODULE_LIST = ["MatMul", "Conv"] - -MAXIMUM_PROTOBUF = 2147483648 - -PRIORITY_RTN = 60 -PRIORITY_GPTQ = 70 -PRIORITY_AWQ = 50 -PRIORITY_SMOOTH_QUANT = 80 - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - - -def find_by_name(name, item_list): - """Helper function to find item by name in a list.""" - items = [] - for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover - if item.name == name: - items.append(item) - if len(items) > 0: - return items[0] - else: - return None - - -def simple_progress_bar(total, i): - """Progress bar for cases where tqdm can't be used.""" - progress = i / total - bar_length = 20 - bar = "#" * int(bar_length * progress) - spaces = " " * (bar_length - len(bar)) - percentage = progress * 100 - print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="") - - -def register_algo(name): - """Decorator function to register algorithms in the algos_mapping dictionary. - - Usage example: - @register_algo(name=example_algo) - def example_algo(model: Union[onnx.ModelProto, Path, str], - quant_config: RTNConfig) -> onnx.ModelProto: - ... - - Args: - name (str): The name under which the algorithm function will be registered. - - Returns: - decorator: The decorator function to be used with algorithm functions. 
- """ - - def decorator(algo_func): - algos_mapping[name] = algo_func - return algo_func - - return decorator - - -def get_model_info( - model: Union[onnx.ModelProto, Path, str], white_op_type_list: List[Callable] -) -> List[Tuple[str, Callable]]: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model) - filter_result = [] - filter_result_set = set() - for node in model.graph.node: - if node.op_type in white_op_type_list: - pair = (node.name, node.op_type) - if pair not in filter_result_set: - filter_result_set.add(pair) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False - - -def get_qrange_for_qType(qType, reduce_range=False): - """Helper function to get the quantization range for a type. - - Args: - qType (int): data type - reduce_range (bool, optional): use 7 bit or not. Defaults to False. - """ - if qType == onnx.onnx_pb.TensorProto.UINT8: - return 127 if reduce_range else 255 - elif qType == onnx.onnx_pb.TensorProto.INT8: - # [-64, 64] for reduce_range, and [-127, 127] full_range. - return 128 if reduce_range else 254 - else: - raise ValueError("unsupported quantization data type") - - -def _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point): - """Quantize data with scale and zero point. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - - Args: - data (np.array): data to quantize - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. 
- scale (float): computed scale of quantized data - zero_point (uint8 or int8): computed zero point of quantized data - """ - data = np.asarray(data) - if qType == onnx.onnx_pb.TensorProto.INT8 and scheme == "sym": - # signed byte type - quantized_data = (data.astype(np.float32) / scale).round().astype("b") - elif qType == onnx.onnx_pb.TensorProto.UINT8 and scheme == "asym": - quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype("B") - else: - raise ValueError("Unexpected combination of data type {} and scheme {}.".format(qType, scheme)) - return quantized_data - - -def _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme): - """Calculate scale and zero point.""" - if isinstance(rmax, np.ndarray): - if scheme == "sym": - max_range = np.maximum(abs(rmin), abs(rmax)) - scale = np.ones(rmax.shape, dtype="float32") - scale[max_range > 0] = np.array( - [float(i) / quantize_range for i in (max_range[max_range > 0] * 2.0).flatten().tolist()], - dtype="float32", - ) - else: - scale = np.ones(rmax.shape, dtype="float32") - scale[rmin != rmax] = np.array( - [float(i) / quantize_range for i in (rmax - rmin)[rmin != rmax].flatten().tolist()], dtype="float32" - ) - - if scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8: - zero_point = np.zeros(scale.shape, dtype="int8") if isinstance(scale, np.ndarray) else 0 - elif isinstance(scale, np.ndarray) and (scale == 1).all(): - zero_point = ( - np.zeros(scale.shape, dtype="int8") - if qType == onnx.onnx_pb.TensorProto.INT8 - else np.zeros(scale.shape, dtype="uint8") - ) - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = np.maximum(0, np.minimum(255, ((0 - float(rmin)) / scale).round()).round()).astype("uint8") - else: - zero_point = ( - (-64 - rmin) / float(scale) if quantize_range == 128 else (-127 - rmin) / float(scale) - ).round() - - else: - if scheme == "sym": - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1 - else: - scale = (float(rmax) - float(rmin)) / quantize_range if rmin != rmax else 1 - - if scale == 1 or (scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8): - zero_point = 0 - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = round((0 - float(rmin)) / scale) - zero_point = np.uint8(round(max(0, min(255, zero_point)))) - else: - zero_point = ( - round((-64 - float(rmin)) / scale) if quantize_range == 128 else round((-127 - float(rmin)) / scale) - ) - return scale, zero_point - - -def quantize_data(data, quantize_range, qType, scheme): - """Quantize data. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point - - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. 
- """ - rmin = min(min(data), 0) - rmax = max(max(data), 0) - - scale, zero_point = _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme) - quantized_data = _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point) - return rmin, rmax, zero_point, scale, quantized_data - - -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - from neural_compressor.onnxrt.utils.onnx_model import ONNXModel - - if isinstance(model, (Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False diff --git a/requirements_ort.txt b/requirements_ort.txt deleted file mode 100644 index 23f608859d1..00000000000 --- a/requirements_ort.txt +++ /dev/null @@ -1,9 +0,0 @@ -numpy < 2.0 -onnx -onnxruntime -onnxruntime-extensions -prettytable -psutil -py-cpuinfo -pydantic -transformers diff --git a/setup.py b/setup.py index f1a9c9b22f6..bb23ac7866a 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def get_build_version(): assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) PKG_INSTALL_CFG = { - # overall install config for build from source, python setup.py install + # overall installation config, pip install neural-compressor "neural_compressor": { "project_name": "neural_compressor", "include_packages": find_packages( @@ -53,33 +53,12 @@ def get_build_version(): ), "package_data": {"": ["*.yaml"]}, "install_requires": fetch_requirements("requirements.txt"), - }, - # 2.x binary build config, pip install neural-compressor - "neural_compressor_2x": { - "project_name": "neural_compressor", - "include_packages": find_packages( - include=["neural_compressor", "neural_compressor.*"], - exclude=[ - "neural_compressor.template", - "neural_compressor.common", - "neural_compressor.common.*", - "neural_compressor.torch", - "neural_compressor.torch.*", - "neural_compressor.tensorflow", - "neural_compressor.tensorflow.*", - "neural_compressor.onnxrt", - "neural_compressor.onnxrt.*", - ], - ), - "package_data": {"": ["*.yaml"]}, - "install_requires": fetch_requirements("requirements.txt"), "extras_require": { - "pt": [f"neural_compressor_3x_pt=={__version__}"], - "tf": [f"neural_compressor_3x_tf=={__version__}"], - "ort": [f"neural_compressor_3x_ort=={__version__}"], + "pt": fetch_requirements("requirements_pt.txt"), + "tf": fetch_requirements("requirements_tf.txt"), }, }, - # 3.x pt binary build config, pip install neural-compressor[pt], install 2.x API + 3.x PyTorch API. + # 3.x pt binary build config, pip install neural-compressor-pt, install 3.x PyTorch API. "neural_compressor_3x_pt": { "project_name": "neural_compressor_3x_pt", "include_packages": find_packages( @@ -92,7 +71,7 @@ def get_build_version(): ), "install_requires": fetch_requirements("requirements_pt.txt"), }, - # 3.x tf binary build config, pip install neural-compressor[tf], install 2.x API + 3.x TensorFlow API. + # 3.x tf binary build config, pip install neural-compressor-tf, install 3.x TensorFlow API. "neural_compressor_3x_tf": { "project_name": "neural_compressor_3x_tf", "include_packages": find_packages( @@ -106,19 +85,6 @@ def get_build_version(): "package_data": {"": ["*.yaml"]}, "install_requires": fetch_requirements("requirements_tf.txt"), }, - # 3.x ort binary build config, pip install neural-compressor[ort], install 2.x API + 3.x ONNXRT API. 
- "neural_compressor_3x_ort": { - "project_name": "neural_compressor_3x_ort", - "include_packages": find_packages( - include=[ - "neural_compressor.common", - "neural_compressor.common.*", - "neural_compressor.onnxrt", - "neural_compressor.onnxrt.*", - ], - ), - "install_requires": fetch_requirements("requirements_ort.txt"), - }, } @@ -131,10 +97,6 @@ def get_build_version(): ext_modules = [] cmdclass = {} - if "2x" in sys.argv: - sys.argv.remove("2x") - cfg_key = "neural_compressor_2x" - if "pt" in sys.argv: sys.argv.remove("pt") cfg_key = "neural_compressor_3x_pt" @@ -143,10 +105,6 @@ def get_build_version(): sys.argv.remove("tf") cfg_key = "neural_compressor_3x_tf" - if "ort" in sys.argv: - sys.argv.remove("ort") - cfg_key = "neural_compressor_3x_ort" - if bool(os.getenv("USE_FP8_CONVERT", False)): from torch.utils.cpp_extension import BuildExtension, CppExtension diff --git a/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py b/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py deleted file mode 100644 index c8e7584ee7f..00000000000 --- a/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -import shutil -import unittest -from copy import deepcopy - -import onnx -import onnxruntime as ort -import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer -import torch -from optimum.exporters.onnx import main_export -from transformers import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? 
NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestLayerWiseQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - # onnx model exported with transformers>=4.38.0 is different with low version - # which will cause layer-wise quant ut to fail - # limit transformers to 4.37.2 - # TODO: remove transformers version limitation - llama_id = "yujiepan/llama-2-tiny-3layers-random" - main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation") - model_path = find_onnx_file("llama-2-tiny-3layers-random") - - model = onnx.load(model_path) - model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True) - infer_shape_model_path = "llama-2-tiny-3layers-random/model-infer-shape.onnx" - onnx.save(model, infer_shape_model_path) - - sess_options = ort.SessionOptions() - sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx" - ort.InferenceSession(infer_shape_model_path, sess_options) - - self.llama = "llama-2-tiny-3layers-random/optimized_model.onnx" - self.calibration_data_reader = DummyNLPDataloader(llama_id) - - @classmethod - def tearDownClass(self): - shutil.rmtree("llama-2-tiny-3layers-random", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestLayerWiseQuant test: {self.id()}") - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _get_quantized_matmul_weight(self, model, matmul_name): - weight_init_name = None - for node in model.graph.node: - if node.name == matmul_name: - weight_init_name = node.input[1] - if weight_init_name is None: - return None - - weight_init = None - for init in model.graph.initializer: - if init.name == weight_init_name: - weight_init = onnx.numpy_helper.to_array(init) - return weight_init - - def _apply_quantize(self, quant_config, data_reader=None): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.llama - if data_reader is None: - qmodel = _quantize(fp32_model, quant_config) - else: - qmodel = _quantize(fp32_model, quant_config, data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_rtn_layer_wise(self): - from neural_compressor.onnxrt.quantization import RTNConfig - - rtn_config = RTNConfig(layer_wise_quant=True) - qmodel_lwq = self._apply_quantize(rtn_config) - self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) - - rtn_config = RTNConfig(layer_wise_quant=False) - qmodel = self._apply_quantize(rtn_config) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") - 
self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") - self.assertIsNotNone(quantized_weight) - self.assertTrue((lwq_quantized_weight == quantized_weight).all()) - - def test_gptq_layer_wise(self): - from neural_compressor.onnxrt.quantization import GPTQConfig - - self.calibration_data_reader.rewind() - gptq_config = GPTQConfig(layer_wise_quant=True) - qmodel_lwq = self._apply_quantize(gptq_config, self.calibration_data_reader) - self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) - - self.calibration_data_reader.rewind() - gptq_config = GPTQConfig(layer_wise_quant=False) - qmodel = self._apply_quantize(gptq_config, self.calibration_data_reader) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") - self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") - self.assertIsNotNone(quantized_weight) - self.assertTrue((lwq_quantized_weight == quantized_weight).all()) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_awq.py b/test/3x/onnxrt/quantization/weight_only/test_awq.py deleted file mode 100644 index 1587399f9ca..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_awq.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import shutil -import unittest - -import torch -from optimum.exporters.onnx import main_export -from transformers import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? 
NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestAWQQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - self.calibration_data_reader = DummyNLPDataloader("hf-internal-testing/tiny-random-gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestAWQQuant test: {self.id()}") - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _apply_awq(self, quant_config): - logger.info(f"Test AWQ with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config, calibration_data_reader=self.calibration_data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_awq_params_combination(self): - from neural_compressor.onnxrt import AWQConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
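# Descriptive note on the assertions below: after weight-only quantization the
# second input (the packed weight) of every quantized MatMul gets a name ending
# in _Q{bits}G{group_size} (e.g. "_Q4G32"), which is what _count_woq_matmul()
# above keys on; the tiny GPT-J export used here yields 30 such MatMuls, and 29
# once one node is forced back to fp32 via set_local().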
- awq_options = { - "weight_dtype": ["int"], - "weight_bits": [4, 3, 8], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - "accuracy_level": [0], - "enable_auto_scale": [True, False], - "enable_mse_search": [True, False], - } - from itertools import product - - keys = AWQConfig.params_list - for value in product(*awq_options.values()): - d = dict(zip(keys, value)) - print(d) - quant_config = AWQConfig(**d) - qmodel = self._apply_awq(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_awq_config(self): - from neural_compressor.onnxrt.quantization import AWQConfig - - awq_config1 = AWQConfig(weight_bits=4) - quant_config_dict = { - "awq": {"weight_bits": 4}, - } - awq_config2 = AWQConfig.from_dict(quant_config_dict["awq"]) - self.assertEqual(awq_config1.to_dict(), awq_config2.to_dict()) - - def test_quantize_awq_from_dict_default(self): - from neural_compressor.onnxrt import get_default_awq_config - - qmodel = self._apply_awq(quant_config=get_default_awq_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_awq_from_dict_beginner(self): - quant_config = { - "awq": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_awq_from_class_beginner(self): - from neural_compressor.onnxrt import AWQConfig - - quant_config = AWQConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_awq_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import AWQConfig - - fp32_config = AWQConfig(weight_dtype="fp32") - quant_config = AWQConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_awq_from_dict_advance(self): - quant_config = { - "awq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - quant_config = { - "awq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_gptq.py b/test/3x/onnxrt/quantization/weight_only/test_gptq.py deleted file mode 100644 index 4309af4e654..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_gptq.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -import shutil -import unittest - -import torch -from optimum.exporters.onnx import main_export -from transformers 
import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestGPTQQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - self.calibration_data_reader = DummyNLPDataloader("hf-internal-testing/tiny-random-gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestGPTQQuant test: {self.id()}") - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _apply_gptq(self, quant_config): - logger.info(f"Test GPTQ with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config, calibration_data_reader=self.calibration_data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_gptq_params_combination(self): - from neural_compressor.onnxrt import GPTQConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
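# The GPTQ-specific options swept below are, roughly: percdamp (fraction of the
# average Hessian diagonal added for numerical stability), blocksize (number of
# columns updated per block), actorder (reorder columns by activation before
# quantizing), mse (use an MSE search when clipping), and perchannel
# (per-channel scales); the removed GPTQConfig in config.py carries the
# authoritative descriptions.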
- gptq_options = { - "weight_dtype": ["int"], - "weight_bits": [4], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - "accuracy_level": [0], - "percdamp": [0.01], - "blocksize": [128], - "actorder": [True, False], - "mse": [True, False], - "perchannel": [True, False], - } - from itertools import product - - keys = GPTQConfig.params_list - for value in product(*gptq_options.values()): - d = dict(zip(keys, value)) - print(d) - quant_config = GPTQConfig(**d) - qmodel = self._apply_gptq(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_gptq_config(self): - from neural_compressor.onnxrt.quantization import GPTQConfig - - gptq_config1 = GPTQConfig(weight_bits=4) - quant_config_dict = { - "gptq": {"weight_bits": 4}, - } - gptq_config2 = GPTQConfig.from_dict(quant_config_dict["gptq"]) - self.assertEqual(gptq_config1.to_dict(), gptq_config2.to_dict()) - - def test_quantize_gptq_from_dict_default(self): - from neural_compressor.onnxrt import get_default_gptq_config - - qmodel = self._apply_gptq(quant_config=get_default_gptq_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_gptq_from_dict_beginner(self): - quant_config = { - "gptq": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_gptq_from_class_beginner(self): - from neural_compressor.onnxrt import GPTQConfig - - quant_config = GPTQConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_gptq_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import GPTQConfig - - fp32_config = GPTQConfig(weight_dtype="fp32") - quant_config = GPTQConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_gptq_from_dict_advance(self): - quant_config = { - "gptq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - quant_config = { - "gptq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_rtn.py b/test/3x/onnxrt/quantization/weight_only/test_rtn.py deleted file mode 100644 index 11a05bc48da..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_rtn.py +++ /dev/null @@ -1,193 +0,0 @@ -import os -import shutil -import unittest - -from 
optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class TestRTNQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestRTNQuant test: {self.id()}") - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _apply_rtn(self, quant_config): - logger.info(f"Test RTN with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - return qmodel - - def test_rtn_params_combination(self): - from neural_compressor.onnxrt import RTNConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
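# Unlike the AWQ/GPTQ tests, RTN (round-to-nearest) needs no calibration data,
# which is why _apply_rtn() above calls _quantize() without a
# calibration_data_reader.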
- rtn_options = { - "weight_dtype": ["int"], - "weight_bits": [4, 3, 8], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - } - from itertools import product - - keys = RTNConfig.params_list - for value in product(*rtn_options.values()): - d = dict(zip(keys, value)) - quant_config = RTNConfig(**d) - qmodel = self._apply_rtn(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_rtn_config(self): - from neural_compressor.onnxrt.quantization import RTNConfig - - rtn_config1 = RTNConfig(weight_bits=4) - quant_config_dict = { - "rtn": {"weight_bits": 4}, - } - rtn_config2 = RTNConfig.from_dict(quant_config_dict["rtn"]) - self.assertEqual(rtn_config1.to_dict(), rtn_config2.to_dict()) - - def test_quantize_rtn_from_dict_default(self): - from neural_compressor.onnxrt import get_default_rtn_config - from neural_compressor.onnxrt.quantization.quantize import _quantize - - qmodel = self._apply_rtn(quant_config=get_default_rtn_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_rtn_from_dict_beginner(self): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - quant_config = { - "rtn": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_rtn(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_rtn_from_class_beginner(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - quant_config = RTNConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_rtn(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_rtn_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_config = RTNConfig(weight_dtype="fp32") - fp32_model = self.gptj - quant_config = RTNConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_rtn_from_dict_advance(self): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - quant_config = { - "rtn": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - fp32_model = self.gptj - quant_config = { - "rtn": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/requirements.txt 
b/test/3x/onnxrt/requirements.txt deleted file mode 100644 index 4165ba5e0a6..00000000000 --- a/test/3x/onnxrt/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -optimum -pytest diff --git a/test/3x/onnxrt/test_autotune.py b/test/3x/onnxrt/test_autotune.py deleted file mode 100644 index 8291d3ef344..00000000000 --- a/test/3x/onnxrt/test_autotune.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import os -import shutil -import unittest -from typing import Callable, Dict, List, Optional, Union -from unittest.mock import patch - -import numpy as np -import onnx -import onnxruntime as ort -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger -from neural_compressor.common.base_tuning import Evaluator, TuningConfig -from neural_compressor.onnxrt import AWQConfig, CalibrationDataReader, GPTQConfig, RTNConfig, SmoohQuantConfig -from neural_compressor.onnxrt.quantization import autotune - -logger = Logger().get_logger() - - -def _create_evaluator_for_eval_fns(eval_fns: Optional[Union[Callable, Dict, List[Dict]]] = None) -> Evaluator: - evaluator = Evaluator() - evaluator.set_eval_fn_registry(eval_fns) - return evaluator - - -class DataReader(CalibrationDataReader): - def __init__(self, model): - model = onnx.load(model) - batch_size = 1 - sequence_length = 1 - self.data = { - "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), - "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), - } - for inp in model.graph.input: - if inp.name in self.data: - continue - if inp.name == "position_ids": - # model is exported with optimum >= 1.14.0 with new input 'position_ids' - self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") - - self.enum_data = None - - def get_next(self): - if self.enum_data is None: - self.enum_data = iter([self.data]) - return next(self.enum_data, None) - - def rewind(self): - self.enum_data = None - - -class TestONNXRT3xAutoTune(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = glob.glob(os.path.join("./gptj", "*.onnx"))[0] - self.data_reader = DataReader(self.gptj) - - @classmethod - def tearDownClass(self): - shutil.rmtree("./gptj", ignore_errors=True) - - @patch("logging.Logger.warning") - def test_auto_tune_warning(self, mock_warning): - acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) - - def eval_acc_fn(model) -> float: - session = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"]) - return next(acc_data) - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=0.5), SmoohQuantConfig(alpha=0.6)]) - with self.assertRaises(SystemExit): - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - 
calibration_data_reader=self.data_reader, - ) - call_args_list = mock_warning.call_args_list - # There may be multiple calls to warning, so we need to check all of them - self.assertIn( - "Please refine your eval_fn to accept model path (str) as input.", [info[0][0] for info in call_args_list] - ) - - def test_sq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.9, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=0.5), SmoohQuantConfig(alpha=0.6)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNotNone(best_model) - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=[0.5, 0.6])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - - def test_rtn_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_group_size=32), RTNConfig(weight_group_size=64)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - def test_awq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[AWQConfig(weight_group_size=32), AWQConfig(weight_group_size=64)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - 
eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[AWQConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - def test_gptq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig( - config_set=[GPTQConfig(weight_group_size=32), GPTQConfig(weight_group_size=64)] - ) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[GPTQConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/test_config.py b/test/3x/onnxrt/test_config.py deleted file mode 100644 index 9b0c49de1b8..00000000000 --- a/test/3x/onnxrt/test_config.py +++ /dev/null @@ -1,251 +0,0 @@ -import copy -import os -import shutil -import unittest - -import numpy as np -import onnx -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -def build_simple_onnx_model(): - A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5]) - C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 5, 2]) - D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 5, 2]) - H = onnx.helper.make_tensor_value_info("H", onnx.TensorProto.FLOAT, [1, 5, 2]) - - e_value = np.random.randint(2, size=(10)).astype(np.float32) - B_init = onnx.helper.make_tensor("B", onnx.TensorProto.FLOAT, [5, 2], e_value.reshape(10).tolist()) - E_init = onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [1, 5, 2], e_value.reshape(10).tolist()) - - matmul_node = onnx.helper.make_node("MatMul", ["A", "B"], ["C"], name="Matmul") - add = onnx.helper.make_node("Add", ["C", "E"], ["D"], name="add") - - f_value = np.random.randint(2, size=(10)).astype(np.float32) - F_init = 
onnx.helper.make_tensor("F", onnx.TensorProto.FLOAT, [1, 5, 2], e_value.reshape(10).tolist()) - add2 = onnx.helper.make_node("Add", ["D", "F"], ["H"], name="add2") - - graph = onnx.helper.make_graph([matmul_node, add, add2], "test_graph_1", [A], [H], [B_init, E_init, F_init]) - model = onnx.helper.make_model(graph) - model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) - return model - - -class TestQuantizationConfig(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - - simple_onnx_model = build_simple_onnx_model() - onnx.save(simple_onnx_model, "simple_onnx_model.onnx") - self.simple_onnx_model = "simple_onnx_model.onnx" - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - os.remove("simple_onnx_model.onnx") - - def setUp(self): - # print the test name - logger.info(f"Running TestQuantizationConfig test: {self.id()}") - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def test_config_white_lst(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - global_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_dtype="fp32", white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst2(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - global_config = RTNConfig(weight_dtype="fp32") - # set operator instance - fc_out_config = RTNConfig(weight_bits=4, white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 1) - self.assertTrue(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst3(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.utils.utility import get_model_info - - global_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_bits=8, white_list=["/h.4/mlp/fc_out/MatMul"]) - quant_config = global_config + fc_out_config - # get model and quantize - fp32_model = self.gptj - model_info = get_model_info(fp32_model, white_op_type_list=["MatMul"]) - logger.info(quant_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4) - - def 
test_config_from_dict(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config = { - "rtn": { - "global": { - "weight_dtype": "int", - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "fc1": { - "weight_dtype": "int", - "weight_bits": 8, - } - }, - } - } - config = RTNConfig.from_dict(quant_config["rtn"]) - self.assertIsNotNone(config.local_config) - - def test_config_to_dict(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config = RTNConfig(weight_bits=4) - fc_out_config = RTNConfig(weight_bits=8) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config) - config_dict = quant_config.to_dict() - self.assertIn("global", config_dict) - self.assertIn("local", config_dict) - - def test_same_type_configs_addition(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config1 = { - "rtn": { - "weight_dtype": "int", - "weight_bits": 4, - "weight_group_size": 32, - }, - } - q_config = RTNConfig.from_dict(quant_config1["rtn"]) - quant_config2 = { - "rtn": { - "global": { - "weight_bits": 8, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "int", - "weight_bits": 4, - } - }, - } - } - q_config2 = RTNConfig.from_dict(quant_config2["rtn"]) - q_config3 = q_config + q_config2 - q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["rtn"]["local"].items(): - for attr, val in op_config.items(): - self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["rtn"]["global"]["weight_bits"]) - - def test_config_mapping(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.utils.utility import get_model_info - - quant_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_bits=8) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config) - # get model and quantize - fp32_model = self.gptj - model_info = get_model_info(fp32_model, white_op_type_list=["MatMul"]) - logger.info(quant_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4) - # test regular matching - fc_config = RTNConfig(weight_bits=3) - quant_config.set_local("/h.[1-4]/mlp/fc_out/MatMul", fc_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.3/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.2/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.1/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - - def test_diff_types_configs_addition(self): - from neural_compressor.onnxrt import GPTQConfig, RTNConfig - - quant_config1 = { - "rtn": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - q_config = RTNConfig.from_dict(quant_config1["rtn"]) - d_config = GPTQConfig(weight_group_size=128) - combined_config = q_config + d_config - combined_config_d = combined_config.to_dict() - logger.info(combined_config) - self.assertIn("rtn", combined_config_d) - self.assertIn("gptq", combined_config_d) - - -class TestQuantConfigForAutotune(unittest.TestCase): - def test_expand_config(self): - # 
test the expand functionalities, the user is not aware it - from neural_compressor.onnxrt import RTNConfig - - tune_config = RTNConfig(weight_bits=[4, 8]) - expand_config_list = RTNConfig.expand(tune_config) - self.assertEqual(expand_config_list[0].weight_bits, 4) - self.assertEqual(expand_config_list[1].weight_bits, 8) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/test_smooth_quant.py b/test/3x/onnxrt/test_smooth_quant.py deleted file mode 100644 index 6974020185b..00000000000 --- a/test/3x/onnxrt/test_smooth_quant.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import os -import shutil -import unittest - -import numpy as np -import onnx -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger -from neural_compressor.onnxrt import CalibrationDataReader, QuantType, SmoohQuantConfig, get_default_sq_config -from neural_compressor.onnxrt.quantization.quantize import _quantize - -logger = Logger().get_logger() - - -class DataReader(CalibrationDataReader): - def __init__(self, model): - model = onnx.load(model) - batch_size = 1 - sequence_length = 1 - self.data = { - "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), - "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), - } - for inp in model.graph.input: - if inp.name in self.data: - continue - if inp.name == "position_ids": - # model is exported with optimum >= 1.14.0 with new input 'position_ids' - self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") - - self.enum_data = None - - def get_next(self): - if self.enum_data is None: - self.enum_data = iter([self.data]) - return next(self.enum_data, None) - - def rewind(self): - self.enum_data = None - - -class TestONNXRT3xSmoothQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = glob.glob(os.path.join("./gptj", "*.onnx"))[0] - self.data_reader = DataReader(self.gptj) - - @classmethod - def tearDownClass(self): - shutil.rmtree("./gptj", ignore_errors=True) - - def test_sq_from_class_beginner(self): - self.data_reader.rewind() - config = get_default_sq_config() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 30) - - def test_sq_auto_tune_from_class_beginner(self): - self.data_reader.rewind() - config = SmoohQuantConfig(alpha="auto", scales_per_op=False) - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 15) - - def test_sq_from_dict_beginner(self): - config = { - "smooth_quant": { - "global": { - "alpha": 0.5, - 
"scales_per_op": False, - }, - } - } - self.data_reader.rewind() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 15) - - def test_sq_auto_tune_from_dict_beginner(self): - config = { - "smooth_quant": { - "global": { - "alpha": "auto", - }, - } - } - self.data_reader.rewind() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 30) - - def test_sq_ort_param_class_beginner(self): - self.data_reader.rewind() - config = SmoohQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8) - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertTrue(2 in [i.data_type for i in model.graph.initializer]) - self.assertTrue(3 not in [i.data_type for i in model.graph.initializer]) - self.assertEqual(num_muls, 30) - - -if __name__ == "__main__": - unittest.main()