From ff3740146a829e845d79266acf233b202843d3fd Mon Sep 17 00:00:00 2001 From: "chen, suyue" Date: Wed, 17 Jul 2024 23:11:15 +0800 Subject: [PATCH] 3.X API installation update (#1935) Signed-off-by: chensuyue --- .azure-pipelines/scripts/install_nc.sh | 4 - .../scripts/ut/3x/coverage.3x_ort | 15 - .azure-pipelines/scripts/ut/3x/run_3x_ort.sh | 35 - .azure-pipelines/ut-3x-ort.yml | 109 -- .github/checkgroup.yml | 13 - README.md | 10 +- docs/source/installation_guide.md | 57 +- neural_compressor/onnxrt/__init__.py | 56 - .../onnxrt/algorithms/__init__.py | 22 - .../onnxrt/algorithms/layer_wise/__init__.py | 17 - .../onnxrt/algorithms/layer_wise/core.py | 289 ----- .../onnxrt/algorithms/smoother/__init__.py | 17 - .../onnxrt/algorithms/smoother/calibrator.py | 237 ---- .../onnxrt/algorithms/smoother/core.py | 668 ---------- .../onnxrt/algorithms/weight_only/__init__.py | 13 - .../onnxrt/algorithms/weight_only/awq.py | 437 ------- .../onnxrt/algorithms/weight_only/gptq.py | 451 ------- .../onnxrt/algorithms/weight_only/rtn.py | 222 ---- .../onnxrt/algorithms/weight_only/utility.py | 335 ----- .../onnxrt/quantization/__init__.py | 50 - .../onnxrt/quantization/algorithm_entry.py | 152 --- .../onnxrt/quantization/autotune.py | 116 -- .../onnxrt/quantization/calibrate.py | 35 - .../onnxrt/quantization/config.py | 614 ---------- .../onnxrt/quantization/quantize.py | 67 - neural_compressor/onnxrt/utils/__init__.py | 24 - neural_compressor/onnxrt/utils/onnx_model.py | 1082 ----------------- neural_compressor/onnxrt/utils/utility.py | 288 ----- requirements_ort.txt | 9 - setup.py | 52 +- .../layer_wise/test_layer_wise.py | 155 --- .../quantization/weight_only/test_awq.py | 219 ---- .../quantization/weight_only/test_gptq.py | 222 ---- .../quantization/weight_only/test_rtn.py | 193 --- test/3x/onnxrt/requirements.txt | 2 - test/3x/onnxrt/test_autotune.py | 304 ----- test/3x/onnxrt/test_config.py | 251 ---- test/3x/onnxrt/test_smooth_quant.py | 127 -- 38 files changed, 43 insertions(+), 6926 deletions(-) delete mode 100644 .azure-pipelines/scripts/ut/3x/coverage.3x_ort delete mode 100644 .azure-pipelines/scripts/ut/3x/run_3x_ort.sh delete mode 100644 .azure-pipelines/ut-3x-ort.yml delete mode 100644 neural_compressor/onnxrt/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/layer_wise/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/layer_wise/core.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/calibrator.py delete mode 100644 neural_compressor/onnxrt/algorithms/smoother/core.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/__init__.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/awq.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/gptq.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/rtn.py delete mode 100644 neural_compressor/onnxrt/algorithms/weight_only/utility.py delete mode 100644 neural_compressor/onnxrt/quantization/__init__.py delete mode 100644 neural_compressor/onnxrt/quantization/algorithm_entry.py delete mode 100644 neural_compressor/onnxrt/quantization/autotune.py delete mode 100644 neural_compressor/onnxrt/quantization/calibrate.py delete mode 100644 neural_compressor/onnxrt/quantization/config.py delete mode 100644 neural_compressor/onnxrt/quantization/quantize.py delete mode 100644 
neural_compressor/onnxrt/utils/__init__.py delete mode 100644 neural_compressor/onnxrt/utils/onnx_model.py delete mode 100644 neural_compressor/onnxrt/utils/utility.py delete mode 100644 requirements_ort.txt delete mode 100644 test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_awq.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_gptq.py delete mode 100644 test/3x/onnxrt/quantization/weight_only/test_rtn.py delete mode 100644 test/3x/onnxrt/requirements.txt delete mode 100644 test/3x/onnxrt/test_autotune.py delete mode 100644 test/3x/onnxrt/test_config.py delete mode 100644 test/3x/onnxrt/test_smooth_quant.py diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh index 2cb175138b4..55b323c56c2 100644 --- a/.azure-pipelines/scripts/install_nc.sh +++ b/.azure-pipelines/scripts/install_nc.sh @@ -10,10 +10,6 @@ elif [[ $1 = *"3x_tf"* ]]; then python -m pip install --no-cache-dir -r requirements_tf.txt python setup.py tf bdist_wheel pip install dist/neural_compressor*.whl --force-reinstall -elif [[ $1 = *"3x_ort" ]]; then - python -m pip install --no-cache-dir -r requirements_ort.txt - python setup.py ort bdist_wheel - pip install dist/neural_compressor*.whl --force-reinstall else python -m pip install --no-cache-dir -r requirements.txt python setup.py bdist_wheel diff --git a/.azure-pipelines/scripts/ut/3x/coverage.3x_ort b/.azure-pipelines/scripts/ut/3x/coverage.3x_ort deleted file mode 100644 index 1404dccbaee..00000000000 --- a/.azure-pipelines/scripts/ut/3x/coverage.3x_ort +++ /dev/null @@ -1,15 +0,0 @@ -[run] -branch = True - -[report] -include = - */neural_compressor/common/* - */neural_compressor/onnxrt/* -exclude_lines = - pragma: no cover - raise NotImplementedError - raise TypeError - if self.device == "gpu": - if device == "gpu": - except ImportError: - except Exception as e: \ No newline at end of file diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh b/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh deleted file mode 100644 index 5f8550ea742..00000000000 --- a/.azure-pipelines/scripts/ut/3x/run_3x_ort.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -python -c "import neural_compressor as nc" -test_case="run 3x ONNXRT" -echo "${test_case}" - -# install requirements -echo "set up UT env..." -pip install -r /neural-compressor/test/3x/onnxrt/requirements.txt -pip install pytest-cov -pip install pytest-html -pip list - -export COVERAGE_RCFILE=/neural-compressor/.azure-pipelines/scripts/ut/3x/coverage.3x_ort -inc_path=$(python -c 'import neural_compressor; print(neural_compressor.__path__[0])') -cd /neural-compressor/test/3x || exit 1 -rm -rf torch -rm -rf tensorflow - -LOG_DIR=/neural-compressor/log_dir -mkdir -p ${LOG_DIR} -ut_log_name=${LOG_DIR}/ut_3x_ort.log -pytest --cov="${inc_path}" -vs --disable-warnings --html=report.html --self-contained-html . 2>&1 | tee -a ${ut_log_name} - -cp report.html ${LOG_DIR}/ - -if [ $(grep -c '== FAILURES ==' ${ut_log_name}) != 0 ] || [ $(grep -c '== ERRORS ==' ${ut_log_name}) != 0 ] || [ $(grep -c ' passed' ${ut_log_name}) == 0 ]; then - echo "Find errors in pytest case, please check the output..." - echo "Please search for '== FAILURES ==' or '== ERRORS =='" - exit 1 -fi - -# if ut pass, collect the coverage file into artifacts -cp .coverage ${LOG_DIR}/.coverage - -echo "UT finished successfully! 
" \ No newline at end of file diff --git a/.azure-pipelines/ut-3x-ort.yml b/.azure-pipelines/ut-3x-ort.yml deleted file mode 100644 index 42636df2314..00000000000 --- a/.azure-pipelines/ut-3x-ort.yml +++ /dev/null @@ -1,109 +0,0 @@ -trigger: none - -pr: - autoCancel: true - drafts: false - branches: - include: - - master - paths: - include: - - neural_compressor/common - - neural_compressor/onnxrt - - test/3x/onnxrt - - test/3x/common - - setup.py - - requirements_ort.txt - - .azure-pipelines/scripts/ut/3x/run_3x_ort.sh - -pool: ICX-16C - -variables: - IMAGE_NAME: "neural-compressor" - IMAGE_TAG: "py310" - UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir - DOWNLOAD_PATH: $(Build.SourcesDirectory)/log_dir - ARTIFACT_NAME: "UT_coverage_report_3x_ort" - REPO: $(Build.Repository.Uri) - -stages: - - stage: ONNXRT - displayName: Unit Test 3x ONNXRT - dependsOn: [] - jobs: - - job: - displayName: Unit Test 3x ONNXRT - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: "commonDockerConfig" - utScriptFileName: "3x/run_3x_ort" - uploadPath: $(UPLOAD_PATH) - utArtifact: "ut_3x" - - - - stage: ONNXRT_baseline - displayName: Unit Test 3x ONNXRT baseline - dependsOn: [] - jobs: - - job: - displayName: Unit Test 3x ONNXRT baseline - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: "gitCloneDockerConfig" - utScriptFileName: "3x/run_3x_ort" - uploadPath: $(UPLOAD_PATH) - utArtifact: "ut_3x_baseline" - repo: $(REPO) - - - stage: Coverage - displayName: "Coverage Compare" - pool: - vmImage: "ubuntu-latest" - dependsOn: [ONNXRT, ONNXRT_baseline] - jobs: - - job: CollectDatafiles - steps: - - script: | - if [[ ! $(docker images | grep -i ${IMAGE_NAME}:${IMAGE_TAG}) ]]; then - docker build -f ${BUILD_SOURCESDIRECTORY}/.azure-pipelines/docker/Dockerfile.devel -t ${IMAGE_NAME}:${IMAGE_TAG} . - fi - docker images | grep -i ${IMAGE_NAME} - if [[ $? 
-ne 0 ]]; then - echo "NO Such Repo" - exit 1 - fi - displayName: "Build develop docker image" - - - task: DownloadPipelineArtifact@2 - inputs: - artifact: - patterns: '*_coverage/.coverage' - path: $(DOWNLOAD_PATH) - - - script: | - echo "--- create container ---" - docker run -d -it --name="collectLogs" -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor ${IMAGE_NAME}:${IMAGE_TAG} /bin/bash - echo "--- docker ps ---" - docker ps - echo "--- collect logs ---" - docker exec collectLogs /bin/bash +x -c "cd /neural-compressor/.azure-pipelines/scripts \ - && bash install_nc.sh 3x_ort \ - && bash ut/3x/collect_log_3x.sh 3x_ort" - displayName: "Collect UT Coverage" - - - task: PublishPipelineArtifact@1 - condition: succeededOrFailed() - inputs: - targetPath: $(UPLOAD_PATH) - artifact: $(ARTIFACT_NAME) - publishLocation: "pipeline" - - - task: Bash@3 - condition: always() - inputs: - targetType: "inline" - script: | - docker exec collectLogs bash -c "rm -fr /neural-compressor/* && rm -fr /neural-compressor/.* || true" - displayName: "Docker clean up" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 4c6691da86a..c1e6e147fab 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -140,16 +140,3 @@ subprojects: - "UT-3x-Torch (Coverage Compare CollectDatafiles)" - "UT-3x-Torch (Unit Test 3x Torch Unit Test 3x Torch)" - "UT-3x-Torch (Unit Test 3x Torch baseline Unit Test 3x Torch baseline)" - - - id: "Unit Tests 3x-ONNXRT workflow" - paths: - - "neural_compressor/common/**" - - "neural_compressor/onnxrt/**" - - "test/3x/onnxrt/**" - - "setup.py" - - "requirements_ort.txt" - checks: - - "UT-3x-ONNXRT" - - "UT-3x-ONNXRT (Coverage Compare CollectDatafiles)" - - "UT-3x-ONNXRT (Unit Test 3x ONNXRT Unit Test 3x ONNXRT)" - - "UT-3x-ONNXRT (Unit Test 3x ONNXRT baseline Unit Test 3x ONNXRT baseline)" diff --git a/README.md b/README.md index 31772f4d025..7e5b65bf351 100644 --- a/README.md +++ b/README.md @@ -19,21 +19,25 @@ Intel® Neural Compressor aims to provide popular model compression techniques s as well as Intel extensions such as [Intel Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow) and [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch). 
In particular, the tool provides the key features, typical examples, and open collaborations as below: -* Support a wide range of Intel hardware such as [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing +* Support a wide range of Intel hardware such as [Intel Gaudi AI Accelerators](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), [Intel Core Ultra Processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; +support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing; support NVidia GPU for some WOQ algorithms like AutoRound and HQQ. 
* Validate popular LLMs such as [LLama2](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies * Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst) ## What's New +* [2024/07] Starting from the 3.0 release, the framework extension API is recommended for quantization. * [2024/07] Performance optimizations and usability improvements on [client-side](https://github.com/intel/neural-compressor/blob/master/docs/3x/client_quant.md). -* [2024/03] A new SOTA approach [AutoRound](https://github.com/intel/auto-round) Weight-Only Quantization on [Intel Gaudi2 AI accelerator](https://habana.ai/products/gaudi2/) is available for LLMs. ## Installation ### Install from pypi ```Shell -pip install neural-compressor +# Install 2.X API + Framework extension API + PyTorch dependency +pip install neural-compressor[pt] +# Install 2.X API + Framework extension API + TensorFlow dependency +pip install neural-compressor[tf] ``` > **Note**: > Further installation methods can be found under [Installation Guide](https://github.com/intel/neural-compressor/blob/master/docs/source/installation_guide.md). check out our [FAQ](https://github.com/intel/neural-compressor/blob/master/docs/source/faq.md) for more details. 
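A quick way to verify the new install options introduced above is a short install-and-import smoke test. The sketch below is not part of this patch; it assumes the `[pt]` extra exposes the PyTorch framework extension API under the `neural_compressor.torch` namespace (and `[tf]` under `neural_compressor.tensorflow`), so adjust the module names if the packaging differs.

```Shell
# Hedged post-install smoke test (module paths are assumptions, not defined by this patch)
pip install neural-compressor[pt]
python -c "import neural_compressor; print(neural_compressor.__version__)"   # 2.X API importable
python -c "import neural_compressor.torch"                                   # PyTorch framework extension API
# TensorFlow flavor: pip install neural-compressor[tf] && python -c "import neural_compressor.tensorflow"
```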
diff --git a/docs/source/installation_guide.md b/docs/source/installation_guide.md index a0e7ad5e47c..f4497806c58 100644 --- a/docs/source/installation_guide.md +++ b/docs/source/installation_guide.md @@ -29,28 +29,28 @@ The following prerequisites and requirements must be satisfied for a successful ### Install from Binary - Install from Pypi - ```Shell - # install stable basic version from pypi - pip install neural-compressor - ``` - ```Shell - # [Experimental] install stable basic + PyTorch framework extension API from pypi - pip install neural-compressor[pt] - ``` - ```Shell - # [Experimental] install stable basic + TensorFlow framework extension API from pypi - pip install neural-compressor[tf] - ``` - -- Install from test Pypi - ```Shell - # install nightly version - git clone https://github.com/intel/neural-compressor.git - cd neural-compressor - pip install -r requirements.txt - # install nightly basic version from pypi - pip install -i https://test.pypi.org/simple/ neural-compressor - ``` +```Shell +# Install 2.X API + Framework extension API + PyTorch dependency +pip install neural-compressor[pt] +``` +```Shell +# Install 2.X API + Framework extension API + TensorFlow dependency +pip install neural-compressor[tf] +``` +```Shell +# Install 2.X API + Framework extension API +# With this install command, some dependencies of the framework extension API are not installed; +# you can install them separately with `pip install -r requirements_pt.txt` or `pip install -r requirements_tf.txt`. +pip install neural-compressor +``` +```Shell +# Framework extension API + PyTorch dependency +pip install neural-compressor-pt +``` +```Shell +# Framework extension API + TensorFlow dependency +pip install neural-compressor-tf +``` ### Install from Source @@ -76,15 +76,20 @@ The AI Kit is distributed through many common channels, including from Intel's w ## System Requirements ### Validated Hardware Environment + +#### Intel® Neural Compressor supports HPUs based on heterogeneous architecture with two compute engines (MME and TPC): +* Intel Gaudi AI Accelerators (Gaudi2) + #### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64): -* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) -* Intel Xeon CPU Max Series (formerly Sapphire Rapids HBM) +* Intel Xeon Scalable processor (Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids) +* Intel Xeon CPU Max Series (Sapphire Rapids HBM) +* Intel Core Ultra Processors (Meteor Lake) #### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture: -* Intel Data Center GPU Flex Series (formerly Arctic Sound-M) -* Intel Data Center GPU Max Series (formerly Ponte Vecchio) +* Intel Data Center GPU Flex Series (Arctic Sound-M) +* Intel Data Center GPU Max Series (Ponte Vecchio) #### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime: diff --git a/neural_compressor/onnxrt/__init__.py b/neural_compressor/onnxrt/__init__.py deleted file mode 100644 index e26d2897dd5..00000000000 --- a/neural_compressor/onnxrt/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from onnxruntime.quantization.calibrate import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantType, QuantFormat -from neural_compressor.onnxrt.utils.utility import register_algo -from neural_compressor.onnxrt.quantization import ( - rtn_quantize_entry, - RTNConfig, - get_default_rtn_config, - gptq_quantize_entry, - GPTQConfig, - get_default_gptq_config, - awq_quantize_entry, - AWQConfig, - get_default_awq_config, - smooth_quant_entry, - SmoohQuantConfig, - get_default_sq_config, - CalibrationDataReader, - autotune, - get_all_config_set, -) - -__all__ = [ - "register_algo", - "rtn_quantize_entry", - "RTNConfig", - "get_default_rtn_config", - "gptq_quantize_entry", - "GPTQConfig", - "get_default_gptq_config", - "awq_quantize_entry", - "AWQConfig", - "get_default_awq_config", - "smooth_quant_entry", - "SmoohQuantConfig", - "get_default_sq_config", - "CalibrationDataReader", - "QuantType", - "QuantFormat", - "CalibrationMethod", - "autotune", - "get_all_config_set", -] diff --git a/neural_compressor/onnxrt/algorithms/__init__.py b/neural_compressor/onnxrt/algorithms/__init__.py deleted file mode 100644 index c1d38b1844c..00000000000 --- a/neural_compressor/onnxrt/algorithms/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from neural_compressor.onnxrt.algorithms.smoother import Smoother -from neural_compressor.onnxrt.algorithms.weight_only.rtn import apply_rtn_on_model -from neural_compressor.onnxrt.algorithms.weight_only.gptq import apply_gptq_on_model -from neural_compressor.onnxrt.algorithms.weight_only.awq import apply_awq_on_model -from neural_compressor.onnxrt.algorithms.layer_wise import layer_wise_quant - -__all__ = ["Smoother", "apply_rtn_on_model", "apply_gptq_on_model", "apply_awq_on_model", "layer_wise_quant"] diff --git a/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py b/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py deleted file mode 100644 index 86c5371fbb3..00000000000 --- a/neural_compressor/onnxrt/algorithms/layer_wise/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from neural_compressor.onnxrt.algorithms.layer_wise.core import layer_wise_quant - -__all__ = ["layer_wise_quant"] diff --git a/neural_compressor/onnxrt/algorithms/layer_wise/core.py b/neural_compressor/onnxrt/algorithms/layer_wise/core.py deleted file mode 100644 index a3eacb6ebc9..00000000000 --- a/neural_compressor/onnxrt/algorithms/layer_wise/core.py +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from copy import deepcopy -from pathlib import Path -from typing import Callable, List, Union - -import onnx -import onnxruntime as ort -import transformers - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import check_model_with_infer_shapes - -logger = Logger().get_logger() - -__all__ = [ - "layer_wise_quant", -] - - -def layer_wise_quant( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - quant_func: Callable, - weight_config: dict, - data_reader: CalibrationDataReader = None, - *args, - **kwargs -) -> ONNXModel: - """Quantize model layer by layer to save memory. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_func (Callable): quantization algo function. - weight_config (dict): quantization config. - data_reader (CalibrationDataReader, optional): data_reader for calibration. Defaults to None. 
- - Returns: - _type_: _description_ - """ - # check whether model shape is inferred - if not check_model_with_infer_shapes(model): - logger.error( - "Before applying layer-wise quantization, please make sure to " - "run symbolic shape inference on your model like follows:\n" - "import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer\n" - "model = onnx.load(your_model_path)\n" - "out = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True)\n" - "onnx.save(out, infer_shape_model_path)\n" - ) - raise ValueError("Fail to run layer-wise quantization.") - - if not isinstance(model, ONNXModel): - model = ONNXModel(model, ignore_warning=True, load_external_data=False) - - origin_model = deepcopy(model) - - providers = kwargs.get("providers", ["CPUExecutionProvider"]) - - # get and check split nodes - split_nodes = origin_model.find_split_nodes() - if len(split_nodes) == 0: - logger.error( - "Can't find split nodes for layer-wise quantization. " - "We recommend applying graph optimization for your model like follows: \n" - "import onnxruntime as ort \n" - "sess_options = ort.SessionOptions() \n" - "sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED " - "# or ORT_ENABLE_BASIC \n" - "sess_options.optimized_model_filepath = 'optimized_model_path' \n" - "ort.InferenceSession(infer_shape_model_path, sess_options)" - ) - raise ValueError("Fail to run layer-wise quantization.") - logger.info( - "Will split model into {} parts to do layer-wise quantization".format( - len([node.name for node in split_nodes]) + 1 - ) - ) - logger.debug( - "Will split model with these nodes for layer-wise quantization: {}".format([node.name for node in split_nodes]) - ) - - split_idx = 1 - model_to_split = [origin_model] - quantized_model_merged = None - - require_data_reader = data_reader is not None - if require_data_reader: - lwq_data_reader = [data_reader] - - while len(model_to_split) != 0: - # prepare model, node and data_reader for current split - split_model = model_to_split.pop(0) - split_node = split_nodes.pop(0) - if require_data_reader: - current_data_reader = lwq_data_reader.pop(0) - - # if no remaining split nodes, it means this is the last split, and the two split models will be saved. 
- save_both_split_models = True if len(split_nodes) == 0 else False - - # split model with given split node - split_model_part_1, split_model_part_2 = split_model.split_model_with_node( - split_node.name, model.model_path, save_both_split_models - ) - if not save_both_split_models: - # append split_model_part_2 to do next split - model_to_split.append(split_model_part_2) - - logger.info("Quantize split model {}".format(split_idx)) - if require_data_reader: - # process data_reader for current split and next split - current_data_reader = _filter_data_reader_for_current_split_model( - split_model_part_1.model, current_data_reader - ) - next_data_reader = _prepare_data_reader_for_next_split_model( - split_model_part_1.model_path, current_data_reader, providers - ) - lwq_data_reader.append(next_data_reader) - - # perform quantization - split_model_part_1_quantized = quant_func( - split_model_part_1, - weight_config=weight_config, - data_reader=current_data_reader, - return_modelproto=False, - **kwargs - ) - else: - # perform quantization - split_model_part_1_quantized = quant_func( - split_model_part_1, weight_config=weight_config, return_modelproto=False, **kwargs - ) - - # check split model is valid - try: - ort.InferenceSession(split_model_part_1_quantized.model.SerializeToString(), providers=providers) - except Exception as e: - logger.error( - "Layer-wise quantized model {} can't be inferred correctly. " - "Please check the raise exception".format(split_idx) - ) - raise e - - # merge split quantized model - if quantized_model_merged is None: - quantized_model_merged = split_model_part_1_quantized - quantized_model_merged.write_external_data_to_new_location(overwrite=True) - else: - quantized_model_merged.merge_split_models(split_model_part_1_quantized) - - split_idx += 1 - # if this is the last split, quantize the last split model - if save_both_split_models: - logger.info("Quantize split model {}".format(split_idx)) - - # quantize split model - if require_data_reader: - # process data_reader for current split - current_data_reader = lwq_data_reader.pop(0) - current_data_reader = _filter_data_reader_for_current_split_model( - split_model_part_2.model, current_data_reader - ) - - # perform quantization - split_model_part_2_quantized = quant_func( - split_model_part_2, - weight_config=weight_config, - data_reader=current_data_reader, - return_modelproto=False, - **kwargs - ) - else: - # perform quantization - split_model_part_2_quantized = quant_func( - split_model_part_2, weight_config=weight_config, return_modelproto=False, **kwargs - ) - - # check split model is valid - try: - ort.InferenceSession(split_model_part_2_quantized.model.SerializeToString(), providers=providers) - except Exception as e: - logger.error( - "Layer-wise quantized model {} can't be inferred correctly. 
" - "Please check the raise exception".format(split_idx) - ) - raise e - - # merge split quantized model - if quantized_model_merged is None: - quantized_model_merged = split_model_part_2_quantized - quantized_model_merged.write_external_data_to_new_location(overwrite=True) - else: - quantized_model_merged.merge_split_models(split_model_part_2_quantized) - - # reload external data to prevent external data file path errors - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(quantized_model_merged.model, os.path.dirname(quantized_model_merged.model_path)) - - return quantized_model_merged - - -class DataReader(CalibrationDataReader): - """Data reader for layer-wise quantization.""" - - def __init__(self, data_list): - self.data_list = data_list - self.iter_next = iter(self.data_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.data_list) - - -def _filter_data_reader_for_current_split_model(model: onnx.ModelProto, data_reader: CalibrationDataReader): - """Filter data reader to remove data that is not in model input. - - Args: - model (onnx.ModelProto): onnx model. - data_reader (CalibrationDataReader): data reader. - - Returns: - CalibrationDataReader: filtered data reader. - """ - filter_inputs = [] - input_names = [input.name for input in model.graph.input] - while True: - inputs = data_reader.get_next() - if not inputs: - break - filter_input = { - input_name: input_tensor for input_name, input_tensor in inputs.items() if input_name in input_names - } - filter_inputs.append(filter_input) - return DataReader(filter_inputs) - - -def _prepare_data_reader_for_next_split_model( - model_path: str, - data_reader: CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], -): - """Prepare data reader for next split model. - - Get data output of current split model and save for next split model. - - Args: - model (str): path to onnx model. - data_reader (CalibrationDataReader): data reader - providers (List[str], optional): providers to use. Defaults to ["CPUExecutionProvider"]. - - Returns: - CalibrationDataReader: data reader for next split model. - """ - data_reader = deepcopy(data_reader) - - data_reader_for_next_split_model = [] - session = ort.InferenceSession(model_path, providers=providers) - output_names = [output.name for output in session.get_outputs()] - while True: - inputs = data_reader.get_next() - if not inputs: - break - out = session.run(None, inputs) - inputs.update({name: value for name, value in zip(output_names, out)}) - data_reader_for_next_split_model.append(inputs) - return DataReader(data_reader_for_next_split_model) diff --git a/neural_compressor/onnxrt/algorithms/smoother/__init__.py b/neural_compressor/onnxrt/algorithms/smoother/__init__.py deleted file mode 100644 index 2e76dc06aee..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from neural_compressor.onnxrt.algorithms.smoother.core import Smoother - -__all__ = ["Smoother"] diff --git a/neural_compressor/onnxrt/algorithms/smoother/calibrator.py b/neural_compressor/onnxrt/algorithms/smoother/calibrator.py deleted file mode 100644 index ddf009ea829..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/calibrator.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Calibration for smooth quant.""" - -import sys -import tempfile -from importlib.util import find_spec -from pathlib import Path -from typing import List - -import numpy as np -import onnx -import onnx.numpy_helper as numpy_helper -import onnxruntime - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel - -logger = Logger().get_logger() - -__all__ = ["Calibrator"] - - -class Calibrator: - """Dump information for smooth quant.""" - - def __init__( - self, - model: ONNXModel, - dataloader: CalibrationDataReader, - iterations: List[int] = [], - providers: List[str] = ["CPUExecutionProvider"], - **kwargs, - ): - """Initialize a Calibrator to dump information. - - Args: - model (ONNXModel): ONNXModel object. - dataloader (CalibrationDataReader): user implemented object to read in and preprocess calibration dataset. - iterations (List[int], optional): tensor of which iteration will be collected. Defaults to []. - providers (List[str], optional): execution provider for onnxruntime. Defaults to ["CPUExecutionProvider"]. - """ - self.model_wrapper = model - self.dataloader = dataloader - self.augmented_model = None - self.iterations = iterations - self.providers = providers - - def _check_is_group_conv(self, node): - """Check the op is group wised or not(depthwise conv is excluded,return false). - - Args: - node: The op node - - Returns: - Bool: group wised True, otherwise False, depthwise False - """ - name_to_indices = {} - for index, i in enumerate(self.model_wrapper.initializer()): - name_to_indices[i.name] = index - - if node.op_type == "Conv": - group = 1 - for attr in node.attribute: - if hasattr(attr, "name"): - if attr.name == "group": - group = attr.i - break - # currently only normal conv and depthwise conv are supported - if group > 1: # group conv, need to check depthwise or not - weight_name = node.input[1] - weight_shape = numpy_helper.to_array( - self.model_wrapper.initializer()[name_to_indices[weight_name]] - ).shape - input_channel = weight_shape[1] - if input_channel != 1: # TODO: need to double check - return True - return False - - def _get_input_tensor_of_ops(self, op_types: List[str] = ["MatMul", "Gemm", "Conv", "FusedConv"]): - """Traverse the graph and get all the data tensors flowing into layers of {op_types}. - - Group conv is excluded. 
- # TODO: the tensors could be set/filtered in configuration. - - Args: - op_types (List[str], optional): The op types whose input tensor will be dumped. - Defaults to ["MatMul", "Gemm", "Conv", "FusedConv"]. - - Returns: - dict: A dict of dumped tensor to node info - """ - tensors_to_node = {} - initializers = {i.name: i for i in self.model_wrapper.initializer()} - - for node in self.model_wrapper.nodes(): - if len(op_types) == 0 or node.op_type in op_types: - if node.op_type in ["Conv", "FusedConv"] and self._check_is_group_conv(node): - continue - # also need to check whether the layer has weight - if len(node.input) >= 2 and node.input[1] in initializers.keys(): - tensors_to_node.setdefault(node.input[0], []).append([node.name, node.input, node.output]) - return tensors_to_node - - def _get_max_per_channel(self, datas, percentile): - """Get the max values per input channel. - - Args: - datas: The tensors - percentile: percentile of calibration to remove outliers - - Returns: - The max values per input channel - """ - permute_datas = [] - for data in datas: - if len(data.shape) == 3: # TODO: mammul batchsize*seq*inchannel, conv:batchsize*inchannle*f*f - tensor = np.abs(np.reshape(data, (-1, data.shape[-1]))) - permute_datas.append(tensor) - elif len(data.shape) == 4: - tensor = np.swapaxes(data, 1, -1) - tensor = np.abs(np.reshape(tensor, (-1, tensor.shape[-1]))) - permute_datas.append(tensor) - elif len(data.shape) == 2: - permute_datas.append(np.abs(data)) - else: - assert False, "not supported" - permute_datas = np.stack(permute_datas, axis=0) - permute_datas = permute_datas.reshape(-1, permute_datas.shape[-1]) - max_per_channels = np.percentile(permute_datas, percentile, axis=0) - max_per_channels = max_per_channels.astype(np.single) - return max_per_channels - - def get_intermediate_outputs(self): - so = onnxruntime.SessionOptions() - if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover - from onnxruntime_extensions import get_library_path - - so.register_custom_ops_library(get_library_path()) - - providers = self.providers if "TensorrtExecutionProvider" not in self.providers else ["CUDAExecutionProvider"] - if self.model_wrapper.is_large_model: # pragma: no cover - with tempfile.TemporaryDirectory(prefix="ort.calib.") as tmp_dir: - onnx.save_model( - self.model_wrapper.model, - Path(tmp_dir).joinpath("augment.onnx").as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - session = onnxruntime.InferenceSession( - Path(tmp_dir).joinpath("augment.onnx").as_posix(), so, providers=providers - ) - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model_wrapper.model, Path(tmp_dir).as_posix()) - else: - session = onnxruntime.InferenceSession( - self.model_wrapper.model.SerializeToString(), so, providers=providers - ) - node_output_names = [output.name for output in session.get_outputs()] - output_dicts = {} - input_name_to_nodes = self.model_wrapper.input_name_to_nodes() - output_name_to_node = self.model_wrapper.output_name_to_node() - name_to_node = {} - for data_name in node_output_names: - node = None - if data_name in output_name_to_node: - node = output_name_to_node[data_name] - elif data_name in input_name_to_nodes: - node = input_name_to_nodes[data_name][0] - assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name) - name_to_node[data_name] = node.name - - def _collect_data(ort_inputs): - for 
output_idx, output in enumerate(session.run(None, ort_inputs)): - output_dicts.setdefault(node_output_names[output_idx], []).append(output) - - idx = 0 - while True: - inputs = self.dataloader.get_next() - if not inputs: - break - if self.iterations != []: - if idx > max(self.iterations): - break - if idx in self.iterations: - _collect_data(inputs) - else: - _collect_data(inputs) - idx += 1 - return output_dicts - - def calib_smooth(self, op_types, percentile: float = 99.999): - """Smooth model calibration. - - Mainly get the max info per channel of input tensors. - - Args: - op_types (_type_): The op types whose input tensor will be dumped. - percentile (float, optional): Percentile of calibration to remove outliers. - Defaults to 99.999. - - Returns: - max_vals_per_channel: max values per channel of input tensors - shape_infos: The shape information of input tensors - """ - logger.info("Start smooth model calibration.") - # add the input tensors of {op_types} to outputs of the model - tensors_to_node = self._get_input_tensor_of_ops(op_types) - self.model_wrapper.add_tensors_to_outputs(tensors_to_node.keys()) - output_dicts = self.get_intermediate_outputs() - - # remove the input tensors of {op_types} to outputs of the model - self.model_wrapper.remove_tensors_from_outputs(tensors_to_node.keys()) - max_vals_per_channel = {} - shape_infos = {} - - for key, val in tensors_to_node.items(): - max_val_per_channel = self._get_max_per_channel(output_dicts[key], percentile=percentile) - max_vals_per_channel[key] = max_val_per_channel - shape_infos[key] = output_dicts[key][0].shape - for item in val: - shape_infos[item[1][1]] = self.model_wrapper.get_initializer(item[1][1]).dims - return max_vals_per_channel, shape_infos, tensors_to_node diff --git a/neural_compressor/onnxrt/algorithms/smoother/core.py b/neural_compressor/onnxrt/algorithms/smoother/core.py deleted file mode 100644 index 50227bf3994..00000000000 --- a/neural_compressor/onnxrt/algorithms/smoother/core.py +++ /dev/null @@ -1,668 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Smoother for onnxrt.""" - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -from onnx import helper, numpy_helper -from onnx import onnx_pb as onnx_proto - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.algorithms.smoother.calibrator import Calibrator -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - get_qrange_for_qType, - is_B_transposed, - quantize_data, - simple_progress_bar, -) - -logger = Logger().get_logger() - -__all__ = ["Smoother"] - -_dtype_map = { - np.dtype("float32"): 1, - np.dtype("uint8"): 2, - np.dtype("int8"): 3, - np.dtype("int32"): 6, - np.dtype("int64"): 7, - np.dtype("float16"): 10, - np.dtype("double"): 11, -} - - -def _get_quant_dequant_output(model, input_data, output_data, providers): - """Get loss between fp32 output and QDQ output. - - Args: - model (object): model - input_data (numpy.ndarray): fp32 input - output_data (numpy.ndarray): fp32 output - providers (list): execution provider - """ - import onnxruntime as ort - - input_data = _quant_dequant_data(input_data, 2, "asym") - sess = ort.InferenceSession(model.SerializeToString(), providers=providers) - preds = sess.run(None, {model.graph.input[0].name: input_data}) - loss = np.sum(np.abs(output_data - preds) ** 2) - return loss - - -def _make_sub_graph(node, inits, input_data, output_data, opset, ir_version): - """Build a model with the specific node. - - Args: - node (object): node - inits (list): initializer inputs of this node - input_data (numpy.ndarray): fp32 input - output_data (numpy.ndarray): fp32 output - opset (object): opset of the model - ir_version (object): ir_version of the model - """ - from onnx import helper - - input = helper.make_tensor_value_info(node.input[0], _dtype_map[input_data.dtype], input_data.shape) - output = helper.make_tensor_value_info(node.output[0], _dtype_map[output_data.dtype], output_data.shape) - graph = helper.make_graph([node], "sub_graph", [input], [output], inits) - model = helper.make_model(graph, opset_imports=opset) - model.ir_version = ir_version - return model - - -def _quant_dequant_data(data, qType=3, scheme="sym"): - """Quantize and then dequantize data. - - Args: - data (numpy.ndarray): target data - qType (int): data type - scheme (str): sym or asym quantization - """ - rmin, rmax, zero_point, scale, quantized_data = quantize_data( - data.flatten().tolist(), get_qrange_for_qType(qType, False), qType, scheme - ) - return ((quantized_data - zero_point) * scale).astype(data.dtype).reshape(data.shape) - - -class Smoother: - """Fake input channel quantization. - - For more details please refer to: - [1] SmoothQuant: Accurate and Efficient - Post-Training Quantization for Large Language Models - [2] SPIQ: Data-Free Per-Channel Static Input Quantization - We only support inplace mode which means the model weights will be changed, - you can call recover function to recover the weights if needed. 
- """ - - def __init__( - self, - model: Union[onnx.ModelProto, ONNXModel, Path, str], - dataloader: CalibrationDataReader, - providers: List[str] = ["CPUExecutionProvider"], - ): - """Initialize the attributes of class.""" - self.model = model if isinstance(model, ONNXModel) else ONNXModel(model, load_external_data=True) - self.value_infos = {vi.name: vi for vi in self.model.model.graph.value_info} - self.value_infos.update({ot.name: ot for ot in self.model.model.graph.output}) - self.value_infos.update({it.name: it for it in self.model.model.graph.input}) - self.dataloader = dataloader - self.providers = providers - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_added_value_info = [] - self.new_init_tensors = [] # scales_tensor - self.scales_per_op = True - self.replace_input = [] - self.ops_to_absorb = [] - self.max_vals_per_channel = None - self.shape_info = None - self.tensors_to_node = None - self._build_absorb_function() - - def transform( - self, - alpha: Union[float, str] = 0.5, - folding: bool = True, - percentile: float = 99.999, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - scales_per_op: bool = True, - calib_iter: int = 100, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - *args, - **kwargs - ): - """The main entry of smooth quant. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - percentile (float, optional): percentile of calibration to remove outliers. - Defaults to 99.999. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy - False, ops with the same input will share a scale, mainly for performance. - Defaults to True. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - auto_alpha_args (_type_, optional): alpha args for auto smooth. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. 
- - Returns: - onnx.ModelProto: A FP32 model with the same architecture as the orig model - but with different weight which will be benefit to quantization - """ - self.scales_per_op = scales_per_op - self.clean() - if isinstance(alpha, float) and (alpha < 0 or alpha > 1): - logger.warning("alpha should be a float value in [0, 1] or 'auto' ") - if alpha < 0: - alpha = 0 - logger.warning("reset alpha to 0 ") - elif alpha > 1.0: - alpha = 1.0 - logger.warning("reset alpha to 1.0 ") - - self._dump_op_info(percentile, op_types, calib_iter) - - if alpha == "auto": - alpha = self._auto_tune_alpha(calib_iter, **auto_alpha_args) - - scales = self._get_smooth_scales(alpha) - self._insert_smooth_mul_op(scales) - self._adjust_weights(scales) - - self.model.add_nodes(self.new_added_mul_nodes) - self.model.model.graph.value_info.extend(self.new_added_value_info) - self.model.add_initializers(self.new_init_tensors) - for node, old_input_name, new_input_name in self.replace_input: - self.model.replace_node_input(node, old_input_name, new_input_name) - - self.model.update() - if folding: - self._fold_scale(scales) - self.model.topological_sort() - self.model.remove_unused_nodes() - return self.model.model - - def _dump_op_info(self, percentile, op_types, iterations): - """Dump op info for smooth quant. - - Args: - percentile (float): percentile of calibration to remove outliers - op_types (list): the op type to be smooth quantized - iterations (int): iterations - """ - calibrator = Calibrator( - self.model, - self.dataloader, - iterations=list(range(0, iterations)), - backend=self.providers, - ) - - self.max_vals_per_channel, self.shape_info, self.tensors_to_node = calibrator.calib_smooth(op_types, percentile) - for node in self.model.nodes(): - for out in node.output: - if ( - out in self.tensors_to_node - and node.op_type in self.could_absorb_optype - and self.model.get_initializer(node.input[1]) is not None - ): - self.ops_to_absorb.append(node.name) - - def recover(self): - """Recover the model weights.""" - for tensor_name, nodes in self.tensors_to_node.items(): - for node_info in nodes: - key = node_info[0] if self.scales_per_op else tensor_name - if key not in self.tensor_scales_info: - continue - input = node_info[1][1] - weight = numpy_helper.to_array( - self.model.get_initializer(input), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - scale = self.tensor_scales_info[key] - new_weight = weight * scale - self.model.set_initializer(input, new_weight) - - for node, old_input_name, new_input_name in self.replace_input: - self.model.replace_node_input(node, new_input_name, old_input_name) - - for value_info in self.new_added_value_info: - self.model.model.graph.value_info.remove(value_info) - - self.model.remove_nodes(self.new_added_mul_nodes) - self.model.remove_initializers(self.new_init_tensors) - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_init_tensors = [] - self.new_added_value_info = [] - self.replace_input = [] - - def clean(self): - """Clean data collected from calibration.""" - self.tensor_scales_info = {} - self.new_added_mul_nodes = [] - self.new_init_tensors = [] - self.new_added_value_info = [] - self.replace_input = [] - - def _build_absorb_function(self): - """Build function mapping for scale folding.""" - from onnx import numpy_helper - - def norm(node, scale): # pragma: no cover - for idx in [1, 2]: - tensor = self.model.get_initializer(node.input[idx]) - new_tensor = ( - numpy_helper.to_array(tensor, 
os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[idx], new_tensor) - self.tensor_scales_info[node.input[idx]] = ( - 1.0 / scale - if node.input[idx] not in self.tensor_scales_info - else self.tensor_scales_info[node.input[idx]] * 1.0 / scale - ) - return True - - def mul(node, scale): # pragma: no cover - if all([self.model.get_initializer(inp) is None for inp in node.input]): - return False - for inp in node.input: - if self.model.get_initializer(inp) is not None: - key = node.input[0].split("_smooth_output")[0] - tensor = self.model.get_initializer(inp) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(inp, new_tensor) - self.tensor_scales_info[key] = ( - 1.0 / scale - if key not in self.tensor_scales_info - else 1.0 / scale * self.tensor_scales_info[key] - ) - return True - - def conv(node, scale): # pragma: no cover - if len(node.input) > 2: - if self.model.get_initializer(node.input[2]) is not None: - tensor = self.model.get_initializer(node.input[2]) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[2], new_tensor) - self.tensor_scales_info[node.input[2]] = 1.0 / scale - scale = scale.reshape(-1, 1, 1, 1) - tensor = self.model.get_initializer(node.input[1]) - new_tensor = ( - numpy_helper.to_array(tensor, os.path.dirname(self.model.model_path)) * scale - if self.model.model_path is not None - else numpy_helper.to_array(tensor) * scale - ) - self.model.set_initializer(node.input[1], new_tensor) - self.tensor_scales_info[node.input[1]] = ( - 1.0 / scale - if node.input[1] not in self.tensor_scales_info - else self.tensor_scales_info[node.input[1]] * 1.0 / scale - ) - return True - - self.could_absorb_optype = { - "LayerNormalization": norm, - "BatchNormalization": norm, - "InstanceNormalization": norm, - "SimplifiedLayerNormalization": mul, - "MatMul": mul, - "Gemm": mul, - "Conv": conv, - "FusedConv": conv, - "Mul": mul, - } - - def _fold_scale(self, scales): - """Absorb the scale to the operator at output channel. - - Args: - scales (dict): scales for smooth quant, {tensor_name: smooth quant scale} - """ - remove_nodes = [] - for node in self.model.nodes(): - if node.op_type == "Mul" and node.name.endswith("_smooth_mul") and node not in remove_nodes: - parent = self.model.get_parent(node, 0) - if parent is None: - continue - if parent.op_type in self.could_absorb_optype and len(self.model.get_children(parent)) == 1: - if node.output[0].split("_smooth_output")[0] in scales: - if self.could_absorb_optype[parent.op_type]( - parent, 1.0 / scales[node.output[0].split("_smooth_output")[0]] - ): - remove_nodes.append(node) - children = [i for i in self.model.nodes() if node.output[0] in i.input] - for child in children: - for idx, inp in enumerate(child.input): - if inp == node.output[0]: - child.input[idx] = node.input[0] - self.model.remove_nodes(remove_nodes) - - def _get_output_loss(self, node_name, scale, calib_iter): - """Get output loss of specific node after inserting QDQ pair. 
- - Args: - node_name (str): node name - scale (float): scale of the specific node - calib_iter (int): iterations - """ - import onnxruntime as ort - - node = [i for i in self.model.nodes() if i.name == node_name] - loss = 0 - if len(node) > 0: - node = node[0] - orig_outputs = self.model.output() - added_tensors = [node.input[0], node.output[0]] - self.model.add_tensors_to_outputs(added_tensors) - - session = ( - ort.InferenceSession(self.model.model_path + "_augment.onnx", providers=self.providers) - if self.model.is_large_model - else ort.InferenceSession(self.model.model.SerializeToString(), providers=self.providers) - ) - base_dir = "" if not self.model.is_large_model else os.path.dirname(self.model.model_path) - weight = onnx.numpy_helper.to_array(self.model.get_initializer(node.input[1]), base_dir) - weight_q = _quant_dequant_data(weight) - - self.model.set_initializer(node.input[1], weight_q) - inits = [self.model.get_initializer(i) for i in node.input if self.model.get_initializer(i) is not None] - - model = None - idx = 1 - while True: - inputs = self.dataloader.get_next() - if not inputs: - break - if idx > calib_iter: - break - - outputs = session.run(added_tensors, inputs) - if model is None: - model = _make_sub_graph( - node, - inits, - outputs[0], - outputs[1], - self.model.model.opset_import, - self.model.model.ir_version, - ) - loss += _get_quant_dequant_output(model, outputs[0] * scale, outputs[1], self.providers) - - self.model.remove_tensors_from_outputs([i for i in added_tensors if i not in orig_outputs]) - self.model.set_initializer(node.input[1], weight) - return loss - - def _reshape_scale_for_input(self, tensor, key): - """Reshape the scale for input feature in channel. - - Args: - tensor (str): tensor name - key (str): scale key of this tensor - """ - if len(self.shape_info[tensor]) == 4: - scale = np.reshape(self.tensor_scales_info[key], (1, self.tensor_scales_info[key].shape[1], 1, 1)) - else: - scale = np.reshape(self.tensor_scales_info[key], (1, self.tensor_scales_info[key].shape[0])) - return scale - - def _auto_tune_alpha( - self, - calib_iter, - alpha_min: float = 0.3, - alpha_max: float = 0.7, - alpha_step: float = 0.05, - attn_method: str = "min", - ): - """Perform alpha-tuning to obtain layer-wise optimal alpha values and adjust parameters accordingly. - - Args: - calib_iter (int): iterations - alpha_min (float): min value of alpha search space. - alpha_max (float): max value of alpha search space. - alpha_step (float): step size of alpha search space. - attn_method (str): criterion method used on attention ops; currently min, max and mean are supported. 
- """ - logger.info("auto tuning alpha") - - alpha_space = np.arange(alpha_min, alpha_max, alpha_step).tolist() - - optimal_alphas = {} - if self.model.is_large_model: - onnx.save_model( - self.model.model, - self.model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - location="weights.pb", - convert_attribute=False, - ) - - ## Searching optimal alphas - for tensor_name, node_infos in self.tensors_to_node.items(): - for node_info in node_infos: - loss_alpha = {} - key = node_info[0] if self.scales_per_op else tensor_name - node = self.model.get_node(node_info[0]) - for alpha in alpha_space: - scale = self._get_smooth_scales(alpha, [key]) - self._adjust_weights(scale) - input_scale = ( - self._reshape_scale_for_input(tensor_name, key) - if not (node.op_type == "Gemm" and is_B_transposed(node)) - else self.tensor_scales_info[key] - ) - loss = self._get_output_loss(node_info[0], input_scale, calib_iter) - loss_alpha[alpha] = loss - if key not in optimal_alphas: # Update alpha results - optimal_alphas[key] = alpha - else: - optimal_alphas[key] = ( - alpha - if optimal_alphas[key] in loss_alpha and loss < loss_alpha[optimal_alphas[key]] - else optimal_alphas[key] - ) - self.recover() - logger.info("auto tuning alpha done") - if self.model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model.model, os.path.split(self.model.model_path)[0]) - os.remove(self.model.model_path + "_augment.onnx") - os.remove(os.path.join(os.path.dirname(self.model.model_path), "weights.pb")) - return optimal_alphas - - def _get_smooth_scales(self, alpha, target_list=[]): - """Get the smooth scales for. - - The ops with the same input will share one mul layer. - TODO support individual scales for each layer. 
- - Args: - alpha: smooth alpha in paper - target_list: target objects to get scale, [] means get all scales - - Returns: - the smooth scales for weights, currently one input tensor only have one scale - """ - logger.info("Start smooth scales collection.") - scales = {} - for tensor, nodes in self.tensors_to_node.items(): - # if scales_per_op the key of scales is the node name, otherwise the activation of node - if self.scales_per_op: - for node_info in nodes: - node = self.model.get_node_by_weight(node_info[1][1]) - if len(target_list) > 0 and node_info[0] not in target_list: - continue - weight = numpy_helper.to_array( - self.model.get_initializer(node_info[1][1]), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and is_B_transposed(node) - ): - weight = np.moveaxis(weight, 0, 1) - specific_alpha = alpha[node_info[0]] if isinstance(alpha, dict) else alpha - scales[node_info[0]] = self._get_smooth_scale(weight, specific_alpha, tensor) - else: - if len(target_list) > 0 and tensor not in target_list: - continue - weights_in_channel_max = [] - for node_info in nodes: - node = self.model.get_node_by_weight(node_info[1][1]) - weight = numpy_helper.to_array( - self.model.get_initializer(node_info[1][1]), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if (len(weight.shape) == 4 and weight.shape[1] != 1) or ( - node.op_type == "Gemm" and is_B_transposed(node) - ): - weight = np.moveaxis(weight, 0, 1) - weight = weight.reshape(weight.shape[0], -1) - cur_max = np.amax(weight, axis=-1) - weights_in_channel_max.append(cur_max) - weights_stack = np.stack(weights_in_channel_max, axis=-1) - specific_alpha = alpha[tensor] if isinstance(alpha, dict) else alpha - scales[tensor] = self._get_smooth_scale(weights_stack, specific_alpha, tensor) - - return scales - - def _get_smooth_scale(self, weights, specific_alpha, tensor): - """Get smooth scale for specific weight. - - Args: - weights (numpy.ndarray): weight data - specific_alpha (float): current alpha for this weights - tensor (str): tensor name - """ - weights = np.abs(weights.reshape(weights.shape[0], -1)) - weights_max = np.amax(weights, axis=-1) - input_power = np.power(self.max_vals_per_channel[tensor], specific_alpha) - weight_power = np.power(weights_max, 1 - specific_alpha) - weight_power = np.clip(weight_power, a_min=1e-5, a_max=None) - scale = np.clip(input_power / weight_power, a_min=1e-5, a_max=None) - return scale - - def _insert_smooth_mul_op(self, scales): - """Insert the Mul after inupt. - - The ops with the same input will share one mul layer. 
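# Illustrative sketch of the rewrite performed by _insert_smooth_mul_op (above) together with
# _adjust_weights (further down): a Mul divides the activation by the smooth scale while the
# same scale is folded into the weight initializer, so the float output is unchanged and only
# the activation range shrinks. Shapes and values are made up for the illustration.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8)) * 50.0        # activation with a large dynamic range
w = rng.normal(size=(8, 3)) * 0.1         # weight with a small dynamic range
alpha = 0.5
scale = np.clip(np.abs(x).max(axis=0) ** alpha, 1e-5, None) / \
        np.clip(np.abs(w).max(axis=1) ** (1 - alpha), 1e-5, None)

x_smoothed = x * (1.0 / scale)            # what the inserted "*_smooth_mul" node computes
w_adjusted = w * scale[:, None]           # what gets written back to the weight initializer
assert np.allclose(x @ w, x_smoothed @ w_adjusted)   # same output, smoother activations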
- - Args: - scales (dict): The smooth scales - """ - for key in scales.keys(): - input_name = key if not self.scales_per_op else self.model.get_node(key).input[0] - weight_name = ( - self.tensors_to_node[key][0][1][1] if not self.scales_per_op else self.model.get_node(key).input[1] - ) - scale_factor = 1.0 / scales[key] - if ( - len(self.shape_info[weight_name]) == 3 or len(self.shape_info[weight_name]) == 2 - ): # the last dim is input channel - pass - elif len(self.shape_info[weight_name]) == 4: - scale_factor = np.reshape(scale_factor, (1, -1, 1, 1)) - else: - assert False, "not support" - name = key + "_" + "smooth_scale" - scale_tensor = helper.make_tensor( - name=key + "_" + "smooth_scale", - data_type=onnx_proto.TensorProto.FLOAT, - dims=scale_factor.shape, - vals=scale_factor.flatten().tolist(), - ) - self.new_init_tensors.append(scale_tensor) - mul_output_name = key + "_smooth_output" - mul_node = helper.make_node( - "Mul", - inputs=[input_name, key + "_" + "smooth_scale"], - outputs=[mul_output_name], - name=key + "_smooth_mul", - ) - self.new_added_mul_nodes.append(mul_node) - if input_name in self.value_infos: - value_info = copy.deepcopy(self.value_infos[input_name]) - value_info.name = mul_node.output[0] - self.new_added_value_info.append(value_info) - if self.scales_per_op: - self.replace_input.append([self.model.get_node(key), input_name, mul_output_name]) - else: - for node_info in self.tensors_to_node[key]: - self.replace_input.append([self.model.get_node(node_info[0]), key, mul_output_name]) - - def _adjust_weights(self, scales): - """Adjust the weights with scale. - - Args: - scales (dict): The input scales - """ - for idx, (tensor_name, nodes) in enumerate(self.tensors_to_node.items()): - simple_progress_bar(len(self.tensors_to_node), idx + 1) - for node_info in nodes: - key = node_info[0] if self.scales_per_op else tensor_name - if key not in scales: - continue - input = node_info[1][1] - node = self.model.get_node_by_weight(input) - weight = numpy_helper.to_array( - self.model.get_initializer(input), - base_dir=os.path.dirname(self.model.model_path) if self.model.model_path is not None else "", - ) - if len(weight.shape) == 2: - scale = ( - np.expand_dims(scales[key], axis=0) - if node.op_type == "Gemm" and is_B_transposed(node) - else np.expand_dims(scales[key], axis=-1) - ) - new_weight = weight * scale - elif len(weight.shape) == 4: # TODO need to check conv - node = self.model.get_node_by_weight(input) - if ( - weight.shape[1] == 1 - and "group" in [i.name for i in node.attribute] - and [i for i in node.attribute if i.name == "group"][0].i > 1 - ): - scale = np.reshape(scales[key], (-1, 1, 1, 1)) - else: - scale = np.reshape(scales[key], (1, -1, 1, 1)) - new_weight = weight * scale - else: - assert False, "not support" - self.tensor_scales_info[key] = 1.0 / scale - - new_tensor = numpy_helper.from_array(new_weight, input) - self.model.get_initializer(input).CopyFrom(new_tensor) diff --git a/neural_compressor/onnxrt/algorithms/weight_only/__init__.py b/neural_compressor/onnxrt/algorithms/weight_only/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/onnxrt/algorithms/weight_only/awq.py b/neural_compressor/onnxrt/algorithms/weight_only/awq.py deleted file mode 100644 index 647d0a9d25e..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/awq.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.algorithms.weight_only.rtn import rtn_quantize -from neural_compressor.onnxrt.algorithms.weight_only.utility import pad_tensor, prepare_inputs, qdq_tensor -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import AWQConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ONNXRT116_VERSION, ONNXRT1161_VERSION, dtype_mapping - -logger = Logger().get_logger() - -__all__ = ["apply_awq_on_model", "awq_quantize"] - - -def _get_weight_scale(weight, group_size): - """Get the scale of weight.""" - org_shape = weight.shape - weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight - scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0) - return scale - - -def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): - """Apply scale for salient weight.""" - best_scales = {} - new_init_tensors = [] - new_added_mul_nodes = [] - replace_input = [] - updated_nodes = [] - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - - for parent, nodes in absorb_pairs.items(): - if any([node.input[0] not in output_dicts for node in nodes]): - logger.warning( - "Miss input tensors of nodes {} during AWQ, skip it!".format( - ", ".join([node.name for node in nodes if node.input[0] not in output_dicts]) - ) - ) - continue - inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) - inp_scale = np.mean(np.reshape(np.abs(inp), (-1, inp[0].shape[-1])), axis=0) - dtype = None - weight = [] - org_out = [] - for node in nodes: - if node.name in weight_config and weight_config.get(node.name, "fp32") != "fp32": - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - 
scheme = weight_config[node.name]["scheme"] - break - - # search scale - best_error = float("inf") - best_ratio = -1 - best_scale = None - n_grid = 20 - - for ratio in range(n_grid): - ratio = ratio * 1 / n_grid - loss = 0 - for node in nodes: - if weight_config.get(node.name, {}) == "fp32": - continue - - weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir) - if len(weight.shape) != 2: - continue - - org_out = np.matmul(inp, weight) - org_w_shape = weight.shape - group_size = group_size if group_size != -1 else org_w_shape[0] - - w_scale = _get_weight_scale(weight.T, weight.shape[0]) - scales = np.clip(np.power(inp_scale, ratio) / np.power(w_scale, (1 - ratio)), 1e-4, None) - scales = scales / np.sqrt(np.max(scales) * np.min(scales)) - weight = weight.T * scales - weight = pad_tensor(weight, group_size, (org_w_shape[0] + group_size - 1) // group_size).T - - if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint") / np.expand_dims( - scales, axis=-1 - ) - else: - q_weight = qdq_tensor(weight, num_bits, group_size, scheme, "int") / np.expand_dims(scales, axis=-1) - - q_weight = np.reshape(q_weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - out = np.matmul(inp, q_weight.T) - loss += np.mean(np.power((org_out - out), 2)) - - is_best = loss < best_error - if is_best: - best_error = loss - best_ratio = ratio - best_scale = scales - - for node in nodes: - weight_config.setdefault(node.name, {}).update({"bits": num_bits}) - weight_config.setdefault(node.name, {}).update({"group_size": group_size}) - weight_config.setdefault(node.name, {}).update({"scheme": scheme}) - - init_share_num = model.get_initializer_share_num(node.input[1]) - weight_tensor = model.get_initializer(node.input[1]) - tensor = onnx.numpy_helper.to_array(weight_tensor, base_dir) - dtype = tensor.dtype - tensor = tensor.T * best_scale - tensor = (tensor.T).astype(dtype) - - new_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scaled", - data_type=dtype_mapping[str(dtype)], - dims=tensor.shape, - vals=tensor.tobytes(), - raw=True, - ) - model.add_initializer(new_tensor) - node.input[1] = new_tensor.name - - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - parent = model.get_node(parent) - if parent.name in updated_nodes: - continue - - if parent.op_type in ["LayerNormalization", "BatchNormalization", "InstanceNormalization"] and len( - model.input_name_to_nodes()[nodes[0].input[0]] - ) == len(nodes): - for idx in [1, 2]: - tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[idx]), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(parent.input[idx], new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - elif ( - parent.op_type in ["SimplifiedLayerNormalization", "MatMul", "Gemm", "Mul"] - and not all([model.get_initializer(inp) is None for inp in parent.input]) - and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len(nodes) - ): # pragma: no cover - for inp in parent.input: - if model.get_initializer(inp) is not 
None: - tensor = onnx.numpy_helper.to_array(model.get_initializer(inp), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(inp, new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - elif parent.op_type in ["Conv", "FusedConv"] and len(model.input_name_to_nodes()[nodes[0].input[0]]) == len( - nodes - ): # pragma: no cover - tensor = onnx.numpy_helper.to_array(model.get_initializer(parent.input[2]), base_dir) - dtype = tensor.dtype - new_tensor = tensor / np.reshape(best_scale, (1, -1)) - model.set_initializer(parent.input[2], new_tensor.astype(dtype), raw=True) - updated_nodes.append(parent.name) - output_dicts[parent.output[0]] = output_dicts[parent.output[0]] / np.reshape(best_scale, (1, -1)) - - else: # pragma: no cover - # insert mul - scale_tensor = onnx.helper.make_tensor( - name=parent.output[0] + "_weight_only_scale", - data_type=dtype_mapping[str(dtype)], - dims=best_scale.shape, - vals=(1.0 / best_scale).flatten().tolist(), - ) - new_init_tensors.append(scale_tensor) - mul_output_name = parent.output[0] + "_weight_only_out" - mul_node = onnx.helper.make_node( - "Mul", - inputs=[nodes[0].input[0], scale_tensor.name], - outputs=[mul_output_name], - name=nodes[0].input[0] + "_weight_only_mul", - ) - new_added_mul_nodes.append(mul_node) - for node in nodes: - replace_input.append([node, node.input[0], mul_node.output[0]]) - updated_nodes.append(parent.name) - output_dicts[mul_node.output[0]] = output_dicts[mul_node.input[0]] / np.reshape(best_scale, (1, -1)) - - model.add_nodes(new_added_mul_nodes) - model.add_initializers(new_init_tensors) - for node, old_input_name, new_input_name in replace_input: - model.replace_node_input(node, old_input_name, new_input_name) - - return model, output_dicts - - -def _apply_awq_clip(model, weight_config, absorb_pairs, output_dicts, num_bits, group_size, scheme): - """Apply clip for weight by checking mse.""" - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - ratios = {} - for parent, nodes in absorb_pairs.items(): - if any([node.input[0] not in output_dicts for node in nodes]): - logger.warning( - "Miss input tensors of nodes {} during AWQ, skip it!".format( - ", ".join([node.name for node in nodes if node.input[0] not in output_dicts]) - ) - ) - continue - - inp = np.concatenate(output_dicts[nodes[0].input[0]], axis=0) - - for node in nodes: - if node.name in weight_config: - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - scheme = weight_config[node.name]["scheme"] - - org_weight = onnx.numpy_helper.to_array(model.get_initializer(node.input[1]), base_dir=base_dir) - org_w_shape = org_weight.shape # ic, oc - group_size = group_size if group_size != -1 else org_w_shape[0] - org_out = np.matmul(inp, org_weight) # n_token, oc - - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - org_weight = pad_tensor(org_weight, group_size, k_blocks) - - org_weight = np.transpose(org_weight) - - best_error = float("inf") - best_ratio = 1 - for i_s in range(10): - ratio = 1 - i_s / 100 - weight = copy.deepcopy(org_weight) - if (Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4) or ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions - # 
MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1 - weight = qdq_tensor(weight, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1)) - else: - weight = qdq_tensor(weight, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1)) - weight = np.reshape(weight, (org_w_shape[1], -1))[:, : org_w_shape[0]] - cur_out = np.matmul(inp, weight.T) - loss = np.mean(np.power((org_out - cur_out), 2)) - is_best = loss < best_error - if is_best: - best_error = loss - best_ratio = ratio - ratios[node.input[1]] = best_ratio - return ratios - - -def awq_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - data_reader: CalibrationDataReader, - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], -) -> onnx.ModelProto: - """Quant the model with Activation-aware Weight quantization(AWQ) method. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - data_reader (CalibrationDataReader): data_reader for calibration. - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - } - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - enable_auto_scale (bool, optional): whether to search for best scales based on activation - distribution. Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - - Returns: - onnx.ModelProto: quantized onnx model. 
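# Illustrative sketch of the scale search that _apply_awq_scale performs for each absorbable
# group of MatMuls: walk a 20-point ratio grid, derive per-input-channel scales from activation
# and weight statistics, and keep the candidate minimizing the MSE against the float MatMul
# output (_apply_awq_clip then runs a similar search over clip ratios 1.00 down to 0.91).
# `qdq` below is an illustrative stand-in for qdq_tensor, not the module's implementation.
import numpy as np

def qdq(w, num_bits=4):
    # per-output-column asymmetric round-to-nearest quant-dequant, illustration only
    lo, hi = w.min(axis=0), w.max(axis=0)
    scale = np.where(hi > lo, (hi - lo) / (2 ** num_bits - 1), 1.0)
    zp = np.round(-lo / scale)
    return (np.clip(np.round(w / scale) + zp, 0, 2 ** num_bits - 1) - zp) * scale

def awq_scale_search(inp, weight, n_grid=20):
    # inp: (n_tokens, in_ch), weight: (in_ch, out_ch)
    org_out = inp @ weight
    inp_scale = np.mean(np.abs(inp), axis=0)                         # activation statistic
    w_scale = np.mean(np.abs(weight) / np.abs(weight).max(axis=0, keepdims=True), axis=1)
    best_scale, best_loss = None, float("inf")
    for i in range(n_grid):
        ratio = i / n_grid
        s = np.clip(inp_scale ** ratio / np.clip(w_scale ** (1 - ratio), 1e-4, None), 1e-4, None)
        s = s / np.sqrt(s.max() * s.min())                           # same normalization as above
        out = (inp / s) @ qdq(weight * s[:, None])                   # scale folded into the weight
        loss = np.mean((org_out - out) ** 2)
        if loss < best_loss:
            best_scale, best_loss = s, loss
    return best_scale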
- """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - output_dicts = {} - full_ratio = {} - - if enable_mse_search: - inputs, so = prepare_inputs(model, data_reader, providers) - del data_reader - - org_output = copy.deepcopy(model.model.graph.output) - model.remove_tensors_from_outputs([i.name for i in org_output]) - - output_names = [] - for node in model.nodes(): - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - output_names.append(node.input[0]) - output_names = list(set(output_names)) - model.add_tensors_to_outputs(output_names) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - session = ( - ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) - if not model.is_large_model - else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) - ) - - for input_name in output_names: - parent = model.output_name_to_node()[input_name] - dump_pairs = {parent.name: []} - - for node in model.input_name_to_nodes()[input_name]: - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - dump_pairs[parent.name].append(model.get_node(node.name)) - - if len(dump_pairs[parent.name]) == 0: - continue - - output_dicts = {} - for inp in inputs: - output = session.run([input_name], inp) - output_dicts.setdefault(input_name, []).append(output) - - if enable_auto_scale: - model, output_dicts = _apply_awq_scale( - model, - weight_config, - dump_pairs, - output_dicts, - num_bits, - group_size, - scheme, - ) - if enable_mse_search: - ratios = _apply_awq_clip( - model, - weight_config, - dump_pairs, - output_dicts, - num_bits, - group_size, - scheme, - ) - del output_dicts - del dump_pairs - full_ratio.update(ratios) - - model.remove_tensors_from_outputs(output_names) - model.model.graph.output.MergeFrom(org_output) - model = rtn_quantize(model, weight_config, num_bits, group_size, scheme, full_ratio, accuracy_level, providers) - return model - - -def apply_awq_on_model( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - quant_config: dict, - calibration_data_reader: CalibrationDataReader, -) -> onnx.ModelProto: - """Apply Activation-aware Weight quantization(AWQ) on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): nnx model. - quant_config (dict): quantization config. - calibration_data_reader (CalibrationDataReader): data_reader for calibration. - - Returns: - onnx.ModelProto: quantized onnx model. 
- """ - # set model params - kwargs = {} - kwargs = {key: quant_config.pop(key) for key in AWQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, AWQConfig): - quant_config[op_name_type] = op_config.to_dict() - - return awq_quantize(model, data_reader=calibration_data_reader, weight_config=quant_config, **kwargs) diff --git a/neural_compressor/onnxrt/algorithms/weight_only/gptq.py b/neural_compressor/onnxrt/algorithms/weight_only/gptq.py deleted file mode 100644 index 5a8985f1b0f..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/gptq.py +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.algorithms.weight_only.utility import ( - make_matmul_weight_only_node, - pad_tensor, - prepare_inputs, - quant_tensor, -) -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import GPTQConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - ONNXRT116_VERSION, - ONNXRT1161_VERSION, - dtype_mapping, - simple_progress_bar, -) - -__all__ = [ - "apply_gptq_on_model", - "gptq_quantize", -] - - -def _gptq( - W: np.array, - H: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - blocksize: int = 128, - percdamp: float = 0.01, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, -): - """Quant the weight with GPTQ method. - - Args: - W (np.array): weight. - H (np.array): Hessian matrix. - num_bits (int, optional): num_bits. Default is 4. - group_size (int, optional): how many elements share one scale/zp. Default is 32. - scheme (str, optional): sym or asym. Defaults to "asym". - blocksize (int, optional): blocksize to quantize weight. - percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. - actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. - mse (bool, optional): whether get scale and zero point with mse error. - perchannel (bool, optional): whether quantize weight per-channel. 
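# Illustrative sketch of how the Hessian handed to _gptq is built up in gptq_quantize's
# calibration loop further down: H is roughly 2/N * sum(x x^T) over calibration tokens,
# maintained as a running average so batches can be streamed. Names are illustrative stand-ins.
import numpy as np

def update_hessian(H, n_seen, batch):
    # batch: (n_tokens, in_ch) activations feeding the MatMul being quantized
    t = batch.shape[0]
    H = H * (n_seen / (n_seen + t))          # down-weight what has been accumulated so far
    x = np.sqrt(2.0 / (n_seen + t)) * batch
    return H + x.T @ x, n_seen + t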
- - Returns: - Q: fake quantized weight - """ - Qs = [] - maxq = 2**num_bits - 1 - grid = 100 - maxshrink = 0.8 - norm = 2.4 - - def find_params(weight): - org_shape = weight.shape - # find zp, scale - if not perchannel: - weight = np.expand_dims(weight.flatten(), axis=1) - tmp = np.zeros(weight.shape[1]) - xmin = np.minimum(np.min(weight, axis=0), tmp) - xmax = np.maximum(np.max(weight, axis=0), tmp) - if scheme == "sym": - xmax = np.maximum(np.abs(xmin), xmax) - tmp = xmin < 0 - if np.any(tmp): - xmin[tmp] = -xmax[tmp] - tmp = (xmin == 0) & (xmax == 0) - xmin[tmp] = -1 - xmax[tmp] = +1 - - scale = (xmax - xmin) / maxq - if scheme == "sym": - zero = np.ones(scale.shape) * (maxq + 1) / 2 - else: - zero = np.round(-xmin / scale) - if mse: - best = np.ones([weight.shape[1]]) * float("inf") - for i in range(int(maxshrink * grid)): - p = 1 - i / grid - xmin1 = p * xmin - xmax1 = p * xmax - scale1 = (xmax1 - xmin1) / maxq - zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero - q = np.clip(np.round(weight / scale1) + zero1, 0, maxq) - q -= weight - q = np.power(np.abs(q), norm) - err = np.sum(q, 0) - tmp = err < best - if np.any(tmp): - best[tmp] = err[tmp] - scale[tmp] = scale1[tmp] - zero[tmp] = zero1[tmp] - if not perchannel: - tmp = org_shape[1] - scale = np.repeat(scale, tmp) - zero = np.repeat(zero, tmp) - shape = [-1] + [1] * (len(org_shape) - 1) - scale = np.reshape(scale, shape) - zero = np.reshape(zero, shape) - return scale, zero - - scales = [] - zps = [] - shape = W.shape - scale, zp = find_params(W) - dead = np.diag(H) == 0 - H[dead, dead] = 1 - W[dead, :] = 0 # such channel makes no contribution to quantization computation - - # rearrange considering the diag's value - if actorder: - perm = np.argsort(np.diag(H))[::-1] - W = W[perm, :] - H = H[perm, :][:, perm] - Losses = np.zeros(W.shape) - Q = np.zeros(W.shape) - damp = percdamp * np.mean(np.diag(H)) - diag = np.arange(shape[0]) - H[diag, diag] += damp # add a average value of - H = np.linalg.cholesky(np.linalg.inv(H)).T - Hinv = H - for i1 in range(0, shape[0], blocksize): - i2 = min(i1 + blocksize, shape[0]) - count = i2 - i1 - - W1 = copy.deepcopy(W[i1:i2, :]) - Q1 = np.zeros(W1.shape) - Err1 = np.zeros(W1.shape) - Losses1 = np.zeros(W1.shape) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): # within a block, channel wise - w = W1[i, :] - d = Hinv1[i, i] - - if group_size != -1: - if (i1 + i) % group_size == 0: - scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :]) - - q = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten() - Q1[i, :] = q - Losses1[i, :] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0)) - Err1[i, :] = err1 - - Q[i1:i2, :] = Q1 - Losses[i1:i2, :] = Losses1 / 2 - - W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1) - - if actorder: - invperm = np.argsort(perm) - Q = Q[invperm, :] - - Q = np.reshape(Q, W.shape) - del W - return Q - - -def gptq_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - data_reader: CalibrationDataReader, - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - return_modelproto: bool = True, -): - """Quant the model with GPTQ method. 
- - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - data_reader (CalibrationDataReader): data_reader for calibration. - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - accuracy_level (int, optional): accuracy level. Support 0 (unset), - 1(fp32 compute type of jblas kernel), 2 (fp16 compute type of jblas kernel), - 3 (bf16 compute type of jblas kernel), 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. - Default to True - - Returns: - onnx.ModelProto: quantized onnx model - """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - - inputs, so = prepare_inputs(model, data_reader, providers) - del data_reader - org_output = copy.deepcopy(model.model.graph.output) - model.remove_tensors_from_outputs([i.name for i in org_output]) - output_names = [] - for node in model.nodes(): - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - output_names.append(node.input[0]) - output_names = list(set(output_names)) - model.add_tensors_to_outputs(output_names) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - session = ( - ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) - if not model.is_large_model - else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) - ) - - for idx, input_name in enumerate(output_names): - simple_progress_bar(len(output_names), idx + 1) - node_list = [] - weights = [] - - for node in model.input_name_to_nodes()[input_name]: - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - weight = onnx.numpy_helper.to_array( - 
model.get_initializer(model.get_node(node.name).input[1]), base_dir - ).copy() - if len(weight.shape) != 2: - continue - - weights.append(weight) - node_list.append(model.get_node(node.name)) - - if len(weights) == 0: - continue - - Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] - nsamples = 0 - for data in inputs: - inp = session.run([input_name], data)[0] - tmp = inp.shape[0] - inp = np.reshape(inp, (-1, inp.shape[-1])) - Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] - nsamples += tmp - inp = np.sqrt(2 / nsamples) * inp - Hs = [i + np.matmul(inp.T, inp) for i in Hs] - - for ( - node, - weight, - H, - ) in zip(node_list, weights, Hs): - if node.name in weight_config: - num_bits = weight_config[node.name]["bits"] - group_size = weight_config[node.name]["group_size"] - scheme = weight_config[node.name]["scheme"] - accuracy_level = weight_config[(node.name, node.op_type)].accuracy_level - group_size = group_size if group_size != -1 else weight.shape[0] - dtype = weight.dtype - - q_weight = _gptq( - weight, - H, - num_bits=num_bits, - group_size=group_size, - scheme=scheme, - blocksize=blocksize, - percdamp=percdamp, - actorder=actorder, - mse=mse, - perchannel=perchannel, - ) - - weight_tensor = model.get_initializer(node.input[1]) - init_share_num = model.get_initializer_share_num(node.input[1]) - - satisfy_MatMulNBits_condition = Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - org_shape = weight.shape - k_blocks = (org_shape[0] + group_size - 1) // group_size - q_weight = pad_tensor(q_weight, group_size, k_blocks) - q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint") - q_matmul_node, new_inits = make_matmul_weight_only_node( - node=node, - weight_shape=org_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight.astype("uint8"), - scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - model.remove_node(node) - model.add_node(q_matmul_node) - else: - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=dtype_mapping[str(dtype)], - dims=q_weight.shape, - vals=q_weight.astype(dtype).tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - model.remove_tensors_from_outputs(output_names) - model.model.graph.output.MergeFrom(org_output) - - model.topological_sort() - - # reload external data to prevent external data file path errors - if model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(model.model, os.path.split(model.model_path)[0]) - - if return_modelproto: - return model.model - else: - return model - - -def apply_gptq_on_model( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - 
quant_config: dict, - calibration_data_reader: CalibrationDataReader, -) -> onnx.ModelProto: - """Apply GPTQ on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_config (dict): quantization config. - calibration_data_reader (CalibrationDataReader): data_reader for calibration. - - Returns: - onnx.ModelProto: quantized onnx model. - """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in GPTQConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, GPTQConfig): - quant_config[op_name_type] = op_config.to_dict() - - if layer_wise: - from neural_compressor.onnxrt.algorithms import layer_wise_quant - - quantized_model = layer_wise_quant( - model, - quant_func=gptq_quantize, - weight_config=quant_config, - data_reader=calibration_data_reader, - **quant_kwargs - ) - else: - quantized_model = gptq_quantize( - model, data_reader=calibration_data_reader, weight_config=quant_config, **quant_kwargs - ) - - if isinstance(quantized_model, ONNXModel): - quantized_model = quantized_model.model - return quantized_model diff --git a/neural_compressor/onnxrt/algorithms/weight_only/rtn.py b/neural_compressor/onnxrt/algorithms/weight_only/rtn.py deleted file mode 100644 index c4ee941bf17..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/rtn.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from pathlib import Path -from typing import List, Union - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.algorithms.weight_only.utility import ( - make_matmul_weight_only_node, - pad_tensor, - qdq_tensor, - quant_tensor, -) -from neural_compressor.onnxrt.quantization.config import RTNConfig -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import ( - ONNXRT116_VERSION, - ONNXRT1161_VERSION, - dtype_mapping, - simple_progress_bar, -) - -__all__ = ["apply_rtn_on_model", "rtn_quantize"] - - -def rtn_quantize( - model: Union[onnx.ModelProto, ONNXModel, Path, str], - weight_config: dict = {}, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - ratios: dict = {}, - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - return_modelproto: bool = True, -): - """Quantize the model with round to nearst method. 
- - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model - weight_config (dict, optional): quantization config - For example, - weight_config = { - '(fc2, "MatMul")': - { - 'weight_dtype': 'int', - 'weight_bits': 4, - 'weight_group_size': 32, - 'weight_sym': True, - 'accuracy_level': 0 - } - }. Defaults to {}. - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): size of weight groups. Defaults to 32. - scheme (str, optional): indicates whether weights are symmetric. Defaults to "asym". - ratios (dict, optional): percentile of clip. Defaults to {}. - accuracy_level (int, optional): - accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): providers to use. Defaults to ["CPUExecutionProvider"]. - return_modelproto (bool, optionmal): whether to return onnx.Modelproto. set False for layer-wise quant. - Default to True - Returns: - onnx.ModelProto: quantized onnx model. - """ - if not isinstance(model, ONNXModel): - model = ONNXModel(model) - base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" - new_nodes = [] - remove_nodes = [] - total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]]) - curr_id = 0 - for node in model.nodes(): - if node.op_type in ["MatMul"]: - curr_id += 1 - simple_progress_bar(total_num, curr_id) - - # check op_type of node is MatMul - # check dim 1 of input is weight tensor - # check weight_type is not "fp32" - if ( - node.op_type in ["MatMul"] # check op_type of node is MatMul - and model.get_initializer(node.input[1]) is not None - and weight_config.get((node.name, node.op_type), {}).get("weight_dtype", "fp32") != "fp32" - ): - weight_tensor = model.get_initializer(node.input[1]) - weight = onnx.numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() - if len(weight.shape) != 2: - continue - - dtype = weight.dtype - if (node.name, node.op_type) in weight_config: - num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) - group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) - scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" - accuracy_level = weight_config[(node.name, node.op_type)].get("accuracy_level", 0) - - org_w_shape = weight.shape # ic, oc - group_size = group_size if group_size != -1 else org_w_shape[0] - - k_blocks = (org_w_shape[0] - 1) // group_size + 1 - init_share_num = model.get_initializer_share_num(node.input[1]) - - weight = pad_tensor(weight, group_size, k_blocks) - - satisfy_MatMulNBits_condition = Version(ort.__version__) > ONNXRT1161_VERSION and num_bits == 4 - satisfy_MatMulFpQ4_condition = ( - Version(ort.__version__) >= ONNXRT116_VERSION and num_bits == 4 and group_size == 32 - ) - if ("CUDAExecutionProvider" in providers and satisfy_MatMulNBits_condition) or ( - "CUDAExecutionProvider" not in providers - and (satisfy_MatMulFpQ4_condition or satisfy_MatMulNBits_condition) - ): # pragma: no cover - # MatMulFpQ4 support 4 bits and 32 group_size with ort 1.16.0 and 1.16.1 versions, supported by CPU EP - # MatMulNBits supports 4 bits and 2^n group_size with ort > 1.16.1, supported by CPU EP AND CUDA EP - q_weight, scale, zp = quant_tensor( - weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) - ) - q_matmul_node, 
new_inits = make_matmul_weight_only_node( - node=node, - weight_shape=org_w_shape, - num_bits=num_bits, - group_size=group_size, - k_blocks=k_blocks, - q_weight=q_weight.astype("uint8"), - scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, - accuracy_level=accuracy_level, - ) - - model.add_initializers(new_inits) - remove_nodes.append(node) - new_nodes.append(q_matmul_node) - else: - q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1)) - q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) - q_weight = np.transpose(q_weight) - q_weight = q_weight[: org_w_shape[0], :].astype(dtype) - q_weight_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)), - data_type=dtype_mapping[str(dtype)], - dims=weight.shape, - vals=q_weight.tobytes(), - raw=True, - ) - model.add_initializer(q_weight_tensor) - node.input[1] = q_weight_tensor.name - if init_share_num == 1: - model.remove_initializer(weight_tensor) - - model.add_nodes(new_nodes) - model.remove_nodes(remove_nodes) - model.topological_sort() - - # reload external data to prevent external data file path errors - if model.is_large_model: - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(model.model, os.path.split(model.model_path)[0]) - - if return_modelproto: - return model.model - else: - return model - - -def apply_rtn_on_model(model: Union[onnx.ModelProto, ONNXModel, Path, str], quant_config: dict) -> onnx.ModelProto: - """Apply RTN on onnx model. - - Args: - model (Union[onnx.ModelProto, ONNXModel, Path, str]): onnx model. - quant_config (dict): quantization config. - - Returns: - onnx.ModelProto: quantized onnx model. - """ - # check whether to do layer_wise quant - layer_wise = quant_config.pop("layer_wise_quant", False) - - # set other model params - quant_kwargs = {} - quant_kwargs = {key: quant_config.pop(key) for key in RTNConfig.model_params_list if key in quant_config} - - # change op config to dict type - for op_name_type, op_config in quant_config.items(): - if isinstance(op_config, RTNConfig): - quant_config[op_name_type] = op_config.to_dict() - - if layer_wise: - from neural_compressor.onnxrt.algorithms import layer_wise_quant - - quantized_model = layer_wise_quant(model, quant_func=rtn_quantize, weight_config=quant_config, **quant_kwargs) - else: - quantized_model = rtn_quantize(model, weight_config=quant_config, **quant_kwargs) - - if isinstance(quantized_model, ONNXModel): - quantized_model = quantized_model.model - return quantized_model diff --git a/neural_compressor/onnxrt/algorithms/weight_only/utility.py b/neural_compressor/onnxrt/algorithms/weight_only/utility.py deleted file mode 100644 index f69f8d57fab..00000000000 --- a/neural_compressor/onnxrt/algorithms/weight_only/utility.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 MIT HAN Lab -# This source code is licensed under the MIT license -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import struct -import sys - -import numpy as np -import onnx -import onnxruntime as ort -from packaging.version import Version - -from neural_compressor.onnxrt.utils.utility import ONNXRT1161_VERSION, dtype_mapping - -__all__ = [ - "make_matmul_weight_only_node", - "prepare_inputs", - "pad_tensor", - "quant_tensor", - "qdq_tensor", -] - - -def _get_blob_size(group_size, has_zp): # pragma: no cover - """Get blob_size. - - Args: - group_size (int): how many elements share one scale/zp - has_zp (bool): whether zero_point is None - """ - if Version(ort.__version__) > ONNXRT1161_VERSION: - blob_size = group_size // 2 - elif has_zp: - blob_size = group_size // 2 + 4 + 1 - else: - blob_size = group_size // 2 + 4 - return blob_size - - -def make_matmul_weight_only_node( - node: onnx.NodeProto, - weight_shape: tuple, - num_bits: int, - group_size: int, - k_blocks: int, - q_weight: np.array, - scale: np.array, - zero_point: np.array, - accuracy_level: int = 0, -): - """Build MatMulFpQ4/MatMulNBits node. - - Args: - node (onnx.NodeProto): original matmul node - weight_shape (tuple): original weight shape - num_bits (int): number of bits used to represent weights. - group_size (int): how many elements share one scale/zp - k_blocks (int): block number - q_weight (np.array): quantized weight - scale (np.array): scale - zero_point (np.array): zero point - accuracy_level (int, optional): accuracy level. - Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel) Defaults to 0. - - Returns: - matmul_weight_only_node: MatMulFpQ4 or MatMulNBits node - new_inits: initializers of the new node - """ - blob_size = _get_blob_size(group_size, zero_point is not None) - packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") - q_weight_name = node.input[1] + "_Q{}G{}".format(str(num_bits), str(group_size)) - input_names = [node.input[0], q_weight_name] - new_inits = [] - kwargs = {} - - if Version(ort.__version__) > ONNXRT1161_VERSION: - op_type = "MatMulNBits" - - # pack quantized weight - for i in range(q_weight.shape[0]): - for k in range(0, group_size, 2): - packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4 - packed = np.reshape(packed, (-1, k_blocks, blob_size)) - - # build scale tensor - scale = np.reshape(scale, (-1, k_blocks)) - scale_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_scale", - data_type=dtype_mapping[str(scale.dtype)], - dims=scale.shape, - vals=scale.tobytes(), - raw=True, - ) - input_names.append(scale_tensor.name) - new_inits.append(scale_tensor) - - # build zero_point tensor - if zero_point is not None: - if num_bits > 4: - packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8") - else: - packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") - for i in range(zero_point.shape[0] // k_blocks): - for j in range(k_blocks): - idx = i * k_blocks + j - zp = zero_point[idx] - packed_zp[idx // 2] = ( - ((packed_zp[idx // 2] & 0x0F) | (zp << 4)) - if (idx & 1) - else ((packed_zp[idx // 2] & 0xF0) | zp) - ) - - zp_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True - ) - input_names.append(zp_tensor.name) - new_inits.append(zp_tensor) - - # set kwargs - kwargs["K"] = weight_shape[0] - kwargs["N"] = weight_shape[1] - kwargs["bits"] = num_bits - 
kwargs["block_size"] = group_size - if accuracy_level > 0: - # require onnxruntime > 1.16.3 - kwargs["accuracy_level"] = accuracy_level - - else: - offset = 5 if zero_point is not None else 4 - op_type = "MatMulFpQ4" - - # pack quantized weight - for i in range(q_weight.shape[0]): - bf = struct.pack("f", scale[i]) - packed[i][0] = bf[0] - packed[i][1] = bf[1] - packed[i][2] = bf[2] - packed[i][3] = bf[3] - - if zero_point is not None: - packed[i][4] = zero_point[i] - - packed[i][offset:] = np.bitwise_or( - q_weight[i][: group_size // 2], np.left_shift(q_weight[i][group_size // 2 :], num_bits) - ) - packed = packed.reshape(-1) - - # build shape tensor - shape_tensor = onnx.helper.make_tensor( - name=node.input[1] + "_shape", data_type=7, dims=(2,), vals=np.array(weight_shape, dtype="int64") - ) - new_inits.append(shape_tensor) - input_names.append(shape_tensor.name) - - # set kwargs - kwargs["blk_quant_type"] = 1 if zero_point is not None else 0 - - q_weight_tensor = onnx.helper.make_tensor( - name=q_weight_name, - data_type=2, - dims=packed.shape, - vals=packed.tobytes(), - raw=True, - ) - new_inits.append(q_weight_tensor) - - matmul_weight_only_node = onnx.helper.make_node( - op_type, - inputs=input_names, - outputs=node.output, - name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), - domain="com.microsoft", - **kwargs, - ) - return matmul_weight_only_node, new_inits - - -def prepare_inputs(model, data_reader, providers): - """Prepare inputs for weight only quantization. - - Args: - model (ModelProto or ONNXModel): onnx model. - data_reader (CalibrationDataReader): a calibration data reader. - providers (list): providers to use. - - Returns: - inputs: prepared inputs. - so: session options - """ - from importlib.util import find_spec - - so = ort.SessionOptions() - if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover - from onnxruntime_extensions import get_library_path - - so.register_custom_ops_library(get_library_path()) - if model.is_large_model: - onnx.save_model( - model.model, - model.model_path + "_augment.onnx", - save_as_external_data=True, - all_tensors_to_one_file=True, - convert_attribute=False, - ) - - inputs_list = [] - while True: - inputs = data_reader.get_next() - if not inputs: - break - inputs_list.append(inputs) - return inputs_list, so - - -def pad_tensor(weight, group_size, k_blocks): - """Pad tensor rowi so that it can be is divisible by group_size. - - Args: - weight (array): weight - group_size (int): how many elements share one scale/zp - k_blocks (int): the number of block - - Returns: - weight: paded weight - """ - if group_size == -1: - return weight - - org_w_shape = weight.shape - padded_rows = k_blocks * group_size - pad_len = padded_rows - org_w_shape[0] - - if pad_len > 0: - weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant") - - return weight - - -def quant_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quantize tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 4. - scheme (str, optional): _quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. 
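# Illustrative worked example of quant_tensor's asymmetric path for num_bits=4: one group is
# mapped to the uint range [0, 15] with scale = (rmax - rmin) / 15 and
# zero_point = round(-rmin / scale). The numbers are made up for the illustration.
import numpy as np

group = np.array([-0.8, -0.1, 0.0, 0.5, 1.2, 2.4, 0.3, -0.4])
maxq, minq = 2 ** 4 - 1, 0                     # uint4 range
rmin, rmax = group.min(), group.max()          # -0.8 and 2.4
scale = (rmax - rmin) / (maxq - minq)          # 3.2 / 15
zero_point = np.round(-rmin / scale)           # round(0.8 / 0.2133...) = 4
q = np.clip(np.round(group / scale) + zero_point, minq, maxq)
dequant = scale * (q - zero_point)             # what qdq_tensor reconstructs from q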
- - Returns: - output: quantized weight - scale: scale - zero_point: zero point - """ - data = np.reshape(data, (-1, group_size)) - if scheme == "asym" or dtype == "uint": - maxq = 2**num_bits - 1 - minq = 0 - elif scheme == "sym": - maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 - minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 - - rmin = np.min(data, axis=1, keepdims=True) * ratio - rmax = np.max(data, axis=1, keepdims=True) * ratio - if scheme == "sym": - max_range = np.maximum(np.abs(rmin), np.abs(rmax)) - scale = np.ones(rmax.shape) - scale[max_range > 0] = np.array( - [float(i) / (maxq - minq) for i in (max_range[max_range > 0] * 2.0).flatten().tolist()] - ) - zero_point = ( - np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) - ) - else: - scale = np.ones(rmax.shape) - scale[rmin != rmax] = np.array( - [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] - ) - zero_point = ( - ((np.zeros(scale.shape) - rmin) / scale).round() - if dtype == "int" - else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") - ) - return np.clip((data / scale + zero_point).round(), minq, maxq), scale, zero_point - - -def qdq_tensor( - data: np.array, - num_bits: int = 4, - group_size: int = 32, - scheme: str = "asym", - dtype: str = "int", - ratio: float = 1.0, -): - """Quant dequant tensor per group. - - Args: - data (np.array): input weight - num_bits (int, optional): number of bits used to represent weights. Defaults to 4. - group_size (int, optional): how many elements share one scale/zp. Defaults to 32. - scheme (str, optional): quantization scheme. Defaults to "asym". - dtype (str, optional): data type. Defaults to "int". - ratio (float, optional): percentile of clip. Defaults to 1.0. - - Returns: - output: quant-dequant weight - """ - org_shape = data.shape - weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio) - return np.reshape(scale * (weight - zp), org_shape) diff --git a/neural_compressor/onnxrt/quantization/__init__.py b/neural_compressor/onnxrt/quantization/__init__.py deleted file mode 100644 index b3ae15f6a19..00000000000 --- a/neural_compressor/onnxrt/quantization/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from neural_compressor.onnxrt.quantization.algorithm_entry import ( - smooth_quant_entry, - rtn_quantize_entry, - gptq_quantize_entry, - awq_quantize_entry, -) -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import ( - RTNConfig, - get_default_rtn_config, - GPTQConfig, - get_default_gptq_config, - AWQConfig, - get_default_awq_config, - SmoohQuantConfig, - get_default_sq_config, -) -from neural_compressor.onnxrt.quantization.autotune import autotune, get_all_config_set - -__all__ = [ - "smooth_quant_entry", - "rtn_quantize_entry", - "gptq_quantize_entry", - "awq_quantize_entry", - "RTNConfig", - "get_default_rtn_config", - "GPTQConfig", - "get_default_gptq_config", - "AWQConfig", - "get_default_awq_config", - "SmoohQuantConfig", - "get_default_sq_config", - "get_all_config_set", - "CalibrationDataReader", - "autotune", -] diff --git a/neural_compressor/onnxrt/quantization/algorithm_entry.py b/neural_compressor/onnxrt/quantization/algorithm_entry.py deleted file mode 100644 index f86e9791605..00000000000 --- a/neural_compressor/onnxrt/quantization/algorithm_entry.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
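[Reviewer note: the public names deleted here are the ones listed in the __all__ above; a short sketch of how that surface was imported before this patch (only the variable name config is illustrative). After this change these imports no longer resolve.]

    from neural_compressor.onnxrt.quantization import (
        RTNConfig,
        get_default_rtn_config,
        autotune,
    )

    config = get_default_rtn_config()   # same as RTNConfig() with its documented defaults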
- -import tempfile -from pathlib import Path -from typing import Union - -import onnx -from onnxruntime.quantization import quantize - -from neural_compressor.common import Logger -from neural_compressor.common.utils import AWQ, GPTQ, RTN, SMOOTH_QUANT -from neural_compressor.onnxrt.algorithms import Smoother -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import AWQConfig, GPTQConfig, RTNConfig, SmoohQuantConfig -from neural_compressor.onnxrt.utils.utility import register_algo - -logger = Logger().get_logger() - -__all__ = [ - "smooth_quant_entry", - "rtn_quantize_entry", - "gptq_quantize_entry", - "awq_quantize_entry", -] - - -###################### SmoothQuant Entry ################################## -@register_algo(name=SMOOTH_QUANT) -def smooth_quant_entry( - model: Union[Path, str], - quant_config: SmoohQuantConfig, - calibration_data_reader: CalibrationDataReader, - *args, - **kwargs -) -> onnx.ModelProto: - """Apply smooth quant.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - # smooth operation - calibration_data_reader.rewind() - smoother = Smoother( - model, - calibration_data_reader, - providers=quant_config.providers, - ) - smoothed_model = smoother.transform(**quant_config.to_dict()) - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # ORT quant API requires str input - onnx.save_model( - smoothed_model, - Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location="smooth.onnx_data", - size_threshold=1024, - convert_attribute=False, - ) - - # quant operation - calibration_data_reader.rewind() - - # exclude Mul operations which are inserted during smooth operation - excluded_nodes = [i.name for i in smoothed_model.graph.node if i.name.endswith("_smooth_mul")] - quant_config.calibration_data_reader = calibration_data_reader - quant_config.nodes_to_exclude.extend(excluded_nodes) - quant_config.convert_to_ort_config() - - quantize( - Path(tmp_dir).joinpath("smooth.onnx").as_posix(), - Path(tmp_dir).joinpath("quant_model.onnx").as_posix(), - quant_config, - ) - model = onnx.load(Path(tmp_dir).joinpath("quant_model.onnx").as_posix()) - - return model - - -###################### RTN Algo Entry ################################## -@register_algo(name=RTN) -def rtn_quantize_entry(model: Union[Path, str], quant_config: RTNConfig, *args, **kwargs) -> onnx.ModelProto: - """The main entry to apply rtn quantization.""" - from neural_compressor.onnxrt.algorithms import apply_rtn_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - model = apply_rtn_on_model(model, configs_mapping) - return model - - -###################### GPTQ Algo Entry ################################## -@register_algo(name=GPTQ) -def gptq_quantize_entry( - model: Union[Path, str], quant_config: GPTQConfig, calibration_data_reader: CalibrationDataReader, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply gptq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, 
CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - from neural_compressor.onnxrt.algorithms import apply_gptq_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = apply_gptq_on_model(model, configs_mapping, calibration_data_reader) - return model - - -###################### AWQ Algo Entry ################################## -@register_algo(name=AWQ) -def awq_quantize_entry( - model: Union[Path, str], quant_config: AWQConfig, calibration_data_reader: CalibrationDataReader, *args, **kwargs -) -> onnx.ModelProto: - """The main entry to apply awq quantization.""" - assert calibration_data_reader is not None, "Please provide calibration_data_reader" - assert isinstance( - calibration_data_reader, CalibrationDataReader - ), "Please follow neural_compressor/onnxrt/quantization/calibrate.py to implement calibration_data_reader" - - from neural_compressor.onnxrt.algorithms import apply_awq_on_model - - # map config to each op - model_info = quant_config.get_model_info(model=model) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.debug(configs_mapping) - - # regenerate to ensure data exists - calibration_data_reader.rewind() - model = apply_awq_on_model(model, configs_mapping, calibration_data_reader) - return model diff --git a/neural_compressor/onnxrt/quantization/autotune.py b/neural_compressor/onnxrt/quantization/autotune.py deleted file mode 100644 index 7cddcc3a8b3..00000000000 --- a/neural_compressor/onnxrt/quantization/autotune.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tempfile -from pathlib import Path -from typing import Any, Callable, List, Optional, Tuple, Union - -import onnx - -from neural_compressor.common import logger -from neural_compressor.common.base_config import BaseConfig, get_all_config_set_from_config_registry -from neural_compressor.common.base_tuning import EvaluationFuncWrapper, TuningConfig, init_tuning -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import FRAMEWORK_NAME -from neural_compressor.onnxrt.quantization.quantize import _quantize - -__all__ = [ - "autotune", - "get_all_config_set", -] - - -def get_all_config_set() -> Union[BaseConfig, List[BaseConfig]]: - return get_all_config_set_from_config_registry(fwk_name=FRAMEWORK_NAME) - - -def autotune( - model_input: Union[Path, str], - tune_config: TuningConfig, - eval_fn: Callable, - eval_args: Optional[Tuple[Any]] = None, - calibration_data_reader: CalibrationDataReader = None, -) -> Union[None, onnx.ModelProto]: - """The main entry of auto-tune. 
- - Args: - model_input (Union[Path, str]): onnx model path. - tune_config (TuningConfig): tuning config. - TuningConfig is created with algorithm configs, parameters supported tuning are in their params_list. - Support: - Expand parameters to a list of parameters like TuningConfig(config_set=[RTNConfig(weight_bits=[4, 8])]) - Pass a list of configs like TuningConfig(config_set=[RTNConfig(), GPTQConfig()]) - eval_fn (Callable): evaluate function. - During evaluation, autotune will only pass model path as the input of function. - eval_args (Optional[Tuple[Any]]): evaluate arguments. - Positional arguments for `eval_fn`. - - calibration_data_reader (CalibrationDataReader): dataloader for calibration. - """ - best_quant_model = None - eval_func_wrapper = EvaluationFuncWrapper(eval_fn, eval_args) - config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) - try: - baseline: float = eval_func_wrapper.evaluate(model_input) - except Exception as e: - print(e) - if "'str' object has no attribute 'SerializeToString'" in str(e): - logger.warning("Please refine your eval_fn to accept model path (str) as input.") - exit(0) - tuning_monitor.set_baseline(baseline) - tuning_logger.tuning_start() - for trial_index, quant_config in enumerate(config_loader): - if calibration_data_reader is not None: - calibration_data_reader.rewind() - tuning_logger.trial_start(trial_index=trial_index) - tuning_logger.execution_start() - logger.debug("quant config: {}".format(quant_config)) - q_model = _quantize(model_input, quant_config=quant_config, calibration_data_reader=calibration_data_reader) - tuning_logger.execution_end() - tuning_logger.evaluation_start() - with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir: - # evaluate API requires str input - onnx.save_model( - q_model, - Path(tmp_dir).joinpath(Path(model_input).name).as_posix(), - save_as_external_data=True, - all_tensors_to_one_file=True, - location=Path(model_input).with_suffix(Path(model_input).suffix + "_data").name, - size_threshold=1024, - convert_attribute=False, - ) - # copy config.json to tmp dir for evaluation, LLMs evaluation may need it - if isinstance(model_input, str) and os.path.exists( - Path(model_input).parent.joinpath("config.json").as_posix() - ): - import shutil - - shutil.copyfile( - Path(model_input).parent.joinpath("config.json").as_posix(), - Path(tmp_dir).joinpath("config.json").as_posix(), - ) - eval_result: float = eval_func_wrapper.evaluate(Path(tmp_dir).joinpath(Path(model_input).name).as_posix()) - tuning_logger.evaluation_end() - logger.info("Evaluation result: %.4f", eval_result) - tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) - tuning_logger.trial_end(trial_index) - if tuning_monitor.need_stop(): - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - best_quant_model = _quantize( - model_input, quant_config=best_quant_config, calibration_data_reader=calibration_data_reader - ) - break - tuning_logger.tuning_end() - return best_quant_model diff --git a/neural_compressor/onnxrt/quantization/calibrate.py b/neural_compressor/onnxrt/quantization/calibrate.py deleted file mode 100644 index 1ba32672728..00000000000 --- a/neural_compressor/onnxrt/quantization/calibrate.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc - -from onnxruntime.quantization import CalibrationDataReader as ORTCalibrationDataReader - -__all__ = ["CalibrationDataReader"] - - -class CalibrationDataReader(ORTCalibrationDataReader): - """Get data for calibration. - - We define our CalibrationDataReader based on the class in below link: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/calibrate.py#L139 - """ - - @abc.abstractmethod - def rewind(self): - """Regenerate data.""" - raise NotImplementedError diff --git a/neural_compressor/onnxrt/quantization/config.py b/neural_compressor/onnxrt/quantization/config.py deleted file mode 100644 index 88a0a56171f..00000000000 --- a/neural_compressor/onnxrt/quantization/config.py +++ /dev/null @@ -1,614 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
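[Reviewer note: the abstract CalibrationDataReader removed above only adds rewind() on top of onnxruntime's reader, which drives iteration through get_next(). A minimal subclass might have looked like the sketch below; the class name, the "input_ids" input name, and the shapes are placeholders, not taken from the patch.]

    import numpy as np
    from neural_compressor.onnxrt.quantization import CalibrationDataReader

    class RandomDataReader(CalibrationDataReader):
        def __init__(self, num_samples=8):
            self._data = [
                {"input_ids": np.random.randint(0, 100, size=(1, 32), dtype=np.int64)}
                for _ in range(num_samples)
            ]
            self._iter = iter(self._data)

        def get_next(self):
            # onnxruntime pulls batches until None is returned
            return next(self._iter, None)

        def rewind(self):
            # required by the removed abstract class: make the data iterable again
            self._iter = iter(self._data)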
- -import re -from collections import OrderedDict -from enum import Enum -from pathlib import Path -from typing import Callable, List, NamedTuple, Union - -import numpy as np -import onnx -from onnxruntime.quantization.calibrate import CalibrationMethod -from onnxruntime.quantization.quant_utils import QuantFormat, QuantType -from onnxruntime.quantization.quantize import StaticQuantConfig - -from neural_compressor.common import Logger -from neural_compressor.common.base_config import BaseConfig, register_config, register_supported_configs_for_fwk -from neural_compressor.common.utils import AWQ, DEFAULT_WHITE_LIST, GPTQ, OP_NAME_OR_MODULE_TYPE, RTN, SMOOTH_QUANT -from neural_compressor.onnxrt.utils import PRIORITY_AWQ, PRIORITY_GPTQ, PRIORITY_RTN, PRIORITY_SMOOTH_QUANT - -logger = Logger().get_logger() - -__all__ = [ - "FRAMEWORK_NAME", - "RTNConfig", - "get_default_rtn_config", - "GPTQConfig", - "get_default_gptq_config", - "AWQConfig", - "get_default_awq_config", - "SmoohQuantConfig", - "get_default_sq_config", -] - -FRAMEWORK_NAME = "onnxrt" - - -class _OperatorConfig(NamedTuple): - config: BaseConfig - operators: List[Union[str, Callable]] - valid_func_list: List[Callable] = [] - - -######################## RNT Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN, priority=PRIORITY_RTN) -class RTNConfig(BaseConfig): - """Config class for round-to-nearest weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "providers", - "layer_wise_quant", - ] - name: str = RTN - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init RTN weight-only quantization config. - - Args: - weight_dtype (str, optional): Data type for weights, default is "int". - weight_bits (int, optional): Number of bits used to represent weights, default is 4. - weight_group_size (int, optional): Size of weight groups, default is 32. - weight_sym (bool, optional): Indicates whether weights are symmetric, default is True. - act_dtype (str, optional): Data type for activations, default is "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_rtn_config = RTNConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_rtn_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: List[BaseConfig] = None, model_info: list = None): - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return RTNConfig(weight_bits=[4, 8], weight_sym=[True, False]) - - -def get_default_rtn_config() -> RTNConfig: - """Generate the default rtn config. - - Returns: - the default rtn config. 
- """ - return RTNConfig() - - -######################## GPTQ Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=GPTQ, priority=PRIORITY_GPTQ) -class GPTQConfig(BaseConfig): - """Config class for gptq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "percdamp", - "blocksize", - "actorder", - "mse", - "perchannel", - "providers", - "layer_wise_quant", - ] - name: str = GPTQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - percdamp: float = 0.01, - blocksize: int = 128, - actorder: bool = False, - mse: bool = False, - perchannel: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - layer_wise_quant: bool = False, - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init GPTQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - percdamp (float, optional): percentage of Hessian's diagonal values' average, which will be added - to Hessian's diagonal to increase numerical stability. Defaults to 0.01. - blocksize (int, optional): execute GPTQ quantization per block. Defaults to 128. - actorder (bool, optional): whether to sort Hessian's diagonal values to rearrange channel-wise - quantization order. Defaults to False. - mse (bool, optional): whether get scale and zero point with mse error. Defaults to False. - perchannel (bool, optional): whether quantize weight per-channel. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - layer_wise_quant (bool, optional): whether to quantize model layer by layer to save memory footprint. - Check below link for details - https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md, - default is False. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.percdamp = percdamp - self.blocksize = blocksize - self.actorder = actorder - self.mse = mse - self.perchannel = perchannel - self.providers = providers - self.layer_wise_quant = layer_wise_quant - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_gptq_config = GPTQConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_gptq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return GPTQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - actorder=[True, False], - mse=[True, False], - perchannel=[True, False], - ) - - -def get_default_gptq_config() -> GPTQConfig: - """Generate the default gptq config. - - Returns: - the default gptq config. 
- """ - return GPTQConfig() - - -######################## AWQ Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=AWQ, priority=PRIORITY_AWQ) -class AWQConfig(BaseConfig): - """Config class for awq weight-only quantization.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - "weight_dtype", - "weight_bits", - "weight_group_size", - "weight_sym", - "act_dtype", - "accuracy_level", - ] - model_params_list: List[str] = [ - "enable_auto_scale", - "enable_mse_search", - "providers", - ] - name: str = AWQ - - def __init__( - self, - weight_dtype: str = "int", - weight_bits: int = 4, - weight_group_size: int = 32, - weight_sym: bool = True, - act_dtype: str = "fp32", - accuracy_level: int = 0, - enable_auto_scale: bool = True, - enable_mse_search: bool = True, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - ): - """Init AWQ weight-only quantization config. - - Args: - weight_dtype (str, optional): data type for weights. Defaults to "int". - weight_bits (int, optional): number of bits used to represent weights. Defaults to 4. - weight_group_size (int, optional): size of weight groups. Defaults to 32. - weight_sym (bool, optional): indicates whether weights are symmetric. Defaults to True. - act_dtype (str, optional): data type for activations. Defaults to "fp32". - accuracy_level (int, optional): accuracy level. Support 0 (unset), 1(fp32 compute type of jblas kernel), - 2 (fp16 compute type of jblas kernel), 3 (bf16 compute type of jblas kernel), - 4 (int8 compute type of jblas kernel). Defaults to 0. - enable_auto_scale (bool, optional): whether to search for best scales based on activation distribution. - Defaults to True. - enable_mse_search (bool, optional): whether to search for the best clip range from range - [0.91, 1.0, 0.01]. Defaults to True. - providers (list, optional): execution providers to use. Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- """ - super().__init__(white_list=white_list) - self.weight_bits = weight_bits - self.weight_dtype = weight_dtype - self.weight_group_size = weight_group_size - self.weight_sym = weight_sym - self.act_dtype = act_dtype - self.accuracy_level = accuracy_level - self.enable_auto_scale = enable_auto_scale - self.enable_mse_search = enable_mse_search - self.providers = providers - self._post_init() - - def get_model_params_dict(self): - result = dict() - for param in self.model_params_list: - result[param] = getattr(self, param) - return result - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - linear_awq_config = AWQConfig( - weight_dtype=["int"], - weight_bits=[4, 3, 8], - weight_group_size=[32, -1, 1, 16, 64, 128, 256, 512, 1024], - weight_sym=[True, False], - act_dtype=["fp32"], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - operators = ["MatMul"] - supported_configs.append(_OperatorConfig(config=linear_awq_config, operators=operators)) - cls.supported_configs = supported_configs - - def to_config_mapping(self, config_list: list = None, model_info: list = None) -> OrderedDict: - config_mapping = OrderedDict() - if config_list is None: - config_list = [self] - for config in config_list: - # update model level setting - config_mapping.update(config.get_model_params_dict()) - - # update node level setting - global_config = config.global_config - op_type_config_dict, op_name_config_dict = config._get_op_name_op_type_config() - for op_name, op_type in model_info: - if self.global_config is not None: - config_mapping[(op_name, op_type)] = global_config - if op_type in op_type_config_dict: - config_mapping[(op_name, op_type)] = op_name_config_dict[op_type] - for op_name_pattern in op_name_config_dict: - if re.match(op_name_pattern, op_name): - config_mapping[(op_name, op_type)] = op_name_config_dict[op_name_pattern] - return config_mapping - - @staticmethod - def get_model_info(model: Union[onnx.ModelProto, Path, str]) -> list: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model, load_external_data=False) - white_list = ["MatMul"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "AWQConfig", List["AWQConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return AWQConfig( - weight_bits=[4, 8], - weight_sym=[True, False], - enable_auto_scale=[True, False], - enable_mse_search=[True, False], - ) - - -def get_default_awq_config() -> AWQConfig: - """Generate the default awq config. - - Returns: - the default awq config. 
- """ - return AWQConfig() - - -######################## SmoohQuant Config ############################### - - -@register_config(framework_name=FRAMEWORK_NAME, algo_name=SMOOTH_QUANT, priority=PRIORITY_SMOOTH_QUANT) -class SmoohQuantConfig(BaseConfig, StaticQuantConfig): - """Smooth quant quantization config.""" - - supported_configs: List[_OperatorConfig] = [] - params_list: List[str] = [ - # smooth parameters - "alpha", - "folding", - "auto_alpha_args", - "calib_iter", - "scales_per_op", - ] - name: str = SMOOTH_QUANT - - def __init__( - self, - alpha: float = 0.5, - folding: bool = True, - op_types: List[str] = ["Gemm", "Conv", "MatMul", "FusedConv"], - calib_iter: int = 100, - scales_per_op: bool = True, - auto_alpha_args: dict = {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}, - providers: List[str] = ["CPUExecutionProvider"], - white_list: List[OP_NAME_OR_MODULE_TYPE] = DEFAULT_WHITE_LIST, - **kwargs, - ): - """Init smooth quant config. - - Args: - alpha (float, optional): alpha value to balance the quantization difficulty of activation and weight. - Defaults to 0.5. - folding (bool, optional): whether fold those foldable Mul which are inserted for smooth quant. - Defaults to True. - op_types (list, optional): the op type to be smooth quantized. - Defaults to ["Gemm", "Conv", "MatMul", "FusedConv"]. - calib_iter (int, optional): iteration num for calibration. Defaults to 100. - scales_per_op (bool, optional): True, each op will have an individual scale, mainlyfor accuracy. - False, ops with the same input will share a scale, mainly for performance. Defaults to True. - auto_alpha_args (dict, optional): settings for alpha tuning. - Defaults to {"alpha_min": 0.3, "alpha_max": 0.7, "alpha_step": 0.05, "attn_method": "min"}. - providers (list, optional): providers used for inference. - Defaults to ["CPUExecutionProvider"]. - white_list (list, optional): op in white_list will be applied current config. - Defaults to DEFAULT_WHITE_LIST. 
- kwargs (dict): kwargs in below link are supported except calibration_data_reader: - https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/quantize.py#L78 - """ - BaseConfig.__init__(self) - kwargs.update({"calibration_data_reader": None}) - StaticQuantConfig.__init__(self, **kwargs) - self.alpha = alpha - self.folding = folding - self.op_types = op_types - self.calib_iter = calib_iter - self.scales_per_op = scales_per_op - self.auto_alpha_args = auto_alpha_args - self.providers = providers - self.white_list = white_list - self.weight_type = self.weight_type.value if isinstance(self.weight_type, Enum) else self.weight_type - self.activation_type = ( - self.activation_type.value if isinstance(self.activation_type, Enum) else self.activation_type - ) - self.calibrate_method = ( - self.calibrate_method.value if isinstance(self.calibrate_method, Enum) else self.calibrate_method - ) - self.quant_format = self.quant_format.value if isinstance(self.quant_format, Enum) else self.quant_format - self._post_init() - - @classmethod - def register_supported_configs(cls) -> List[_OperatorConfig]: - supported_configs = [] - smooth_quant_config = SmoohQuantConfig() - operators = ["Gemm", "Conv", "MatMul", "FusedConv"] - supported_configs.append(_OperatorConfig(config=smooth_quant_config, operators=operators)) - cls.supported_configs = supported_configs - - @staticmethod - def get_model_info(model) -> list: - white_list = ["Gemm", "Conv", "MatMul", "FusedConv"] - filter_result = [] - for node in model.graph.node: - if node.op_type in white_list: - pair = (node.name, node.op_type) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "SmoohQuantConfig", List["SmoohQuantConfig"]]: # pragma: no cover - # TODO fwk owner needs to update it. - return SmoohQuantConfig(alpha=np.arange(0.3, 0.7, 0.05)) - - def convert_to_ort_config(self): - self.activation_type = QuantType(self.activation_type) - self.weight_type = QuantType(self.weight_type) - self.weight_type = QuantType(self.weight_type) - self.calibrate_method = CalibrationMethod(self.calibrate_method) - self.quant_format = QuantFormat(self.quant_format) - - -def get_default_sq_config() -> SmoohQuantConfig: - """Generate the default smooth quant config. - - Returns: - the default smooth quant config. - """ - return SmoohQuantConfig() - - -##################### Algo Configs End ################################### - - -register_supported_configs_for_fwk(fwk_name=FRAMEWORK_NAME) diff --git a/neural_compressor/onnxrt/quantization/quantize.py b/neural_compressor/onnxrt/quantization/quantize.py deleted file mode 100644 index eee9f3162f1..00000000000 --- a/neural_compressor/onnxrt/quantization/quantize.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
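[Reviewer note: as a usage sketch for the config removed above (the class name really is spelled SmoohQuantConfig in this code base), construction looked roughly like this. The values shown are just its documented defaults, the variable names are illustrative, and any extra keyword argument was forwarded to onnxruntime's StaticQuantConfig as shown in __init__ above.]

    from neural_compressor.onnxrt.quantization import SmoohQuantConfig, get_default_sq_config

    sq_config = SmoohQuantConfig(
        alpha=0.5,           # balance between activation and weight quantization difficulty
        folding=True,        # fold the inserted smoothing Mul nodes where possible
        scales_per_op=True,  # per-op scales favor accuracy over a shared scale
    )
    default_config = get_default_sq_config()   # equivalent to SmoohQuantConfig()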
- -from pathlib import Path -from typing import Union - -import onnx - -from neural_compressor.common import Logger -from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry -from neural_compressor.common.utils import log_process -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader -from neural_compressor.onnxrt.quantization.config import FRAMEWORK_NAME -from neural_compressor.onnxrt.utils.utility import algos_mapping - -logger = Logger().get_logger() - - -def _need_apply(quant_config: BaseConfig, algo_name): - return quant_config.name == algo_name if hasattr(quant_config, "name") else False - - -# * only for internal usage now -@log_process() -def _quantize( - model_input: Union[Path, str], - quant_config: BaseConfig, - calibration_data_reader: CalibrationDataReader = None, -) -> onnx.ModelProto: - """The main entry to quantize a model. - - Args: - model_input (Union[Path, str]): Path or str to the model to quantize. - quant_config (BaseConfig): a quantization configuration. - calibration_data_reader (CalibrationDataReader, optional): dataloader for calibration. - Defaults to None. - - Returns: - onnx.ModelProto: The quantized model. - """ - registered_configs = config_registry.get_cls_configs() - if isinstance(quant_config, dict): - quant_config = ComposableConfig.from_dict(quant_config, config_registry=registered_configs[FRAMEWORK_NAME]) - logger.info(f"Parsed a config dict to construct the quantization config: {quant_config}.") - else: - assert isinstance( - quant_config, BaseConfig - ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info(f"Quantize model with config: \n {quant_config} \n") - - # select quantization algo according to config - for algo_name, algo_func in algos_mapping.items(): - if _need_apply(quant_config, algo_name): - logger.info(f"Start to apply {algo_name} on the model.") - q_model = algo_func(model_input, quant_config, calibration_data_reader=calibration_data_reader) - return q_model diff --git a/neural_compressor/onnxrt/utils/__init__.py b/neural_compressor/onnxrt/utils/__init__.py deleted file mode 100644 index 813fc93ab5a..00000000000 --- a/neural_compressor/onnxrt/utils/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
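[Reviewer note: the _quantize() entry removed above selects an algorithm purely by name: each entry function registers itself in algos_mapping (defined in the removed utils/utility.py, not shown in this hunk), and the config's name attribute decides which entry runs. Below is a stripped-down illustration of that pattern, under the assumption that register_algo is a plain dict-registering decorator; the entry body and _DummyConfig are placeholders.]

    algos_mapping = {}

    def register_algo(name):
        # assumed shape of the removed helper: store the entry under its algorithm name
        def decorator(fn):
            algos_mapping[name] = fn
            return fn
        return decorator

    @register_algo(name="RTN")
    def rtn_quantize_entry(model_input, quant_config, **kwargs):
        return model_input        # the real entry applies RTN weight-only quantization

    class _DummyConfig:
        name = "RTN"

    quant_config = _DummyConfig()
    for algo_name, algo_func in algos_mapping.items():
        if getattr(quant_config, "name", None) == algo_name:
            q_model = algo_func("model.onnx", quant_config)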
- -from neural_compressor.onnxrt.utils.onnx_model import ONNXModel -from neural_compressor.onnxrt.utils.utility import PRIORITY_RTN, PRIORITY_GPTQ, PRIORITY_AWQ, PRIORITY_SMOOTH_QUANT - -__all__ = [ - "ONNXModel", - "PRIORITY_RTN", - "PRIORITY_GPTQ", - "PRIORITY_AWQ", - "PRIORITY_SMOOTH_QUANT", -] diff --git a/neural_compressor/onnxrt/utils/onnx_model.py b/neural_compressor/onnxrt/utils/onnx_model.py deleted file mode 100644 index 801416f7f64..00000000000 --- a/neural_compressor/onnxrt/utils/onnx_model.py +++ /dev/null @@ -1,1082 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Class for ONNX model.""" - -import os -import sys -from pathlib import Path - -import onnx -from onnxruntime.quantization.onnx_model import ONNXModel as ORTONNXModel - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - -__all__ = ["ONNXModel"] - - -class ONNXModel(ORTONNXModel): - """Build ONNX model.""" - - def __init__(self, model, **kwargs): - """Initialize an ONNX model. - - Args: - model (str or ModelProto): path to onnx model or loaded ModelProto model object. - """ - self.model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) - super().__init__(self.model) - - self._model_path = None if not isinstance(model, str) else model - self.check_is_large_model() - if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False): - logger.warning("Model size > 2GB. 
Please use model path instead of onnx model object to quantize") - - if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True): - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model, os.path.dirname(self._model_path)) - - self._config = None - if isinstance(model, str) and os.path.exists(Path(model).parent.joinpath("config.json").as_posix()): - from transformers import PretrainedConfig - - self._config = PretrainedConfig.from_pretrained(Path(model).parent.as_posix()) - self.node_name_counter = {} - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() - self._graph_info = {} - self._get_graph_info() - self._q_config = None - - @property - def model_path(self): - """Return model path.""" - return self._model_path - - @model_path.setter - def model_path(self, path): - """Set model path.""" - self._model_path = path - - def check_is_large_model(self): - """Check model > 2GB.""" - from neural_compressor.onnxrt.utils.utility import MAXIMUM_PROTOBUF - - init_size = 0 - for init in self.model.graph.initializer: - # if initializer has external data location, return True - if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: - self._is_large_model = True - return - # if raise error of initializer size > 2GB, return True - try: - init_bytes = init.SerializeToString() - init_size += sys.getsizeof(init_bytes) - except Exception as e: - if "exceeds maximum protobuf size of 2GB" in str(e): - self._is_large_model = True - return - else: # pragma: no cover - raise e - if init_size > MAXIMUM_PROTOBUF: - self._is_large_model = True - return - self._is_large_model = False - - @property - def is_large_model(self): - """Check the onnx model is over 2GB.""" - return self._is_large_model - - def framework(self): - """Return framework.""" - return "onnxruntime" - - def add_initializers(self, tensors): - """Add initializers to model.""" - for tensor in tensors: - self.add_initializer(tensor) - - @property - def q_config(self): - """Return q_config.""" - return self._q_config - - @q_config.setter - def q_config(self, q_config): - """Set q_config.""" - self._q_config = q_config - - @property - def hf_config(self): - """Return huggingface config if model is Transformer-based.""" - return self._config - - def input(self): - """Return input of model.""" - return [i.name for i in self.model.graph.input] - - def output(self): - """Return output of model.""" - return [i.name for i in self.model.graph.output] - - def update(self): - """Update model info.""" - self._graph_info = {} - self._get_graph_info() - self._output_name_to_node = self.output_name_to_node() - self._input_name_to_nodes = self.input_name_to_nodes() - - @property - def graph_info(self): - """Return ORT Graph Info object holding information about backend graph.""" - return self._graph_info - - def _get_graph_info(self): - """Update graph info.""" - for node in self.model.graph.node: - self.graph_info.update({node.name: node.op_type}) - - def save(self, root): - """Save ONNX model.""" - if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): - raise ValueError('"root" directory does not exists.') - if self.is_large_model: # pragma: no cover - from onnx.external_data_helper import load_external_data_for_model - - load_external_data_for_model(self.model, os.path.split(self._model_path)[0]) - onnx.save_model( - self.model, - root, - 
save_as_external_data=True, - all_tensors_to_one_file=True, - location=root.split("/")[-1] + "_data", - size_threshold=1024, - convert_attribute=False, - ) - else: - onnx.save(self.model, root) - - if self._config is not None: - model_type = "" if not hasattr(self._config, "model_type") else getattr(self._config, "model_type") - setattr(self._config.__class__, "model_type", model_type) - output_config_file = Path(root).parent.joinpath("config.json").as_posix() - self._config.to_json_file(output_config_file, use_diff=False) - - def get_initializer_share_num(self, name): - """Get the number of shares of initializer.""" - num = 0 - if self.get_initializer(name) is None: - return num - - for node in self.nodes(): - if name in node.input: - num += 1 - return num - - def get_node(self, name): - """Get a node by name.""" - for node in self.model.graph.node: - if node.name == name: - return node - return None - - def get_node_by_weight(self, weight_name): - """Get a node by its weight name.""" - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - nodes = self._input_name_to_nodes[weight_name] - if len(nodes) == 1: - return nodes[0] - elif len(nodes) == 0: - raise ValueError("{} is not used by any node in this model.".format(weight_name)) - else: - raise NotImplementedError("Models with shared weights is not supported.") - - def set_initializer(self, tensor, array, raw=False): - """Update initializer.""" - old_tensor = self.get_initializer(tensor) - self.remove_initializer(old_tensor) - dims = old_tensor.dims - data_type = old_tensor.data_type - new_tensor = ( - onnx.helper.make_tensor(tensor, data_type, dims, array.flatten().tolist()) - if not raw - else onnx.helper.make_tensor(tensor, data_type, dims, array.tostring(), raw=raw) - ) - self.add_initializer(new_tensor) - - def get_siblings(self, node): - """Get siblings nodes.""" - siblings = [] - for parent in self.get_parents(node): - for child in self.get_children(parent): - if child.name != node.name: - siblings.append(child) - return siblings - - def get_scale_zero(self, tensor): - """Help function to get scale and zero_point.""" - if not tensor.endswith("_quantized"): - logger.debug("Find {} in the quantized graph is not quantized.".format(tensor)) - return None, None - - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - - def _searcher(tensor_name): - """Search scale and zero point tensor recursively.""" - node = self._input_name_to_nodes[tensor_name][0] - parent = self._output_name_to_node[tensor_name] if tensor_name in self._output_name_to_node else None - direct_int8 = ["Reshape", "Transpose", "Squeeze", "Unsqueeze", "MaxPool", "Pad", "Split"] - if parent is not None and parent.op_type in direct_int8: - fp32_tensor_name = ( - parent.input[0] - .replace("_quantized", "") - .replace("_QuantizeLinear", "") - .replace("_QuantizeInput", "") - ) - elif node.op_type in ["Gather"]: # pragma: no cover - fp32_tensor_name = ( - node.output[0] - .replace("_quantized", "") - .replace("_QuantizeLinear", "") - .replace("_QuantizeInput", "") - ) - else: - fp32_tensor_name = ( - tensor_name.replace("_quantized", "").replace("_QuantizeLinear", "").replace("_QuantizeInput", "") - ) - scale = fp32_tensor_name + "_scale" - scale_tensor = self.get_initializer(scale) - zo = fp32_tensor_name + "_zero_point" - zo_tensor = self.get_initializer(zo) - - if scale_tensor is 
None or zo_tensor is None: - if parent is not None: - scale_tensor, zo_tensor = _searcher(parent.input[0]) - return scale_tensor, zo_tensor - - node = self._input_name_to_nodes[tensor][0] - # TODO check if scale_tensor and zero_point is needed - # for bias of qlinearconv, scale and zero_point is not needed - if (node.op_type == "QLinearConv" and tensor == node.input[-1]) or ( - node.op_type == "QGemm" and tensor == node.input[-3] - ): - return None, None - else: - scale_tensor, zo_tensor = _searcher(tensor) - assert scale_tensor, "missing scale for tensor {}".format(tensor) - assert zo_tensor, "missing zero point for tensor {}".format(tensor) - return scale_tensor, zo_tensor - - def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=[], black_optype=[]): - """Replace inputs of all nodes.""" - if len(white_optype) > 0: - for node in self.model.graph.node: - if node.op_type in white_optype: - ONNXModel.replace_node_input(node, old_input_name, new_input_name) - else: - for node in self.model.graph.node: - if node.op_type not in black_optype: - ONNXModel.replace_node_input(node, old_input_name, new_input_name) - - def replace_output_of_all_nodes(self, old_output_name, new_output_name, white_optype=[], black_optype=[]): - """Replace outputs of all nodes.""" - if len(white_optype) > 0: - for node in self.model.graph.node: - if node.op_type in white_optype: - ONNXModel.replace_node_output(node, old_output_name, new_output_name) - else: - for node in self.model.graph.node: - if node.op_type not in black_optype: - ONNXModel.replace_node_output(node, old_output_name, new_output_name) - - def remove_unused_nodes(self): - """Remove unused nodes.""" - unused_nodes = [] - nodes = self.nodes() - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - for node in nodes: - if ( - node.op_type == "Constant" - and node.output[0] not in self.model.graph.output - and node.output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - elif ( - node.op_type == "QuantizeLinear" - and len(self.get_children(node)) == 1 - and self.get_children(node)[0].op_type == "DequantizeLinear" - and node.input[0] not in self._output_name_to_node - and self.get_children(node)[0].output[0] not in self._input_name_to_nodes - ): - unused_nodes.append(node) - unused_nodes.extend(self.get_children(node)) - else: - # remove the node if it does not serve as the input or output of any other nodes - unused = True - for output in node.output: - if output in self._input_name_to_nodes or output in self.output(): - unused = False - break - for input in node.input: - if self.get_initializer(input) is not None: - continue - elif input in self._output_name_to_node or input in self.input(): - unused = False - break - if unused: - unused_nodes.append(node) - self.remove_nodes(unused_nodes) - - ununsed_weights = [] - for w in self.model.graph.initializer: - if w.name not in self._input_name_to_nodes and w.name not in self.model.graph.output: - ununsed_weights.append(w) - # Remove from graph.input - for graph_input in self.graph().input: - if graph_input.name == w.name: - self.graph().input.remove(graph_input) - - self.remove_initializers(ununsed_weights) - self.update() - - def topological_sort(self, enable_subgraph=False): - """Topological sort the model.""" - import copy - from collections import deque - from functools import reduce - - if not enable_subgraph: - 
input_name_to_nodes = {} - output_name_to_node = {} - for node in self.model.graph.node: - for input_name in node.input: - if len(input_name.strip()) != 0: - if input_name not in input_name_to_nodes: - input_name_to_nodes[input_name] = [node] - else: - input_name_to_nodes[input_name].append(node) - for output_name in node.output: - if len(output_name.strip()) != 0: - output_name_to_node[output_name] = node - else: # pragma: no cover - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - input_name_to_nodes = self._input_name_to_nodes - output_name_to_node = self._output_name_to_node - - all_nodes = {} - q = deque() - wait = deque() - for inp in self.model.graph.input: - q.extend(input_name_to_nodes[inp.name]) - for n in self.model.graph.node: - if all([i not in output_name_to_node and i not in self.input() for i in n.input]): - q.append(n) - - while q: - n = q.popleft() - if not all([output_name_to_node[i].name in all_nodes for i in n.input if i in output_name_to_node]): - if n not in wait: - wait.append(n) - continue - - all_nodes[n.name] = n - for out in n.output: - if out in input_name_to_nodes: - q.extend([i for i in input_name_to_nodes[out] if i.name not in all_nodes and i not in q]) - if len(q) == 0 and len(wait) != 0: - q = copy.deepcopy(wait) - wait.clear() - nodes = [i[1] for i in all_nodes.items()] - assert len(list(set([n.name for n in nodes]))) == len(list(set([n.name for n in self.model.graph.node]))) - self.model.graph.ClearField("node") - self.model.graph.node.extend(nodes) - - def get_nodes_chain(self, start, stop, result_chain=[]): - """Get nodes chain with given start node and stop node.""" - from collections import deque - - from onnx import NodeProto - - from neural_compressor.onnxrt.utils.utility import find_by_name - - # process start node list - start_node = deque() - for node in start: - if isinstance(node, str): - start_node.append(node) - elif isinstance(node, NodeProto): - start_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - # process stop node list - stop_node = [] - for node in stop: - if isinstance(node, str): - stop_node.append(node) - elif isinstance(node, NodeProto): - stop_node.append(node.name) - else: - assert False, "'get_nodes_chain' function only support list[string]" "or list[NodeProto] params" - - while start_node: - node_name = start_node.popleft() - if node_name in stop_node: - continue - if node_name not in result_chain: - result_chain.append(node_name) - else: - continue - - node = find_by_name(node_name, list(self.model.graph.node)) - for parent in self.get_parents(node): - start_node.append(parent.name) - - return result_chain - - def find_split_node_for_layer_wise_quantization(self): - """Find split node for layer wise quantization.""" - # find split nodes of decoder blocks - # embed -> decoder.0 -(split_node)-> ... 
-(split_node)-> decoder.n -(split_node)-> norm -> head - # after split: embed -> decoder.0, - # decoder.1, - # decoder.2, - # ..., - # decoder.n, - # norm -> head - start_nodes = [] - for node in self.model.graph.node: - start_node, qkv_nodes_list = None, None - if node.op_type == "SkipLayerNormalization": - start_node = node - qkv_nodes_list = [ - self.match_parent_path( - start_node, - ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, 1, 0, 0, 0], - ), - ] - if node.op_type == "Add": - start_node = node - qkv_nodes_list = [ - # match base attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - self.match_parent_path( - start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] - ), - # match gpt attention no past structure - self.match_parent_path( - start_node, - ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0, 0, 0], - output_name_to_node_dict=self._output_name_to_node, - return_indice=[], - ), - # match bart attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [0, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [1, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["MatMul", "Mul", "MatMul", "Mul", "Div", "Add"], - [None, 0, None, 0, None, 0], - ), - self.match_parent_path( - start_node, - ["MatMul", "Mul", "MatMul", "SimplifiedLayerNormalization", "Add"], - [None, 0, None, 0, 0], - ), - ] - if not start_node: - continue - if not any(qkv_nodes_list): - continue - start_nodes.append(start_node) - return start_nodes - - def find_qkv_in_attention(self, find_all=False): - """Find qkv MatMul in Attention. - - Args: - find_all (bool, optional): find all qkv MatMul. 
Defaults to False - - Returns: - qkv (list): qkv MatMul list - """ - qkv = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for node in self.model.graph.node: - if node.op_type == "Attention": - qkv.append([node.name]) - continue - start_node, qkv_nodes_list = None, None - if node.op_type == "SkipLayerNormalization": - start_node = node - qkv_nodes_list = [ - self.match_parent_path( - start_node, - ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [1, 1, 0, 0, 0], - ), - ] - if node.op_type == "Add": - start_node = node - qkv_nodes_list = [ - # match base attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], - [0, None, 0, 0, 0], - ), - self.match_parent_path( - start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] - ), - # match gpt attention no past structure - self.match_parent_path( - start_node, - ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], - [None, 0, 0, 0, 0, 0], - output_name_to_node_dict=self._output_name_to_node, - return_indice=[], - ), - # match bart attention structure - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [0, None, 0, 0, 0, 0], - ), - self.match_parent_path( - start_node, - ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], - [1, None, 0, 0, 0, 0], - ), - ] - if not start_node: - continue - if not any(qkv_nodes_list): - continue - qkv_nodes = [qkv for qkv in qkv_nodes_list if qkv is not None][-1] - other_inputs = [] - for input in start_node.input: - if input not in self._output_name_to_node: - continue - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - continue - root_input = other_inputs[0] - children = self._input_name_to_nodes[root_input] - children_types = [child.op_type for child in children] - if children_types.count("MatMul") == 3: - qkv.append([child.name for child in children if child.op_type == "MatMul"]) - if not find_all: - break - return qkv - - def find_ffn_matmul(self, attention_index, attention_matmul_list, block_len): - """Find MatMul in FFN. 
- - Args: - attention_index (list): index of Attention - attention_matmul_list (list): list of Attention and MatMul nodes - block_len (int): block length - - Returns: - list: list of MatMul in FFN - """ - ffn_matmul = [] - for idx in range(len(attention_index)): - if idx != len(attention_index) - 1: - index = attention_index[idx + 1] - if index - 2 >= 0: - ffn_matmul.append([attention_matmul_list[index - 2], attention_matmul_list[index - 1]]) - else: - index = attention_index[idx] - if index + block_len - 1 < len(attention_matmul_list): - ffn_matmul.append( - [attention_matmul_list[index + block_len - 2], attention_matmul_list[index + block_len - 1]] - ) - return ffn_matmul - - def export(self, save_path, conf): - """Export Qlinear to QDQ model.""" - from neural_compressor.config import ONNXQlinear2QDQConfig - from neural_compressor.utils.export import onnx_qlinear_to_qdq - - if isinstance(conf, ONNXQlinear2QDQConfig): - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self.model, self._input_name_to_nodes) - self.add_nodes(add_nodes) - self.remove_nodes(remove_nodes) - self.add_initializers(inits) - self.update() - self.remove_unused_nodes() - self.topological_sort() - self.save(save_path) - else: - logger.warning("Unsupported config for export, " "only ONNXQlinear2QDQConfig is supported!") - exit(0) - - def add_tensors_to_outputs(self, tensor_names): - """Add the tensors to the model outputs to gets their values. - - Args: - tensor_names: The names of tensors to be dumped. - """ - added_outputs = [] - for tensor in tensor_names: - if tensor not in self.output(): - added_tensor = onnx.helper.ValueInfoProto() - added_tensor.name = tensor - added_outputs.append(added_tensor) - self.model.graph.output.extend(added_outputs) # pylint: disable=no-member - - def remove_tensors_from_outputs(self, tensor_names): - """Remove the tensors from the model outputs. - - Args: - tensor_names: The names of tensors to be removed. - """ - removed_outputs = [] - for tensor in tensor_names: - if tensor in self.output(): - removed_outputs.append(self.model.graph.output[self.output().index(tensor)]) - for output in removed_outputs: - self.model.graph.output.remove(output) - - def match_first_parent(self, node, parent_op_type, output_name_to_node_dict, exclude=[]): - """Find parent node based on constraints on op_type. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - - Returns: - parent: The matched parent node. None if not found. - index: The input index of matched parent node. None if not found. - """ - for i, input in enumerate(node.input): - if input in output_name_to_node_dict: - parent = output_name_to_node_dict[input] - if parent.op_type == parent_op_type and parent not in exclude: - return parent, i - return None, None - - def match_parent( - self, - node, - parent_op_type, - input_index=None, - output_name_to_node_dict=None, - exclude=[], - return_indice=None, - ): - """Find parent node based on constraints on op_type and index. - - Args: - node (str): current node name. - parent_op_type (str): constraint of parent node op_type. - input_index (int or None): only check the parent given input index of current node. 
- output_name_to_node (dict): dictionary with output name as key, and node as value. - exclude (list): list of nodes that are excluded (not allowed to match as parent). - return_indice (list): a list to append the input index when input_index is None. - - Returns: - parent: The matched parent node. - """ - assert node is not None - assert input_index is None or input_index >= 0 - - if output_name_to_node_dict is None: - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - output_name_to_node_dict = self._output_name_to_node - - if input_index is None: - parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node_dict, exclude) - if return_indice is not None: - return_indice.append(index) - return parent - - if input_index >= len(node.input): - return None - - parent = self.get_parent(node, input_index, output_name_to_node_dict) - if parent is not None and parent.op_type == parent_op_type and parent not in exclude: - return parent - - return None - - def match_parent_path( - self, - node, - parent_op_types, - parent_input_index, - output_name_to_node_dict=None, - return_indice=None, - ): - """Find a sequence of input edges based on constraints on parent op_type and index. - - Args: - node (str): current node name. - parent_op_types (str): constraint of parent node op_type of each input edge. - parent_input_index (list): constraint of input index of each input edge. - None means no constraint. - output_name_to_node (dict): dictionary with output name as key, and node as value. - return_indice (list): a list to append the input index when there is - no constraint on input index of an edge. - - Returns: - parents: a list of matched parent node. - """ - assert len(parent_input_index) == len(parent_op_types) - - if output_name_to_node_dict is None: - if len(self._output_name_to_node) == 0: - self._output_name_to_node = self.output_name_to_node() - output_name_to_node_dict = self._output_name_to_node - - current_node = node - matched_parents = [] - for i, op_type in enumerate(parent_op_types): - matched_parent = self.match_parent( - current_node, - op_type, - parent_input_index[i], - output_name_to_node_dict, - exclude=[], - return_indice=return_indice, - ) - if matched_parent is None: - return None - - matched_parents.append(matched_parent) - current_node = matched_parent - - return matched_parents - - def is_smoothquant_model(self): - """Check the model is smooth quantized or not. - - Returns: - bool: the model is smooth quantized or not. - """ - for init in self.model.graph.initializer: - if "_smooth_scale" in init.name: - return True - return False - - def find_split_nodes(self): - """Find split nodes for layer-wise quantization.""" - split_nodes = self.find_split_node_for_layer_wise_quantization() - return split_nodes - - def split_model_with_node(self, split_node_name, path_of_model_to_split, save_both_split_models=True): - """Split model into two parts at a given node. - - Args: - split_node_name (str): name of the node where the model is split at> - path_of_model_to_split (str): path of model to be split. - save_both_split_models (bool): whether to save the two split models. - False means only save the first split model. - True means save both the two split models. - Default id True. - - Returns: - tuple: the first split model, the second split model - """ - # origin model : ... -> node_1 -> split_node -> node_2 -> ... - # split model 1: ... -> node_1 -> split_node - # split model 2: node_2 -> ... 
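# A minimal usage sketch of the splitting flow above, assuming a placeholder
# path "model.onnx": find_split_nodes() returns the decoder-boundary NodeProtos
# and split_model_with_node() cuts the graph at the first of them so each part
# can be loaded and quantized on its own.
import onnx

from neural_compressor.onnxrt.utils.onnx_model import ONNXModel

wrapped = ONNXModel(onnx.load("model.onnx"))
split_nodes = wrapped.find_split_nodes()      # decoder-block boundary nodes
part_1, part_2 = wrapped.split_model_with_node(
    split_node_name=split_nodes[0].name,      # cut after the first decoder block
    path_of_model_to_split="model.onnx",
    save_both_split_models=True,              # persist both halves next to the model
)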
- - # remove nodes which are not followed by other nodes - unvalid_nodes = [ - i - for i in self.model.graph.node - if all(out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output) - ] - while len(unvalid_nodes) > 0: - self.remove_nodes(unvalid_nodes) - self._input_name_to_nodes = self.input_name_to_nodes() - unvalid_nodes = [ - i - for i in self.model.graph.node - if all([out not in self._input_name_to_nodes and not self.is_graph_output(out) for out in i.output]) - ] - self.topological_sort() - - split_model_part_1 = onnx.ModelProto() - split_model_part_1.CopyFrom(self.model) - split_model_part_1.graph.ClearField("node") - - split_model_part_2 = onnx.ModelProto() - split_model_part_2.CopyFrom(self.model) - split_model_part_2.graph.ClearField("node") - - split_node_output = None - part_idx = 1 - for node in self.model.graph.node: - if part_idx == 1: - split_model_part_1.graph.node.append(node) - elif part_idx == 2: - split_model_part_2.graph.node.append(node) - - if node.name == split_node_name: - split_node_output = node.output - part_idx = 2 - - assert len(split_node_output) == 1, ( - "Only support split at node with 1 output tensor, while " - "current split node {} has {} output tensors".format(split_node_name, len(split_node_output)) - ) - split_tensor_name = split_node_output[0] - - split_tensor_type, split_tensor_shape = self._get_output_type_shape_by_tensor_name(split_tensor_name) - split_tensor = onnx.helper.make_tensor_value_info(split_tensor_name, split_tensor_type, split_tensor_shape) - - split_model_part_1.graph.output.append(split_tensor) - split_model_part_2.graph.input.append(split_tensor) - - split_model_part_1 = ONNXModel(split_model_part_1, ignore_warning=True) - split_model_part_2 = ONNXModel(split_model_part_2, ignore_warning=True) - - # remove unused input & output - split_model_part_1._remove_unused_input_output() - split_model_part_2._remove_unused_input_output() - - insert_output_for_model_1 = [] - insert_input_for_model_2 = [] - for output in split_model_part_1._output_name_to_node.keys(): - if output in split_model_part_2._input_name_to_nodes.keys(): - output_type, output_shape = self._get_output_type_shape_by_tensor_name(output) - output_tensor = onnx.helper.make_tensor_value_info(output, output_type, output_shape) - if output_tensor not in split_model_part_1.model.graph.output: - insert_output_for_model_1.append(output_tensor) - if output_tensor not in split_model_part_2.model.graph.input: - insert_input_for_model_2.append(output_tensor) - - # insert model 1 output - for output in insert_output_for_model_1: - split_model_part_1.model.graph.output.append(output) - - # insert model 2 input - for input in insert_input_for_model_2: - split_model_part_2.model.graph.input.append(input) - - # remove unused init - split_model_part_1.remove_unused_init() - split_model_part_2.remove_unused_init() - - split_model_part_1.update() - split_model_part_2.update() - - dir_of_model_to_split = os.path.dirname(path_of_model_to_split) - - split_model_part_1.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_1_path = os.path.join(dir_of_model_to_split, "split_model_part_1.onnx") - split_model_part_1.model_path = split_model_part_1_path - split_model_part_1._save_split_model(split_model_part_1_path) - split_model_part_1.check_is_large_model() - logger.debug("save split model part 1 to {} for layer wise quantization".format(split_model_part_1_path)) - - if save_both_split_models: - 
split_model_part_2.load_model_initializer_by_tensor(dir_of_model_to_split) - split_model_part_2_path = os.path.join(dir_of_model_to_split, "split_model_part_2.onnx") - split_model_part_2.model_path = split_model_part_2_path - split_model_part_2._save_split_model(split_model_part_2_path) - split_model_part_2.check_is_large_model() - logger.debug("save split model part 2 to {} for layer wise quantization".format(split_model_part_2_path)) - return split_model_part_1, split_model_part_2 - else: - return split_model_part_1, split_model_part_2 - - def _save_split_model(self, save_path): - """Save split model as external data for layer wise quantization. - - Args: - save_path (str): the path to save the split model - """ - if os.path.exists(save_path + "_data"): - os.remove(save_path + "_data") - onnx.save_model( - self.model, - save_path, - save_as_external_data=True, - all_tensors_to_one_file=True, - location=save_path.split("/")[-1] + "_data", - size_threshold=1024, - convert_attribute=False, - ) - - def _get_output_type_shape_by_tensor_name(self, tensor_name): - """Get output type and shape with a tensor name. - - Args: - tensor_name (str): name of a tensor - - Returns: - tuple: output type and shape - """ - elem_type = onnx.TensorProto.FLOAT - shape = None - for output in self.model.graph.value_info: - if output.name == tensor_name: - elem_type = output.type.tensor_type.elem_type - shape = [ - dim.dim_value if dim.HasField("dim_value") else -1 for dim in output.type.tensor_type.shape.dim - ] - break - return elem_type, shape - - def _remove_unused_input_output(self): - """Remove unused input & output for split model.""" - remove_outputs = [] - remove_inputs = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for output in self.model.graph.output: - if output.name not in self._output_name_to_node.keys(): - remove_outputs.append(output) - - for input in self.model.graph.input: - if input.name not in self._input_name_to_nodes.keys(): - remove_inputs.append(input) - - for output in remove_outputs: - self.model.graph.output.remove(output) - for input in remove_inputs: - self.model.graph.input.remove(input) - - def remove_unused_init(self): - """Remove unused init.""" - remov_inits = [] - if len(self._input_name_to_nodes) == 0: - self._input_name_to_nodes = self.input_name_to_nodes() - for init in self.model.graph.initializer: - if init.name not in self._input_name_to_nodes.keys(): - remov_inits.append(init) - self.remove_initializers(remov_inits) - - def load_model_initializer_by_tensor(self, data_path=None): - """Load model initializer by tensor. - - Args: - data_path (str, optional): the directory of saved initializer. Defaults to None. - """ - from onnx.external_data_helper import load_external_data_for_tensor - - if data_path is None: - data_path = os.path.dirname(self._model_path) - for init in self.model.graph.initializer: - if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: - load_external_data_for_tensor(init, data_path) - - def write_external_data_to_new_location(self, external_data_location="external.data", overwrite=False): - """Write external data of merged quantized model to new location to save memory. - - Args: - external_data_location (str, optional): external data location of merged quantized model. - Defaults to "external.data". - overwrite (bool, optional): if True, remove existed externa data. Defaults to False. 
- """ - from onnx.external_data_helper import convert_model_to_external_data, write_external_data_tensors - - if overwrite and os.path.exists(os.path.join(os.path.dirname(self._model_path), external_data_location)): - os.remove(os.path.join(os.path.dirname(self._model_path), external_data_location)) - self.load_model_initializer_by_tensor() - convert_model_to_external_data(self.model, location=external_data_location) - # TODO : if init is already saved, skip write it - write_external_data_tensors(self.model, filepath=os.path.dirname(self._model_path)) - - def merge_split_models(self, to_merge_model): - """Merge two split model into final model.""" - to_merge_model.write_external_data_to_new_location() - self.add_nodes([node for node in to_merge_model.nodes()]) - self.add_initializers([init for init in to_merge_model.initializer()]) - self.update() - - # add new output - for output in to_merge_model.graph().output: - if output.name not in self.output(): - self.model.graph.output.append(output) - - # remove unused output - remove_output = [] - for output in self.model.graph.output: - if output.name in to_merge_model.input(): - remove_output.append(output) - for output in remove_output: - self.model.graph.output.remove(output) - - # add new input - for input in to_merge_model.graph().input: - if ( - input.name not in self.input() - and input.name not in self.output() - and input.name not in self._output_name_to_node.keys() - ): - self.model.graph.input.append(input) - - def re_org_output(self, origin_output): - """Re-org output of merged model for layer-wise quantization.""" - outputs = {} - tmp_remove = [] - for output in self.model.graph.output: - outputs[output.name] = output - tmp_remove.append(output) - - for output in tmp_remove: - self.model.graph.output.remove(output) - - for out_name in origin_output: - self.model.graph.output.append(outputs[out_name]) diff --git a/neural_compressor/onnxrt/utils/utility.py b/neural_compressor/onnxrt/utils/utility.py deleted file mode 100644 index 21678717229..00000000000 --- a/neural_compressor/onnxrt/utils/utility.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
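# Worked example (illustrative numbers only) of the scale/zero-point arithmetic
# implemented by _calculate_scale_zp() and quantize_data() below, using the
# r = S(q - z) convention from their docstrings:
#   data = [-1.5, 0.0, 3.0]  ->  rmin = -1.5, rmax = 3.0
#   asymmetric uint8 (quantize_range = 255):
#     scale      = (rmax - rmin) / 255        = 4.5 / 255 ~= 0.01765
#     zero_point = round((0 - rmin) / scale)  = round(85.0) = 85
#     q          = round(x / scale) + zero_point  ->  [0, 85, 255]
#   symmetric int8 (quantize_range = 254):
#     scale      = 2 * max(|rmin|, |rmax|) / 254 = 6.0 / 254 ~= 0.02362
#     zero_point = 0,  q(3.0) = round(3.0 / scale) = 127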
- -from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union - -import numpy as np -import onnx -import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer -from packaging.version import Version - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - -__all__ = [ - "ONNXRT116_VERSION", - "ONNXRT1161_VERSION", - "algos_mapping", - "WHITE_MODULE_LIST", - "MAXIMUM_PROTOBUF", - "PRIORITY_RTN", - "PRIORITY_GPTQ", - "PRIORITY_AWQ", - "PRIORITY_SMOOTH_QUANT", - "dtype_mapping", - "find_by_name", - "simple_progress_bar", - "register_algo", - "get_model_info", - "is_B_transposed", - "get_qrange_for_qType", - "quantize_data", - "check_model_with_infer_shapes", -] - -ONNXRT116_VERSION = Version("1.16.0") -ONNXRT1161_VERSION = Version("1.16.1") - -# Dictionary to store a mapping between algorithm names and corresponding algo implementation(function) -algos_mapping: Dict[str, Callable] = {} - -# All constants for onnxrt -WHITE_MODULE_LIST = ["MatMul", "Conv"] - -MAXIMUM_PROTOBUF = 2147483648 - -PRIORITY_RTN = 60 -PRIORITY_GPTQ = 70 -PRIORITY_AWQ = 50 -PRIORITY_SMOOTH_QUANT = 80 - -dtype_mapping = { - "fp32": 1, - "float32": 1, - "uint8": 2, - "int8": 3, - "uint16": 4, - "int16": 5, - "int32": 6, - "int64": 7, - "string": 8, - "bool": 9, - "fp16": 10, - "float16": 10, - "double": 11, - "uint32": 12, - "uint64": 13, - "complex64": 14, - "complex128": 15, - "bf16": 16, - "bfloat16": 16, -} - - -def find_by_name(name, item_list): - """Helper function to find item by name in a list.""" - items = [] - for item in item_list: - assert hasattr(item, "name"), "{} should have a 'name' attribute defined".format(item) # pragma: no cover - if item.name == name: - items.append(item) - if len(items) > 0: - return items[0] - else: - return None - - -def simple_progress_bar(total, i): - """Progress bar for cases where tqdm can't be used.""" - progress = i / total - bar_length = 20 - bar = "#" * int(bar_length * progress) - spaces = " " * (bar_length - len(bar)) - percentage = progress * 100 - print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="") - - -def register_algo(name): - """Decorator function to register algorithms in the algos_mapping dictionary. - - Usage example: - @register_algo(name=example_algo) - def example_algo(model: Union[onnx.ModelProto, Path, str], - quant_config: RTNConfig) -> onnx.ModelProto: - ... - - Args: - name (str): The name under which the algorithm function will be registered. - - Returns: - decorator: The decorator function to be used with algorithm functions. 
- """ - - def decorator(algo_func): - algos_mapping[name] = algo_func - return algo_func - - return decorator - - -def get_model_info( - model: Union[onnx.ModelProto, Path, str], white_op_type_list: List[Callable] -) -> List[Tuple[str, Callable]]: - if not isinstance(model, onnx.ModelProto): - model = onnx.load(model) - filter_result = [] - filter_result_set = set() - for node in model.graph.node: - if node.op_type in white_op_type_list: - pair = (node.name, node.op_type) - if pair not in filter_result_set: - filter_result_set.add(pair) - filter_result.append(pair) - logger.debug(f"Get model info: {filter_result}") - return filter_result - - -def is_B_transposed(node): - """Whether inuput B is transposed.""" - transB = [attr for attr in node.attribute if attr.name == "transB"] - if len(transB): - return 0 < onnx.helper.get_attribute_value(transB[0]) - return False - - -def get_qrange_for_qType(qType, reduce_range=False): - """Helper function to get the quantization range for a type. - - Args: - qType (int): data type - reduce_range (bool, optional): use 7 bit or not. Defaults to False. - """ - if qType == onnx.onnx_pb.TensorProto.UINT8: - return 127 if reduce_range else 255 - elif qType == onnx.onnx_pb.TensorProto.INT8: - # [-64, 64] for reduce_range, and [-127, 127] full_range. - return 128 if reduce_range else 254 - else: - raise ValueError("unsupported quantization data type") - - -def _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point): - """Quantize data with scale and zero point. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - - Args: - data (np.array): data to quantize - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. 
- scale (float): computed scale of quantized data - zero_point (uint8 or int8): computed zero point of quantized data - """ - data = np.asarray(data) - if qType == onnx.onnx_pb.TensorProto.INT8 and scheme == "sym": - # signed byte type - quantized_data = (data.astype(np.float32) / scale).round().astype("b") - elif qType == onnx.onnx_pb.TensorProto.UINT8 and scheme == "asym": - quantized_data = ((data.astype(np.float32) / scale).round() + zero_point).astype("B") - else: - raise ValueError("Unexpected combination of data type {} and scheme {}.".format(qType, scheme)) - return quantized_data - - -def _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme): - """Calculate scale and zero point.""" - if isinstance(rmax, np.ndarray): - if scheme == "sym": - max_range = np.maximum(abs(rmin), abs(rmax)) - scale = np.ones(rmax.shape, dtype="float32") - scale[max_range > 0] = np.array( - [float(i) / quantize_range for i in (max_range[max_range > 0] * 2.0).flatten().tolist()], - dtype="float32", - ) - else: - scale = np.ones(rmax.shape, dtype="float32") - scale[rmin != rmax] = np.array( - [float(i) / quantize_range for i in (rmax - rmin)[rmin != rmax].flatten().tolist()], dtype="float32" - ) - - if scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8: - zero_point = np.zeros(scale.shape, dtype="int8") if isinstance(scale, np.ndarray) else 0 - elif isinstance(scale, np.ndarray) and (scale == 1).all(): - zero_point = ( - np.zeros(scale.shape, dtype="int8") - if qType == onnx.onnx_pb.TensorProto.INT8 - else np.zeros(scale.shape, dtype="uint8") - ) - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = np.maximum(0, np.minimum(255, ((0 - float(rmin)) / scale).round()).round()).astype("uint8") - else: - zero_point = ( - (-64 - rmin) / float(scale) if quantize_range == 128 else (-127 - rmin) / float(scale) - ).round() - - else: - if scheme == "sym": - max_range = max(abs(rmin), abs(rmax)) - scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1 - else: - scale = (float(rmax) - float(rmin)) / quantize_range if rmin != rmax else 1 - - if scale == 1 or (scheme == "sym" and qType == onnx.onnx_pb.TensorProto.INT8): - zero_point = 0 - elif qType == onnx.onnx_pb.TensorProto.UINT8: - zero_point = round((0 - float(rmin)) / scale) - zero_point = np.uint8(round(max(0, min(255, zero_point)))) - else: - zero_point = ( - round((-64 - float(rmin)) / scale) if quantize_range == 128 else round((-127 - float(rmin)) / scale) - ) - return scale, zero_point - - -def quantize_data(data, quantize_range, qType, scheme): - """Quantize data. - - To pack weights, we compute a linear transformation - - when data type == uint8 mode, from [rmin, rmax] -> [0, 2^{b-1}] and - - when data type == int8, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where - m = max(abs(rmin), abs(rmax)) - and add necessary intermediate nodes to transform quantized weight to full weight - using the equation r = S(q-z), where - r: real original value - q: quantized value - S: scale - z: zero point - - Args: - data (array): data to quantize - quantize_range (list): list of data to weight pack. - qType (int): data type to quantize to. Supported types UINT8 and INT8 - scheme (string): sym or asym quantization. 
- """ - rmin = min(min(data), 0) - rmax = max(max(data), 0) - - scale, zero_point = _calculate_scale_zp(rmin, rmax, quantize_range, qType, scheme) - quantized_data = _quantize_data_with_scale_zero(data, qType, scheme, scale, zero_point) - return rmin, rmax, zero_point, scale, quantized_data - - -def check_model_with_infer_shapes(model): - """Check if the model has been shape inferred.""" - from neural_compressor.onnxrt.utils.onnx_model import ONNXModel - - if isinstance(model, (Path, str)): - model = onnx.load(model, load_external_data=False) - elif isinstance(model, ONNXModel): - model = model.model - if len(model.graph.value_info) > 0: - return True - return False diff --git a/requirements_ort.txt b/requirements_ort.txt deleted file mode 100644 index 23f608859d1..00000000000 --- a/requirements_ort.txt +++ /dev/null @@ -1,9 +0,0 @@ -numpy < 2.0 -onnx -onnxruntime -onnxruntime-extensions -prettytable -psutil -py-cpuinfo -pydantic -transformers diff --git a/setup.py b/setup.py index f1a9c9b22f6..bb23ac7866a 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def get_build_version(): assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) PKG_INSTALL_CFG = { - # overall install config for build from source, python setup.py install + # overall installation config, pip install neural-compressor "neural_compressor": { "project_name": "neural_compressor", "include_packages": find_packages( @@ -53,33 +53,12 @@ def get_build_version(): ), "package_data": {"": ["*.yaml"]}, "install_requires": fetch_requirements("requirements.txt"), - }, - # 2.x binary build config, pip install neural-compressor - "neural_compressor_2x": { - "project_name": "neural_compressor", - "include_packages": find_packages( - include=["neural_compressor", "neural_compressor.*"], - exclude=[ - "neural_compressor.template", - "neural_compressor.common", - "neural_compressor.common.*", - "neural_compressor.torch", - "neural_compressor.torch.*", - "neural_compressor.tensorflow", - "neural_compressor.tensorflow.*", - "neural_compressor.onnxrt", - "neural_compressor.onnxrt.*", - ], - ), - "package_data": {"": ["*.yaml"]}, - "install_requires": fetch_requirements("requirements.txt"), "extras_require": { - "pt": [f"neural_compressor_3x_pt=={__version__}"], - "tf": [f"neural_compressor_3x_tf=={__version__}"], - "ort": [f"neural_compressor_3x_ort=={__version__}"], + "pt": fetch_requirements("requirements_pt.txt"), + "tf": fetch_requirements("requirements_tf.txt"), }, }, - # 3.x pt binary build config, pip install neural-compressor[pt], install 2.x API + 3.x PyTorch API. + # 3.x pt binary build config, pip install neural-compressor-pt, install 3.x PyTorch API. "neural_compressor_3x_pt": { "project_name": "neural_compressor_3x_pt", "include_packages": find_packages( @@ -92,7 +71,7 @@ def get_build_version(): ), "install_requires": fetch_requirements("requirements_pt.txt"), }, - # 3.x tf binary build config, pip install neural-compressor[tf], install 2.x API + 3.x TensorFlow API. + # 3.x tf binary build config, pip install neural-compressor-tf, install 3.x TensorFlow API. "neural_compressor_3x_tf": { "project_name": "neural_compressor_3x_tf", "include_packages": find_packages( @@ -106,19 +85,6 @@ def get_build_version(): "package_data": {"": ["*.yaml"]}, "install_requires": fetch_requirements("requirements_tf.txt"), }, - # 3.x ort binary build config, pip install neural-compressor[ort], install 2.x API + 3.x ONNXRT API. 
- "neural_compressor_3x_ort": { - "project_name": "neural_compressor_3x_ort", - "include_packages": find_packages( - include=[ - "neural_compressor.common", - "neural_compressor.common.*", - "neural_compressor.onnxrt", - "neural_compressor.onnxrt.*", - ], - ), - "install_requires": fetch_requirements("requirements_ort.txt"), - }, } @@ -131,10 +97,6 @@ def get_build_version(): ext_modules = [] cmdclass = {} - if "2x" in sys.argv: - sys.argv.remove("2x") - cfg_key = "neural_compressor_2x" - if "pt" in sys.argv: sys.argv.remove("pt") cfg_key = "neural_compressor_3x_pt" @@ -143,10 +105,6 @@ def get_build_version(): sys.argv.remove("tf") cfg_key = "neural_compressor_3x_tf" - if "ort" in sys.argv: - sys.argv.remove("ort") - cfg_key = "neural_compressor_3x_ort" - if bool(os.getenv("USE_FP8_CONVERT", False)): from torch.utils.cpp_extension import BuildExtension, CppExtension diff --git a/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py b/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py deleted file mode 100644 index c8e7584ee7f..00000000000 --- a/test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -import shutil -import unittest -from copy import deepcopy - -import onnx -import onnxruntime as ort -import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer -import torch -from optimum.exporters.onnx import main_export -from transformers import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? 
NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestLayerWiseQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - # onnx model exported with transformers>=4.38.0 is different with low version - # which will cause layer-wise quant ut to fail - # limit transformers to 4.37.2 - # TODO: remove transformers version limitation - llama_id = "yujiepan/llama-2-tiny-3layers-random" - main_export(llama_id, output="llama-2-tiny-3layers-random", task="text-generation") - model_path = find_onnx_file("llama-2-tiny-3layers-random") - - model = onnx.load(model_path) - model = symbolic_shape_infer.SymbolicShapeInference.infer_shapes(model, auto_merge=True) - infer_shape_model_path = "llama-2-tiny-3layers-random/model-infer-shape.onnx" - onnx.save(model, infer_shape_model_path) - - sess_options = ort.SessionOptions() - sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = "llama-2-tiny-3layers-random/optimized_model.onnx" - ort.InferenceSession(infer_shape_model_path, sess_options) - - self.llama = "llama-2-tiny-3layers-random/optimized_model.onnx" - self.calibration_data_reader = DummyNLPDataloader(llama_id) - - @classmethod - def tearDownClass(self): - shutil.rmtree("llama-2-tiny-3layers-random", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestLayerWiseQuant test: {self.id()}") - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _get_quantized_matmul_weight(self, model, matmul_name): - weight_init_name = None - for node in model.graph.node: - if node.name == matmul_name: - weight_init_name = node.input[1] - if weight_init_name is None: - return None - - weight_init = None - for init in model.graph.initializer: - if init.name == weight_init_name: - weight_init = onnx.numpy_helper.to_array(init) - return weight_init - - def _apply_quantize(self, quant_config, data_reader=None): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.llama - if data_reader is None: - qmodel = _quantize(fp32_model, quant_config) - else: - qmodel = _quantize(fp32_model, quant_config, data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_rtn_layer_wise(self): - from neural_compressor.onnxrt.quantization import RTNConfig - - rtn_config = RTNConfig(layer_wise_quant=True) - qmodel_lwq = self._apply_quantize(rtn_config) - self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) - - rtn_config = RTNConfig(layer_wise_quant=False) - qmodel = self._apply_quantize(rtn_config) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") - 
self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") - self.assertIsNotNone(quantized_weight) - self.assertTrue((lwq_quantized_weight == quantized_weight).all()) - - def test_gptq_layer_wise(self): - from neural_compressor.onnxrt.quantization import GPTQConfig - - self.calibration_data_reader.rewind() - gptq_config = GPTQConfig(layer_wise_quant=True) - qmodel_lwq = self._apply_quantize(gptq_config, self.calibration_data_reader) - self.assertTrue(self._check_model_is_quantized(qmodel_lwq)) - - self.calibration_data_reader.rewind() - gptq_config = GPTQConfig(layer_wise_quant=False) - qmodel = self._apply_quantize(gptq_config, self.calibration_data_reader) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - lwq_quantized_weight = self._get_quantized_matmul_weight(qmodel_lwq, "/lm_head/MatMul_Q4") - self.assertIsNotNone(lwq_quantized_weight) - quantized_weight = self._get_quantized_matmul_weight(qmodel, "/lm_head/MatMul_Q4") - self.assertIsNotNone(quantized_weight) - self.assertTrue((lwq_quantized_weight == quantized_weight).all()) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_awq.py b/test/3x/onnxrt/quantization/weight_only/test_awq.py deleted file mode 100644 index 1587399f9ca..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_awq.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import shutil -import unittest - -import torch -from optimum.exporters.onnx import main_export -from transformers import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? 
NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestAWQQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - self.calibration_data_reader = DummyNLPDataloader("hf-internal-testing/tiny-random-gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestAWQQuant test: {self.id()}") - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _apply_awq(self, quant_config): - logger.info(f"Test AWQ with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config, calibration_data_reader=self.calibration_data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_awq_params_combination(self): - from neural_compressor.onnxrt import AWQConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
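# Descriptive note on the assertions below: after weight-only quantization the
# second input (the packed weight) of every quantized MatMul gets a name ending
# in _Q{bits}G{group_size} (e.g. "_Q4G32"), which is what _count_woq_matmul()
# above keys on; the tiny GPT-J export used here yields 30 such MatMuls, and 29
# once one node is forced back to fp32 via set_local().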
- awq_options = { - "weight_dtype": ["int"], - "weight_bits": [4, 3, 8], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - "accuracy_level": [0], - "enable_auto_scale": [True, False], - "enable_mse_search": [True, False], - } - from itertools import product - - keys = AWQConfig.params_list - for value in product(*awq_options.values()): - d = dict(zip(keys, value)) - print(d) - quant_config = AWQConfig(**d) - qmodel = self._apply_awq(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_awq_config(self): - from neural_compressor.onnxrt.quantization import AWQConfig - - awq_config1 = AWQConfig(weight_bits=4) - quant_config_dict = { - "awq": {"weight_bits": 4}, - } - awq_config2 = AWQConfig.from_dict(quant_config_dict["awq"]) - self.assertEqual(awq_config1.to_dict(), awq_config2.to_dict()) - - def test_quantize_awq_from_dict_default(self): - from neural_compressor.onnxrt import get_default_awq_config - - qmodel = self._apply_awq(quant_config=get_default_awq_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_awq_from_dict_beginner(self): - quant_config = { - "awq": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_awq_from_class_beginner(self): - from neural_compressor.onnxrt import AWQConfig - - quant_config = AWQConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_awq_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import AWQConfig - - fp32_config = AWQConfig(weight_dtype="fp32") - quant_config = AWQConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_awq_from_dict_advance(self): - quant_config = { - "awq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - quant_config = { - "awq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = self._apply_awq(quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_gptq.py b/test/3x/onnxrt/quantization/weight_only/test_gptq.py deleted file mode 100644 index 4309af4e654..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_gptq.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -import shutil -import unittest - -import torch -from optimum.exporters.onnx import main_export -from transformers 
import AutoTokenizer - -from neural_compressor.common import Logger -from neural_compressor.onnxrt.quantization.calibrate import CalibrationDataReader - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class DummyNLPDataloader(CalibrationDataReader): - def __init__(self, model_name): - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.sequence_a = "intel-extension-for-transformers is based in SH" - self.sequence_b = "Where is intel-extension-for-transformers based? NYC or SH" - - self.encoded_list = [] - encoded_input = dict(self.tokenizer(self.sequence_a, self.sequence_b, return_tensors="pt")) - input_shape = encoded_input["input_ids"].shape - encoded_input["position_ids"] = ( - torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ) - - # convert torch tensor to numpy - for input_name, input_value in encoded_input.items(): - if isinstance(input_value, torch.Tensor): - encoded_input[input_name] = input_value.numpy() - - self.encoded_list.append(encoded_input) - self.iter_next = iter(self.encoded_list) - - def get_next(self): - return next(self.iter_next, None) - - def rewind(self): - self.iter_next = iter(self.encoded_list) - - -class TestGPTQQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - self.calibration_data_reader = DummyNLPDataloader("hf-internal-testing/tiny-random-gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestGPTQQuant test: {self.id()}") - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _apply_gptq(self, quant_config): - logger.info(f"Test GPTQ with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config, calibration_data_reader=self.calibration_data_reader) - self.assertIsNotNone(qmodel) - return qmodel - - def test_gptq_params_combination(self): - from neural_compressor.onnxrt import GPTQConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
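# The GPTQ-specific options swept below are, roughly: percdamp (fraction of the
# average Hessian diagonal added for numerical stability), blocksize (number of
# columns updated per block), actorder (reorder columns by activation before
# quantizing), mse (use an MSE search when clipping), and perchannel
# (per-channel scales); the removed GPTQConfig in config.py carries the
# authoritative descriptions.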
- gptq_options = { - "weight_dtype": ["int"], - "weight_bits": [4], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - "accuracy_level": [0], - "percdamp": [0.01], - "blocksize": [128], - "actorder": [True, False], - "mse": [True, False], - "perchannel": [True, False], - } - from itertools import product - - keys = GPTQConfig.params_list - for value in product(*gptq_options.values()): - d = dict(zip(keys, value)) - print(d) - quant_config = GPTQConfig(**d) - qmodel = self._apply_gptq(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_gptq_config(self): - from neural_compressor.onnxrt.quantization import GPTQConfig - - gptq_config1 = GPTQConfig(weight_bits=4) - quant_config_dict = { - "gptq": {"weight_bits": 4}, - } - gptq_config2 = GPTQConfig.from_dict(quant_config_dict["gptq"]) - self.assertEqual(gptq_config1.to_dict(), gptq_config2.to_dict()) - - def test_quantize_gptq_from_dict_default(self): - from neural_compressor.onnxrt import get_default_gptq_config - - qmodel = self._apply_gptq(quant_config=get_default_gptq_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_gptq_from_dict_beginner(self): - quant_config = { - "gptq": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_gptq_from_class_beginner(self): - from neural_compressor.onnxrt import GPTQConfig - - quant_config = GPTQConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_gptq_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import GPTQConfig - - fp32_config = GPTQConfig(weight_dtype="fp32") - quant_config = GPTQConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_gptq_from_dict_advance(self): - quant_config = { - "gptq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - quant_config = { - "gptq": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = self._apply_gptq(quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/quantization/weight_only/test_rtn.py b/test/3x/onnxrt/quantization/weight_only/test_rtn.py deleted file mode 100644 index 11a05bc48da..00000000000 --- a/test/3x/onnxrt/quantization/weight_only/test_rtn.py +++ /dev/null @@ -1,193 +0,0 @@ -import os -import shutil -import unittest - -from 
optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -class TestRTNQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - - def setUp(self): - # print the test name - logger.info(f"Running ONNXRT TestRTNQuant test: {self.id()}") - - def _check_model_is_quantized(self, model): - node_optypes = [node.op_type for node in model.graph.node] - return "MatMulNBits" in node_optypes or "MatMulFpQ4" in node_optypes - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def _apply_rtn(self, quant_config): - logger.info(f"Test RTN with config {quant_config}") - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - return qmodel - - def test_rtn_params_combination(self): - from neural_compressor.onnxrt import RTNConfig - - # some tests were skipped to accelerate the CI - # TODO: check params combination. - # TODO: Add number check for group_size. 
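# Unlike the AWQ/GPTQ tests, RTN (round-to-nearest) needs no calibration data,
# which is why _apply_rtn() above calls _quantize() without a
# calibration_data_reader.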
- rtn_options = { - "weight_dtype": ["int"], - "weight_bits": [4, 3, 8], - "weight_group_size": [32], - "weight_sym": [True, False], - "act_dtype": ["fp32"], - } - from itertools import product - - keys = RTNConfig.params_list - for value in product(*rtn_options.values()): - d = dict(zip(keys, value)) - quant_config = RTNConfig(**d) - qmodel = self._apply_rtn(quant_config) - self.assertEqual(self._count_woq_matmul(qmodel, bits=value[1], group_size=value[2]), 30) - - def test_rtn_config(self): - from neural_compressor.onnxrt.quantization import RTNConfig - - rtn_config1 = RTNConfig(weight_bits=4) - quant_config_dict = { - "rtn": {"weight_bits": 4}, - } - rtn_config2 = RTNConfig.from_dict(quant_config_dict["rtn"]) - self.assertEqual(rtn_config1.to_dict(), rtn_config2.to_dict()) - - def test_quantize_rtn_from_dict_default(self): - from neural_compressor.onnxrt import get_default_rtn_config - from neural_compressor.onnxrt.quantization.quantize import _quantize - - qmodel = self._apply_rtn(quant_config=get_default_rtn_config()) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_rtn_from_dict_beginner(self): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - quant_config = { - "rtn": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - qmodel = self._apply_rtn(quant_config) - self.assertIsNotNone(qmodel) - self.assertIsNotNone(qmodel) - self.assertTrue(self._check_model_is_quantized(qmodel)) - - def test_quantize_rtn_from_class_beginner(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - quant_config = RTNConfig(weight_bits=4, weight_group_size=32) - qmodel = self._apply_rtn(quant_config) - self.assertIsNotNone(qmodel) - - def test_quantize_rtn_fallback_from_class_beginner(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_config = RTNConfig(weight_dtype="fp32") - fp32_model = self.gptj - quant_config = RTNConfig( - weight_bits=4, - weight_dtype="int", - weight_sym=False, - weight_group_size=32, - ) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fp32_config) - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_quantize_rtn_from_dict_advance(self): - from neural_compressor.onnxrt.quantization.quantize import _quantize - - fp32_model = self.gptj - quant_config = { - "rtn": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "fp32", - } - }, - } - } - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - fp32_model = self.gptj - quant_config = { - "rtn": { - "global": { - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_bits": 8, - "weight_group_size": 32, - } - }, - } - } - qmodel = _quantize(fp32_model, quant_config) - self.assertIsNotNone(qmodel) - for node in qmodel.graph.node: - if node.name == "/h.4/mlp/fc_out/MatMul": - self.assertTrue(node.input[1].endswith("Q8G32")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/requirements.txt 
b/test/3x/onnxrt/requirements.txt deleted file mode 100644 index 4165ba5e0a6..00000000000 --- a/test/3x/onnxrt/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -optimum -pytest diff --git a/test/3x/onnxrt/test_autotune.py b/test/3x/onnxrt/test_autotune.py deleted file mode 100644 index 8291d3ef344..00000000000 --- a/test/3x/onnxrt/test_autotune.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import os -import shutil -import unittest -from typing import Callable, Dict, List, Optional, Union -from unittest.mock import patch - -import numpy as np -import onnx -import onnxruntime as ort -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger -from neural_compressor.common.base_tuning import Evaluator, TuningConfig -from neural_compressor.onnxrt import AWQConfig, CalibrationDataReader, GPTQConfig, RTNConfig, SmoohQuantConfig -from neural_compressor.onnxrt.quantization import autotune - -logger = Logger().get_logger() - - -def _create_evaluator_for_eval_fns(eval_fns: Optional[Union[Callable, Dict, List[Dict]]] = None) -> Evaluator: - evaluator = Evaluator() - evaluator.set_eval_fn_registry(eval_fns) - return evaluator - - -class DataReader(CalibrationDataReader): - def __init__(self, model): - model = onnx.load(model) - batch_size = 1 - sequence_length = 1 - self.data = { - "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), - "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), - } - for inp in model.graph.input: - if inp.name in self.data: - continue - if inp.name == "position_ids": - # model is exported with optimum >= 1.14.0 with new input 'position_ids' - self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") - - self.enum_data = None - - def get_next(self): - if self.enum_data is None: - self.enum_data = iter([self.data]) - return next(self.enum_data, None) - - def rewind(self): - self.enum_data = None - - -class TestONNXRT3xAutoTune(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = glob.glob(os.path.join("./gptj", "*.onnx"))[0] - self.data_reader = DataReader(self.gptj) - - @classmethod - def tearDownClass(self): - shutil.rmtree("./gptj", ignore_errors=True) - - @patch("logging.Logger.warning") - def test_auto_tune_warning(self, mock_warning): - acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) - - def eval_acc_fn(model) -> float: - session = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"]) - return next(acc_data) - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=0.5), SmoohQuantConfig(alpha=0.6)]) - with self.assertRaises(SystemExit): - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - 
calibration_data_reader=self.data_reader, - ) - call_args_list = mock_warning.call_args_list - # There may be multiple calls to warning, so we need to check all of them - self.assertIn( - "Please refine your eval_fn to accept model path (str) as input.", [info[0][0] for info in call_args_list] - ) - - def test_sq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.99, 1.0, 0.99, 0.99]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.9, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=0.5), SmoohQuantConfig(alpha=0.6)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNotNone(best_model) - - custom_tune_config = TuningConfig(config_set=[SmoohQuantConfig(alpha=[0.5, 0.6])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - - def test_rtn_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_group_size=32), RTNConfig(weight_group_size=64)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[RTNConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - def test_awq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig(config_set=[AWQConfig(weight_group_size=32), AWQConfig(weight_group_size=64)]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - 
eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[AWQConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - def test_gptq_auto_tune(self): - acc_data = iter([1.0, 0.8, 0.6, 1.0, 0.99, 0.9]) - - def eval_acc_fn(model) -> float: - return next(acc_data) - - perf_data = iter([1.0, 0.99, 0.99]) - - def eval_perf_fn(model) -> float: - return next(perf_data) - - eval_fns = [ - {"eval_fn": eval_acc_fn, "weight": 0.5, "name": "accuracy"}, - { - "eval_fn": eval_perf_fn, - "weight": 0.5, - }, - ] - evaluator = _create_evaluator_for_eval_fns(eval_fns) - - def eval_fn_wrapper(model): - result = evaluator.evaluate(model) - return result - - custom_tune_config = TuningConfig( - config_set=[GPTQConfig(weight_group_size=32), GPTQConfig(weight_group_size=64)] - ) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_acc_fn, - calibration_data_reader=self.data_reader, - ) - self.assertIsNone(best_model) - - custom_tune_config = TuningConfig(config_set=[GPTQConfig(weight_group_size=[32, 64])]) - best_model = autotune( - model_input=self.gptj, - tune_config=custom_tune_config, - eval_fn=eval_fn_wrapper, - calibration_data_reader=self.data_reader, - ) - self.assertEqual(len(evaluator.eval_fn_registry), 2) - self.assertIsNotNone(best_model) - op_names = [ - i.name - for i in best_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(4, 32)) - ] - self.assertTrue(len(op_names) > 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/test_config.py b/test/3x/onnxrt/test_config.py deleted file mode 100644 index 9b0c49de1b8..00000000000 --- a/test/3x/onnxrt/test_config.py +++ /dev/null @@ -1,251 +0,0 @@ -import copy -import os -import shutil -import unittest - -import numpy as np -import onnx -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger - -logger = Logger().get_logger() - - -def find_onnx_file(folder_path): - # return first .onnx file path in folder_path - for root, dirs, files in os.walk(folder_path): - for file in files: - if file.endswith(".onnx"): - return os.path.join(root, file) - return None - - -def build_simple_onnx_model(): - A = onnx.helper.make_tensor_value_info("A", onnx.TensorProto.FLOAT, [1, 5, 5]) - C = onnx.helper.make_tensor_value_info("C", onnx.TensorProto.FLOAT, [1, 5, 2]) - D = onnx.helper.make_tensor_value_info("D", onnx.TensorProto.FLOAT, [1, 5, 2]) - H = onnx.helper.make_tensor_value_info("H", onnx.TensorProto.FLOAT, [1, 5, 2]) - - e_value = np.random.randint(2, size=(10)).astype(np.float32) - B_init = onnx.helper.make_tensor("B", onnx.TensorProto.FLOAT, [5, 2], e_value.reshape(10).tolist()) - E_init = onnx.helper.make_tensor("E", onnx.TensorProto.FLOAT, [1, 5, 2], e_value.reshape(10).tolist()) - - matmul_node = onnx.helper.make_node("MatMul", ["A", "B"], ["C"], name="Matmul") - add = onnx.helper.make_node("Add", ["C", "E"], ["D"], name="add") - - f_value = np.random.randint(2, size=(10)).astype(np.float32) - F_init = 
onnx.helper.make_tensor("F", onnx.TensorProto.FLOAT, [1, 5, 2], e_value.reshape(10).tolist()) - add2 = onnx.helper.make_node("Add", ["D", "F"], ["H"], name="add2") - - graph = onnx.helper.make_graph([matmul_node, add, add2], "test_graph_1", [A], [H], [B_init, E_init, F_init]) - model = onnx.helper.make_model(graph) - model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 13)]}) - return model - - -class TestQuantizationConfig(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = find_onnx_file("./gptj") - - simple_onnx_model = build_simple_onnx_model() - onnx.save(simple_onnx_model, "simple_onnx_model.onnx") - self.simple_onnx_model = "simple_onnx_model.onnx" - - @classmethod - def tearDownClass(self): - shutil.rmtree("gptj", ignore_errors=True) - os.remove("simple_onnx_model.onnx") - - def setUp(self): - # print the test name - logger.info(f"Running TestQuantizationConfig test: {self.id()}") - - def _check_node_is_quantized(self, model, node_name): - for node in model.graph.node: - if (node.name == node_name or node.name == node_name + "_Q4") and node.op_type in [ - "MatMulNBits", - "MatMulFpQ4", - ]: - return True - return False - - def _count_woq_matmul(self, q_model, bits=4, group_size=32): - op_names = [ - i.name - for i in q_model.graph.node - if i.op_type.startswith("MatMul") and i.input[1].endswith("_Q{}G{}".format(bits, group_size)) - ] - return len(op_names) - - def test_config_white_lst(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - global_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_dtype="fp32", white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 29) - self.assertFalse(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst2(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.quantization.quantize import _quantize - - global_config = RTNConfig(weight_dtype="fp32") - # set operator instance - fc_out_config = RTNConfig(weight_bits=4, white_list=["/h.4/mlp/fc_out/MatMul"]) - # get model and quantize - fp32_model = self.gptj - qmodel = _quantize(fp32_model, quant_config=global_config + fc_out_config) - self.assertIsNotNone(qmodel) - self.assertEqual(self._count_woq_matmul(qmodel), 1) - self.assertTrue(self._check_node_is_quantized(qmodel, "/h.4/mlp/fc_out/MatMul")) - - def test_config_white_lst3(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.utils.utility import get_model_info - - global_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_bits=8, white_list=["/h.4/mlp/fc_out/MatMul"]) - quant_config = global_config + fc_out_config - # get model and quantize - fp32_model = self.gptj - model_info = get_model_info(fp32_model, white_op_type_list=["MatMul"]) - logger.info(quant_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4) - - def 
test_config_from_dict(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config = { - "rtn": { - "global": { - "weight_dtype": "int", - "weight_bits": 4, - "weight_group_size": 32, - }, - "local": { - "fc1": { - "weight_dtype": "int", - "weight_bits": 8, - } - }, - } - } - config = RTNConfig.from_dict(quant_config["rtn"]) - self.assertIsNotNone(config.local_config) - - def test_config_to_dict(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config = RTNConfig(weight_bits=4) - fc_out_config = RTNConfig(weight_bits=8) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config) - config_dict = quant_config.to_dict() - self.assertIn("global", config_dict) - self.assertIn("local", config_dict) - - def test_same_type_configs_addition(self): - from neural_compressor.onnxrt import RTNConfig - - quant_config1 = { - "rtn": { - "weight_dtype": "int", - "weight_bits": 4, - "weight_group_size": 32, - }, - } - q_config = RTNConfig.from_dict(quant_config1["rtn"]) - quant_config2 = { - "rtn": { - "global": { - "weight_bits": 8, - "weight_group_size": 32, - }, - "local": { - "/h.4/mlp/fc_out/MatMul": { - "weight_dtype": "int", - "weight_bits": 4, - } - }, - } - } - q_config2 = RTNConfig.from_dict(quant_config2["rtn"]) - q_config3 = q_config + q_config2 - q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["rtn"]["local"].items(): - for attr, val in op_config.items(): - self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["rtn"]["global"]["weight_bits"]) - - def test_config_mapping(self): - from neural_compressor.onnxrt import RTNConfig - from neural_compressor.onnxrt.utils.utility import get_model_info - - quant_config = RTNConfig(weight_bits=4) - # set operator instance - fc_out_config = RTNConfig(weight_bits=8) - quant_config.set_local("/h.4/mlp/fc_out/MatMul", fc_out_config) - # get model and quantize - fp32_model = self.gptj - model_info = get_model_info(fp32_model, white_op_type_list=["MatMul"]) - logger.info(quant_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 8) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_in/MatMul", "MatMul")].weight_bits == 4) - # test regular matching - fc_config = RTNConfig(weight_bits=3) - quant_config.set_local("/h.[1-4]/mlp/fc_out/MatMul", fc_config) - configs_mapping = quant_config.to_config_mapping(model_info=model_info) - logger.info(configs_mapping) - self.assertTrue(configs_mapping[("/h.4/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.3/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.2/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - self.assertTrue(configs_mapping[("/h.1/mlp/fc_out/MatMul", "MatMul")].weight_bits == 3) - - def test_diff_types_configs_addition(self): - from neural_compressor.onnxrt import GPTQConfig, RTNConfig - - quant_config1 = { - "rtn": { - "weight_bits": 4, - "weight_group_size": 32, - }, - } - q_config = RTNConfig.from_dict(quant_config1["rtn"]) - d_config = GPTQConfig(weight_group_size=128) - combined_config = q_config + d_config - combined_config_d = combined_config.to_dict() - logger.info(combined_config) - self.assertIn("rtn", combined_config_d) - self.assertIn("gptq", combined_config_d) - - -class TestQuantConfigForAutotune(unittest.TestCase): - def test_expand_config(self): - # 
test the expand functionalities, the user is not aware it - from neural_compressor.onnxrt import RTNConfig - - tune_config = RTNConfig(weight_bits=[4, 8]) - expand_config_list = RTNConfig.expand(tune_config) - self.assertEqual(expand_config_list[0].weight_bits, 4) - self.assertEqual(expand_config_list[1].weight_bits, 8) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/onnxrt/test_smooth_quant.py b/test/3x/onnxrt/test_smooth_quant.py deleted file mode 100644 index 6974020185b..00000000000 --- a/test/3x/onnxrt/test_smooth_quant.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import os -import shutil -import unittest - -import numpy as np -import onnx -from optimum.exporters.onnx import main_export - -from neural_compressor.common import Logger -from neural_compressor.onnxrt import CalibrationDataReader, QuantType, SmoohQuantConfig, get_default_sq_config -from neural_compressor.onnxrt.quantization.quantize import _quantize - -logger = Logger().get_logger() - - -class DataReader(CalibrationDataReader): - def __init__(self, model): - model = onnx.load(model) - batch_size = 1 - sequence_length = 1 - self.data = { - "input_ids": np.random.randint(10, size=(batch_size, sequence_length)).astype("int64"), - "attention_mask": np.zeros((batch_size, sequence_length)).astype("int64"), - } - for inp in model.graph.input: - if inp.name in self.data: - continue - if inp.name == "position_ids": - # model is exported with optimum >= 1.14.0 with new input 'position_ids' - self.data[inp.name] = np.random.randint(10, size=(batch_size, sequence_length)).astype("int64") - - self.enum_data = None - - def get_next(self): - if self.enum_data is None: - self.enum_data = iter([self.data]) - return next(self.enum_data, None) - - def rewind(self): - self.enum_data = None - - -class TestONNXRT3xSmoothQuant(unittest.TestCase): - @classmethod - def setUpClass(self): - main_export( - "hf-internal-testing/tiny-random-gptj", - output="gptj", - ) - self.gptj = glob.glob(os.path.join("./gptj", "*.onnx"))[0] - self.data_reader = DataReader(self.gptj) - - @classmethod - def tearDownClass(self): - shutil.rmtree("./gptj", ignore_errors=True) - - def test_sq_from_class_beginner(self): - self.data_reader.rewind() - config = get_default_sq_config() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 30) - - def test_sq_auto_tune_from_class_beginner(self): - self.data_reader.rewind() - config = SmoohQuantConfig(alpha="auto", scales_per_op=False) - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 15) - - def test_sq_from_dict_beginner(self): - config = { - "smooth_quant": { - "global": { - "alpha": 0.5, - 
"scales_per_op": False, - }, - } - } - self.data_reader.rewind() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 15) - - def test_sq_auto_tune_from_dict_beginner(self): - config = { - "smooth_quant": { - "global": { - "alpha": "auto", - }, - } - } - self.data_reader.rewind() - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertEqual(num_muls, 30) - - def test_sq_ort_param_class_beginner(self): - self.data_reader.rewind() - config = SmoohQuantConfig(weight_type=QuantType.QUInt8, activation_type=QuantType.QUInt8) - model = _quantize(self.gptj, config, self.data_reader) - num_muls = len([i for i in model.graph.node if i.name.endswith("_smooth_mul") and i.op_type == "Mul"]) - self.assertTrue(2 in [i.data_type for i in model.graph.initializer]) - self.assertTrue(3 not in [i.data_type for i in model.graph.initializer]) - self.assertEqual(num_muls, 30) - - -if __name__ == "__main__": - unittest.main()