Update for API 3.0 online doc #1940

Merged: 6 commits, merged on Jul 23, 2024
Changes from 1 commit
…nto jianyu_3.0_onlinedoc
arthw committed Jul 19, 2024

Verified: This commit was created on GitHub.com and signed with GitHub's verified signature. The key has expired.
commit 64b7e4bc21d18e8d0c4c551350bdd5d5c85eb5cd
4 changes: 0 additions & 4 deletions .azure-pipelines/scripts/install_nc.sh
@@ -10,10 +10,6 @@ elif [[ $1 = *"3x_tf"* ]]; then
python -m pip install --no-cache-dir -r requirements_tf.txt
python setup.py tf bdist_wheel
pip install dist/neural_compressor*.whl --force-reinstall
elif [[ $1 = *"3x_ort" ]]; then
python -m pip install --no-cache-dir -r requirements_ort.txt
python setup.py ort bdist_wheel
pip install dist/neural_compressor*.whl --force-reinstall
else
python -m pip install --no-cache-dir -r requirements.txt
python setup.py bdist_wheel
15 changes: 0 additions & 15 deletions .azure-pipelines/scripts/ut/3x/coverage.3x_ort

This file was deleted.

35 changes: 0 additions & 35 deletions .azure-pipelines/scripts/ut/3x/run_3x_ort.sh

This file was deleted.

109 changes: 0 additions & 109 deletions .azure-pipelines/ut-3x-ort.yml

This file was deleted.

13 changes: 0 additions & 13 deletions .github/checkgroup.yml
@@ -140,16 +140,3 @@ subprojects:
- "UT-3x-Torch (Coverage Compare CollectDatafiles)"
- "UT-3x-Torch (Unit Test 3x Torch Unit Test 3x Torch)"
- "UT-3x-Torch (Unit Test 3x Torch baseline Unit Test 3x Torch baseline)"

- id: "Unit Tests 3x-ONNXRT workflow"
paths:
- "neural_compressor/common/**"
- "neural_compressor/onnxrt/**"
- "test/3x/onnxrt/**"
- "setup.py"
- "requirements_ort.txt"
checks:
- "UT-3x-ONNXRT"
- "UT-3x-ONNXRT (Coverage Compare CollectDatafiles)"
- "UT-3x-ONNXRT (Unit Test 3x ONNXRT Unit Test 3x ONNXRT)"
- "UT-3x-ONNXRT (Unit Test 3x ONNXRT baseline Unit Test 3x ONNXRT baseline)"
11 changes: 8 additions & 3 deletions README.md
@@ -19,20 +19,25 @@ Intel® Neural Compressor aims to provide popular model compression techniques s
as well as Intel extensions such as [Intel Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow) and [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch).
In particular, the tool provides the key features, typical examples, and open collaborations as below:

* Support a wide range of Intel hardware such as [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing
* Support a wide range of Intel hardware such as [Intel Gaudi AI Accelerators](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), [Intel Core Ultra Processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing; support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing; support NVidia GPU for some WOQ algorithms like AutoRound and HQQ.

* Validate popular LLMs such as [LLama2](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies

* Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst)

## What's New
* [2024/03] A new SOTA approach [AutoRound](https://github.com/intel/auto-round) Weight-Only Quantization on [Intel Gaudi2 AI accelerator](https://habana.ai/products/gaudi2/) is available for LLMs.
* [2024/07] Starting from the 3.0 release, the framework extension API is recommended for quantization.
* [2024/07] Performance optimizations and usability improvements on [client-side](https://github.com/intel/neural-compressor/blob/master/docs/3x/client_quant.md).

## Installation

### Install from pypi
```Shell
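# Install 2.X API + Framework extension API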
pip install neural-compressor
# Install 2.X API + Framework extension API + PyTorch dependency
pip install neural-compressor[pt]
# Install 2.X API + Framework extension API + TensorFlow dependency
pip install neural-compressor[tf]
```
> **Note**:
> Further installation methods can be found in the [Installation Guide](https://github.com/intel/neural-compressor/blob/master/docs/source/installation_guide.md). Check out our [FAQ](https://github.com/intel/neural-compressor/blob/master/docs/source/faq.md) for more details.
50 changes: 50 additions & 0 deletions docs/3x/client_quant.md
@@ -0,0 +1,50 @@
Quantization on Client
==========================================

1. [Introduction](#introduction)
2. [Get Started](#get-started) \
2.1 [Get Default Algorithm Configuration](#get-default-algorithm-configuration)\
2.2 [Optimal Performance and Peak Memory Usage](#optimal-performance-and-peak-memory-usage)


## Introduction

For the `RTN`, `GPTQ`, and `Auto-Round` algorithms, we provide default algorithm configurations for different processor types (`client` and `server`). In general, the lightweight configurations are tailored to client devices to enhance performance and efficiency.


## Get Started

### Get Default Algorithm Configuration

Here, we take the `RTN` algorithm as an example to demonstrate the usage on a client machine.

```python
from neural_compressor.torch.quantization import get_default_rtn_config, convert, prepare
from neural_compressor.torch import load_empty_model

model_state_dict_path = "/path/to/model/state/dict"
float_model = load_empty_model(model_state_dict_path)
quant_config = get_default_rtn_config()
prepared_model = prepare(float_model, quant_config)
quantized_model = convert(prepared_model)
```

> [!TIP]
> By default, the appropriate configuration is determined based on hardware information, but users can explicitly specify `processor_type` as either `client` or `server` when calling `get_default_rtn_config`.
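
As a hedged illustration of that option, the snippet below reuses the `RTN` flow above but requests the client configuration explicitly (the model path is a placeholder):

```python
from neural_compressor.torch.quantization import get_default_rtn_config, convert, prepare
from neural_compressor.torch import load_empty_model

# Placeholder path; replace with your model's state dict location.
float_model = load_empty_model("/path/to/model/state/dict")
# Explicitly request the lightweight client configuration instead of relying on auto-detection.
quant_config = get_default_rtn_config(processor_type="client")
prepared_model = prepare(float_model, quant_config)
quantized_model = convert(prepared_model)
```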

On Windows machines, run the quantization script (e.g., `main.py`) directly; all available cores are utilized automatically:

```bash
python main.py
```

> [!TIP]
> For Linux systems, configure the environment variables appropriately to achieve optimal performance; for example, set `OMP_NUM_THREADS` explicitly. For processors with a hybrid architecture (both P-cores and E-cores), it is recommended to bind tasks to all P-cores using `taskset`.

### Optimal Performance and Peak Memory Usage

Below are approximate performance and memory usage figures measured on a client machine with 24 cores and 32GB of RAM. These figures provide a rough estimate for quick reference and may vary based on the specific hardware and configuration.

- 7B models (e.g., [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)): the quantization process takes about 65 seconds, with a peak memory usage of around 6GB.
- 1.5B models (e.g., [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)): the quantization process takes about 20 seconds, with a peak memory usage of around 5GB.
6 changes: 6 additions & 0 deletions docs/source/3x/PT_WeightOnlyQuant.md
@@ -15,6 +15,7 @@ PyTorch Weight Only Quantization
- [HQQ](#hqq)
- [Specify Quantization Rules](#specify-quantization-rules)
- [Saving and Loading](#saving-and-loading)
- [Efficient Usage on Client-Side](#efficient-usage-on-client-side)
- [Examples](#examples)

## Introduction
@@ -276,6 +277,11 @@ loaded_model = load(
) # Please note that the original_model parameter passes the original model.
```

## Efficient Usage on Client-Side

For client machines with limited RAM and cores, we offer optimizations to reduce computational overhead and minimize memory usage. For detailed information, please refer to [Quantization on Client](https://github.com/intel/neural-compressor/blob/master/docs/3x/client_quant.md).


## Examples

Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant.
9 changes: 8 additions & 1 deletion docs/source/3x/TensorFlow.md
@@ -23,14 +23,15 @@ Intel(R) Neural Compressor provides `quantize_model` and `autotune` as main inte

**quantize_model**

The design philosophy of the `quantize_model` interface is easy-of-use. With minimal parameters requirement, including `model`, `quant_config`, `calib_dataloader` and `calib_iteration`, it offers a straightforward choice of quantizing TF model in one-shot.
The design philosophy of the `quantize_model` interface is ease of use. With minimal parameter requirements, including `model`, `quant_config`, `calib_dataloader`, `calib_iteration`, and `calib_func`, it offers a straightforward way to quantize a TF model in one shot.

```python
def quantize_model(
model: Union[str, tf.keras.Model, BaseModel],
quant_config: Union[BaseConfig, list],
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
```
`model` should be a string of the model's location, the object of Keras model or INC TF model wrapper class.
@@ -41,6 +42,9 @@ def quantize_model(

`calib_iteration` is used to decide how many iterations the calibration process will be run.

`calib_func` is a substitute for `calib_dataloader` when the built-in calibration function of INC does not work for model inference.
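
As a minimal sketch, `calib_func` can be passed in place of a dataloader; the model path and the body of the calibration function below are placeholders:

```python
from neural_compressor.tensorflow import StaticQuantConfig, quantize_model


def my_calib_func(model):
    # Assumption: INC calls this function with the model to be calibrated. Run your own
    # inference loop here over a small, representative dataset so that activation ranges
    # can be observed; the exact inference call depends on the model being quantized.
    ...


quant_config = StaticQuantConfig()
# "/path/to/fp32_model" is a placeholder for the FP32 model location.
q_model = quantize_model("/path/to/fp32_model", quant_config, calib_func=my_calib_func)
```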


Here is a simple example of using `quantize_model` interface with a dummy calibration dataloader and the default `StaticQuantConfig`:
```python
from neural_compressor.tensorflow import StaticQuantConfig, quantize_model
@@ -68,6 +72,7 @@ def autotune(
eval_args: Optional[Tuple[Any]] = None,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
) -> Optional[BaseModel]:
```
`model` should be a string of the model's location, the object of Keras model or INC TF model wrapper class.
@@ -82,6 +87,8 @@ def autotune(

`calib_iteration` is used to decide how many iterations the calibration process will be run.

`calib_func` is a substitute for `calib_dataloader` when the built-in calibration function of INC does not work for model inference.

Here is a simple example of using `autotune` interface with different quantization rules defined by a list of `StaticQuantConfig`:
```python
from neural_compressor.common.base_tuning import TuningConfig
57 changes: 31 additions & 26 deletions docs/source/installation_guide.md
@@ -29,28 +29,28 @@ The following prerequisites and requirements must be satisfied for a successful
### Install from Binary
- Install from Pypi
```Shell
# install stable basic version from pypi
pip install neural-compressor
```
```Shell
# [Experimental] install stable basic + PyTorch framework extension API from pypi
pip install neural-compressor[pt]
```
```Shell
# [Experimental] install stable basic + TensorFlow framework extension API from pypi
pip install neural-compressor[tf]
```

- Install from test Pypi
```Shell
# install nightly version
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor
pip install -r requirements.txt
# install nightly basic version from pypi
pip install -i https://test.pypi.org/simple/ neural-compressor
```
```Shell
# Install 2.X API + Framework extension API + PyTorch dependency
pip install neural-compressor[pt]
```
```Shell
# Install 2.X API + Framework extension API + TensorFlow dependency
pip install neural-compressor[tf]
```
```Shell
# Install 2.X API + Framework extension API
# With this install CMD, some dependencies for the framework extension API are not installed;
# you can install them separately with `pip install -r requirements_pt.txt` or `pip install -r requirements_tf.txt`.
pip install neural-compressor
```
```Shell
# Framework extension API + PyTorch dependency
pip install neural-compressor-pt
```
```Shell
# Framework extension API + TensorFlow dependency
pip install neural-compressor-tf
```

### Install from Source

@@ -76,15 +76,20 @@ The AI Kit is distributed through many common channels, including from Intel's w
## System Requirements

### Validated Hardware Environment

#### Intel® Neural Compressor supports HPUs based on heterogeneous architecture with two compute engines (MME and TPC):
* Intel Gaudi AI Accelerators (Gaudi2)

#### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):

* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids)
* Intel Xeon CPU Max Series (formerly Sapphire Rapids HBM)
* Intel Xeon Scalable processor (Skylake, Cascade Lake, Cooper Lake, Ice Lake, and Sapphire Rapids)
* Intel Xeon CPU Max Series (Sapphire Rapids HBM)
* Intel Core Ultra Processors (Meteor Lake)

#### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture:

* Intel Data Center GPU Flex Series (formerly Arctic Sound-M)
* Intel Data Center GPU Max Series (formerly Ponte Vecchio)
* Intel Data Center GPU Flex Series (Arctic Sound-M)
* Intel Data Center GPU Max Series (Ponte Vecchio)

#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime:

56 changes: 0 additions & 56 deletions neural_compressor/onnxrt/__init__.py

This file was deleted.

22 changes: 0 additions & 22 deletions neural_compressor/onnxrt/algorithms/__init__.py

This file was deleted.

17 changes: 0 additions & 17 deletions neural_compressor/onnxrt/algorithms/layer_wise/__init__.py

This file was deleted.

289 changes: 0 additions & 289 deletions neural_compressor/onnxrt/algorithms/layer_wise/core.py

This file was deleted.

17 changes: 0 additions & 17 deletions neural_compressor/onnxrt/algorithms/smoother/__init__.py

This file was deleted.

237 changes: 0 additions & 237 deletions neural_compressor/onnxrt/algorithms/smoother/calibrator.py

This file was deleted.

668 changes: 0 additions & 668 deletions neural_compressor/onnxrt/algorithms/smoother/core.py

This file was deleted.

13 changes: 0 additions & 13 deletions neural_compressor/onnxrt/algorithms/weight_only/__init__.py

This file was deleted.

437 changes: 0 additions & 437 deletions neural_compressor/onnxrt/algorithms/weight_only/awq.py

This file was deleted.

451 changes: 0 additions & 451 deletions neural_compressor/onnxrt/algorithms/weight_only/gptq.py

This file was deleted.

222 changes: 0 additions & 222 deletions neural_compressor/onnxrt/algorithms/weight_only/rtn.py

This file was deleted.

335 changes: 0 additions & 335 deletions neural_compressor/onnxrt/algorithms/weight_only/utility.py

This file was deleted.

50 changes: 0 additions & 50 deletions neural_compressor/onnxrt/quantization/__init__.py

This file was deleted.

152 changes: 0 additions & 152 deletions neural_compressor/onnxrt/quantization/algorithm_entry.py

This file was deleted.

116 changes: 0 additions & 116 deletions neural_compressor/onnxrt/quantization/autotune.py

This file was deleted.

35 changes: 0 additions & 35 deletions neural_compressor/onnxrt/quantization/calibrate.py

This file was deleted.

614 changes: 0 additions & 614 deletions neural_compressor/onnxrt/quantization/config.py

This file was deleted.

67 changes: 0 additions & 67 deletions neural_compressor/onnxrt/quantization/quantize.py

This file was deleted.

24 changes: 0 additions & 24 deletions neural_compressor/onnxrt/utils/__init__.py

This file was deleted.

1,082 changes: 0 additions & 1,082 deletions neural_compressor/onnxrt/utils/onnx_model.py

This file was deleted.

288 changes: 0 additions & 288 deletions neural_compressor/onnxrt/utils/utility.py

This file was deleted.

12 changes: 8 additions & 4 deletions neural_compressor/tensorflow/algorithms/smoother/core.py
@@ -37,19 +37,23 @@ class SmoothQuant:
def __init__(
self,
config: SmoothQuantConfig,
calib_dataloader: Callable,
calib_dataloader: Callable = None,
calib_iteration: int = 1,
calib_func: Callable = None,
):
"""Convert the model by smooth quant.
Args:
config: the SmoothQuantConfig class used to set this class
calibdataloader: the calibration dataloader
calib_iteration: how many steps of iterations on the dataloader to move forward
config: the SmoothQuantConfig class used to set this class.
calib_dataloader: the calibration dataloader.
calib_iteration: how many steps of iterations on the dataloader to move forward.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
model: A smoothed Tensorflow model
"""
assert calib_func is None, "calibration function is not supported for smooth quant."
self.config = config
self.calib_dataloader = calib_dataloader
self.calib_iteration = calib_iteration
12 changes: 6 additions & 6 deletions neural_compressor/tensorflow/algorithms/static_quant/keras.py
@@ -314,16 +314,18 @@ def fuse_conv_bn(conv_weight, bn_weight, conv_type="Conv2D", eps=1.0e-5):
return bn_fused_model

@dump_elapsed_time("Pass quantize model")
def quantize(self, quant_config, model, dataloader, iteration, q_func=None):
def quantize(self, quant_config, model, dataloader, iteration, calib_func=None):
"""Execute the quantize process on the specified model.
Args:
tune_cfg(dict): The user defined 'StaticQuantConfig' class.
quant_config(dict): The user defined 'StaticQuantConfig' class.
model (object): The model to do quantization.
dataloader(object): The calibration dataloader used to load quantization dataset.
iteration(int): The iteration of calibration.
q_func (optional): training function for quantization aware training mode.
calib_func (optional): the function used for calibration, should be a substitution for calibration
dataloader when the built-in calibration function of INC does not work for model inference.
"""
assert calib_func is None, "The calibration function is not supported on Keras backend yet"
self.query_fw_capability(model)
converter = KerasConfigConverter(quant_config, iteration)
tune_cfg = converter.parse_to_tune_cfg()
@@ -367,15 +369,13 @@ def quantize(self, quant_config, model, dataloader, iteration, q_func=None):

return quantized_model

def _calibrate(self, model, dataloader, calib_interation):
def _calibrate(self, model, dataloader=None, calib_interation=None):
"""Apply calibration.
Args:
model (tf.keras.Model): The model inserted with FakeQuant layers for calibration.
dataloader(object): The calibration dataloader used to load quantization dataset.
iteration(int): The iteration of calibration.
fq_output_layers (dict): A dict mapping from names of FakeQuant layers to
names of their output layers.
"""
# run eagerly to fetch the numpy min/max
results = {}
36 changes: 18 additions & 18 deletions neural_compressor/tensorflow/algorithms/static_quant/tensorflow.py
@@ -172,7 +172,7 @@ def quantize(
model: BaseModel,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
q_func=None,
calib_func: Callable = None,
):
"""Execute the quantize process on the specified model.
@@ -181,11 +181,11 @@ def quantize(
model: the fp32 model to be quantized.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
q_func: training function for quantization aware training mode,
which not enabled for tensorflow yet.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
tf.compat.v1.GraphDef: the quantized model
converted_model: the quantized INC model wrapper.
"""
assert (
self.approach != "post_training_dynamic_quant"
@@ -195,7 +195,7 @@ def quantize(
self.approach != "quant_aware_training"
), "Quantize Aware Training is not supported on TensorFlow framework now!"

self.calib_sampling_size = calib_dataloader.batch_size * calib_iteration
self.calib_sampling_size = calib_dataloader.batch_size * calib_iteration if calib_dataloader else 100
tune_cfg = self.parse_quant_config(quant_config, model, calib_iteration)
self._tuning_cfg_to_fw(tune_cfg)
self.bf16_ops.extend(self.smooth_quant_mul_ops)
@@ -228,7 +228,7 @@ def quantize(
fp32_ops=self.fp32_ops,
bf16_ops=self.bf16_ops,
data_loader=calib_dataloader,
calib_func=q_func,
calib_func=calib_func,
qdq_enabled=self.qdq_enabled,
new_api=self.new_api,
performance_only=self.performance_only,
@@ -251,7 +251,7 @@ def quantize(
fp32_ops=self.fp32_ops,
bf16_ops=self.bf16_ops,
data_loader=calib_dataloader,
calib_func=q_func,
calib_func=calib_func,
qdq_enabled=self.qdq_enabled,
new_api=self.new_api,
performance_only=self.performance_only,
@@ -275,7 +275,7 @@ def quantize(
fp32_ops=self.fp32_ops,
bf16_ops=self.bf16_ops,
data_loader=calib_dataloader,
calib_func=q_func,
calib_func=calib_func,
qdq_enabled=self.qdq_enabled,
new_api=self.new_api,
performance_only=self.performance_only,
@@ -750,21 +750,21 @@ def quantize(
model: BaseModel,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
q_func=None,
calib_func: Callable = None,
):
"""Execute the quantize process on the specified model.
Args:
tune_cfg (dict): quantization configuration
model (tf.compat.v1.GraphDef): fp32 model
data_loader (generator): generator the data and labels
q_func (optional): training function for quantization aware training mode,
which not enabled for tensorflow yet.
quant_config: a quantization configuration.
model: the fp32 model to be quantized.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
tf.compat.v1.GraphDef: the quantized model
converted_model: the quantized INC model wrapper.
"""
assert q_func is None, "quantization aware training mode is not support on tensorflow"
self.calib_sampling_size = calib_dataloader.batch_size * calib_iteration
tune_cfg = self.parse_quant_config(quant_config, model, calib_iteration)
self._tuning_cfg_to_fw(tune_cfg)
@@ -798,7 +798,7 @@ def quantize(
fp32_ops=self.fp32_ops,
bf16_ops=self.bf16_ops,
data_loader=calib_dataloader,
calib_func=q_func,
calib_func=calib_func,
itex_mode=self.itex_mode,
qdq_enabled=self.qdq_enabled,
new_api=self.new_api,
@@ -846,7 +846,7 @@ def quantize(
fp32_ops=self.fp32_ops,
bf16_ops=self.bf16_ops,
data_loader=calib_dataloader,
calib_func=q_func,
calib_func=calib_func,
itex_mode=self.itex_mode,
qdq_enabled=self.qdq_enabled,
new_api=self.new_api,
21 changes: 19 additions & 2 deletions neural_compressor/tensorflow/quantization/algorithm_entry.py
@@ -28,6 +28,7 @@ def static_quant_entry(
quant_config: BaseConfig,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
"""The main entry to apply static quantization.
@@ -36,6 +37,8 @@ def static_quant_entry(
quant_config: a quantization configuration.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
q_model: the quantized model.
@@ -49,7 +52,7 @@ def static_quant_entry(
framework = TensorFlowAdaptor

quantizer = framework(TFConfig.global_config)
q_model = quantizer.quantize(quant_config, model, calib_dataloader, calib_iteration)
q_model = quantizer.quantize(quant_config, model, calib_dataloader, calib_iteration, calib_func)
TFConfig.reset_global_config()

return q_model
@@ -61,12 +64,26 @@ def smooth_quant_entry(
smooth_quant_config: SmoothQuantConfig,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
"""The main entry to apply smooth quantization.
Args:
model: a fp32 model to be quantized.
quant_config: a quantization configuration.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
q_model: the quantized model.
"""
assert not isinstance(model, KerasModel), "INC doesn't support smooth quantization for Keras models now."

from neural_compressor.tensorflow.algorithms import SmoothQuant

converter = SmoothQuant(smooth_quant_config, calib_dataloader, calib_iteration)
converter = SmoothQuant(smooth_quant_config, calib_dataloader, calib_iteration, calib_func)
sq_model = converter(model)

return sq_model
7 changes: 5 additions & 2 deletions neural_compressor/tensorflow/quantization/autotune.py
@@ -46,6 +46,7 @@ def autotune(
eval_args: Optional[Tuple[Any]] = None,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
) -> Optional[BaseModel]:
"""The main entry of auto-tune."""
model = Model(model)
@@ -59,7 +60,7 @@ def autotune(
tuning_logger.trial_start(trial_index=trial_index)
tuning_logger.execution_start()
logger.info(quant_config.to_dict())
q_model = quantize_model(model, quant_config, calib_dataloader, calib_iteration)
q_model = quantize_model(model, quant_config, calib_dataloader, calib_iteration, calib_func)
tuning_logger.execution_end()
tuning_logger.evaluation_start()
eval_result: float = eval_func_wrapper.evaluate(q_model)
@@ -73,7 +74,9 @@ def autotune(
logger.info("Re-quantizing with best quantization config...")
del q_model
best_quant_config: BaseConfig = best_trial_record.quant_config
best_quant_model = quantize_model(model, best_quant_config, calib_dataloader, calib_iteration)
best_quant_model = quantize_model(
model, best_quant_config, calib_dataloader, calib_iteration, calib_func
)
else:
best_quant_model = q_model
break
14 changes: 11 additions & 3 deletions neural_compressor/tensorflow/quantization/quantize.py
@@ -34,6 +34,7 @@ def quantize_model(
quant_config: Union[BaseConfig, list],
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
"""The main entry to quantize model.
@@ -42,16 +43,20 @@ def quantize_model(
quant_config: single or lists of quantization configuration.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
q_model: the quantized model.
"""
q_model = Model(model)
if isinstance(quant_config, list):
for config in quant_config:
q_model = quantize_model_with_single_config(q_model, config, calib_dataloader, calib_iteration)
q_model = quantize_model_with_single_config(q_model, config, calib_dataloader, calib_iteration, calib_func)
else:
q_model = quantize_model_with_single_config(q_model, quant_config, calib_dataloader, calib_iteration)
q_model = quantize_model_with_single_config(
q_model, quant_config, calib_dataloader, calib_iteration, calib_func
)

return q_model

@@ -61,6 +66,7 @@ def quantize_model_with_single_config(
quant_config: BaseConfig,
calib_dataloader: Callable = None,
calib_iteration: int = 100,
calib_func: Callable = None,
):
"""Quantize model using single config.
@@ -69,6 +75,8 @@ def quantize_model_with_single_config(
quant_config: a quantization configuration.
calib_dataloader: a data loader for calibration.
calib_iteration: the iteration of calibration.
calib_func: the function used for calibration, should be a substitution for calib_dataloader
when the built-in calibration function of INC does not work for model inference.
Returns:
q_model: the quantized model.
@@ -91,5 +99,5 @@ def quantize_model_with_single_config(
for algo_name, algo_func in algos_mapping.items():
if need_apply(configs_mapping, algo_name):
logger.info(f"Start to apply {algo_name} on the model.")
q_model = algo_func(q_model, configs_mapping, calib_dataloader, calib_iteration)
q_model = algo_func(q_model, configs_mapping, calib_dataloader, calib_iteration, calib_func)
return q_model
@@ -231,6 +231,10 @@ def _inference(self, model):
Args:
model(TensorflowBaseModel): input TensorflowBaseModel
"""
if self.calib_func:
self.calib_func(model)
return

if model.model_type == "llm_saved_model":
self._inference_llm(model)
return
4 changes: 2 additions & 2 deletions neural_compressor/torch/algorithms/layer_wise/utils.py
@@ -90,7 +90,7 @@ def get_named_children(model, pre=[]):
return module_list


def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):
def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): # pragma: no cover
"""Download hugging face model from hf hub."""
from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE
from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name
@@ -122,7 +122,7 @@ def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):
return file_path


def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, **kwargs):
def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, **kwargs): # pragma: no cover
"""Load a empty model."""
is_local = os.path.isdir(pretrained_model_name_or_path)
if is_local: # pragma: no cover
13 changes: 10 additions & 3 deletions neural_compressor/torch/algorithms/static_quant/save_load.py
@@ -32,9 +32,16 @@ def save(model, output_dir="./saved_results"):

qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
model.ori_save(qmodel_file_path)
with open(qconfig_file_path, "w") as f:
json.dump(model.tune_cfg, f, indent=4)
# Use an explicit None check: truth-testing a multi-element Parameter tensor raises an error.
first_param = next(model.parameters(), None)
device = first_param.device.type if first_param is not None else "cpu"
if device == "cpu":
model.ori_save(qmodel_file_path)
with open(qconfig_file_path, "w") as f:
json.dump(model.tune_cfg, f, indent=4)
else: # pragma: no cover
from neural_compressor.common.utils import save_config_mapping

torch.jit.save(model, qmodel_file_path)
save_config_mapping(model.qconfig, qconfig_file_path)

logger.info("Save quantized model to {}.".format(qmodel_file_path))
logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))
108 changes: 69 additions & 39 deletions neural_compressor/torch/algorithms/static_quant/static_quant.py
@@ -33,11 +33,13 @@

from neural_compressor.torch.algorithms import Quantizer
from neural_compressor.torch.utils import logger
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

from .utility import (
CpuInfo,
cfg_to_qconfig,
dump_model_op_stats,
generate_xpu_qconfig,
get_ipex_version,
get_quantizable_ops_recursively,
ipex_config_path,
@@ -56,6 +58,7 @@ def __init__(self, quant_config: OrderedDict = {}):
"""
super().__init__(quant_config)
self.user_cfg = OrderedDict()
self.device = auto_detect_accelerator().current_device()

def prepare(self, model, example_inputs, inplace=True, *args, **kwargs):
"""Prepares a given model for quantization.
@@ -70,43 +73,61 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs):
"""
assert example_inputs is not None, "Please provide example_inputs for static quantization."

_, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, _ = get_quantizable_ops_recursively(
model, example_inputs
)
# update json file in ipex_config_path; map ipex op_name to pt op_name
self.user_cfg = cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name)
model.eval()
if self.device == "cpu":
_, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, _ = get_quantizable_ops_recursively(
model, example_inputs
)
# update json file in ipex_config_path; map ipex op_name to pt op_name
self.user_cfg = cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name)
else: # pragma: no cover
model = model.to("xpu")

use_bf16 = self.quant_config.get("use_bf16", None)
model.eval()

# Check save_qconf_summary part is a workaround for IPEX bug.
# Sometimes the prepared model from get_op_capablitiy loss this attribute
if not hasattr(model, "save_qconf_summary") or not hasattr(model, "load_qconf_summary"):
from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig

if ipex_ver.release >= Version("2.1").release:
# HistogramObserver will cause a performance issue.
# static_qconfig = ipex.quantization.default_static_qconfig_mapping
qconfig = QConfig(
activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric),
)
from torch.ao.quantization import QConfigMapping

static_qconfig = QConfigMapping().set_global(qconfig)
else:
static_qconfig = QConfig(
activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric),
)
if isinstance(example_inputs, dict):
model = ipex.quantization.prepare(
model, static_qconfig, example_kwarg_inputs=example_inputs, inplace=inplace
)
# Sometimes the prepared model from get_op_capablitiy loses these attributes
if not hasattr(model, "save_qconf_summary") or not hasattr(model, "load_qconf_summary"): # pragma: no cover
from torch.ao.quantization import HistogramObserver, MinMaxObserver, PerChannelMinMaxObserver, QConfig

if self.device != "cpu": # pragma: no cover
from torch.quantization.quantize_jit import prepare_jit

with torch.no_grad():
modelJit = torch.jit.trace(model, example_inputs)
qconfig = generate_xpu_qconfig(self.quant_config)
model = prepare_jit(modelJit, qconfig, inplace)
else:
model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace)
if ipex_ver.release >= Version("2.1").release:
# HistogramObserver will cause a performance issue.
# static_qconfig = ipex.quantization.default_static_qconfig_mapping
qconfig = QConfig(
activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
weight=PerChannelMinMaxObserver.with_args(
dtype=torch.qint8, qscheme=torch.per_channel_symmetric
),
)
from torch.ao.quantization import QConfigMapping

static_qconfig = QConfigMapping().set_global(qconfig)
else: # pragma: no cover
static_qconfig = QConfig(
activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
weight=PerChannelMinMaxObserver.with_args(
dtype=torch.qint8, qscheme=torch.per_channel_symmetric
),
)
if isinstance(example_inputs, dict):
model = ipex.quantization.prepare(
model, static_qconfig, example_kwarg_inputs=example_inputs, inplace=inplace
)
else:
model = ipex.quantization.prepare(
model, static_qconfig, example_inputs=example_inputs, inplace=inplace
)

if self.device == "cpu":
model.load_qconf_summary(qconf_summary=ipex_config_path)

model.load_qconf_summary(qconf_summary=ipex_config_path)
return model

def convert(self, model, example_inputs, inplace=True, *args, **kwargs):
@@ -124,18 +145,27 @@ def convert(self, model, example_inputs, inplace=True, *args, **kwargs):

from neural_compressor.torch.algorithms.static_quant import save

model.save_qconf_summary(qconf_summary=ipex_config_path)
model = _ipex_post_quant_process(model, example_inputs, use_bf16, inplace=inplace)
if self.device != "cpu": # pragma: no cover
from torch.quantization.quantize_jit import convert_jit

with open(ipex_config_path, "r") as f:
model.tune_cfg = json.load(f)
model.ipex_config_path = ipex_config_path
model = convert_jit(model, inplace)
simple_inference(model, example_inputs, iterations=2)
model.qconfig = self.quant_config["op"]
dump_model_op_stats(model.qconfig)
else:
model.save_qconf_summary(qconf_summary=ipex_config_path)
model = _ipex_post_quant_process(model, example_inputs, use_bf16, inplace=inplace)

dump_model_op_stats(self.user_cfg)
with open(ipex_config_path, "r") as f:
model.tune_cfg = json.load(f)
model.ipex_config_path = ipex_config_path

dump_model_op_stats(self.user_cfg)

logger.info("Static quantization done.")
model.ori_save = model.save
model.save = MethodType(save, model)

logger.info("Static quantization done.")
return model


41 changes: 41 additions & 0 deletions neural_compressor/torch/algorithms/static_quant/utility.py
@@ -163,6 +163,47 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_
return cfgs, ori_user_cfg


def generate_xpu_qconfig(tune_cfg): # pragma: no cover
# qconfig observer & config constants for ipex-xpu
from torch.ao.quantization import HistogramObserver, MinMaxObserver, QConfig

act_observer_minmax_asym = MinMaxObserver.with_args(quant_min=0, quant_max=127)
act_observer_minmax_sym = MinMaxObserver.with_args(
dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, quant_min=-128, quant_max=127
)
act_observer_kl_asym = HistogramObserver.with_args(quant_min=0, quant_max=127)
act_observer_kl_sym = HistogramObserver.with_args(
dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, quant_min=-128, quant_max=127
)
# no tuning for granularity due to tuning space
weight_observer_minmax_sym = MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric)

qconfig = {}
user_cfg = copy.deepcopy(tune_cfg["op"])
for _, cfg in user_cfg.items():
act_algo = cfg["activation"]["algorithm"]
act_sym = cfg["activation"]["scheme"]
break

if act_algo == "minmax":
if act_sym == "sym":
activation = act_observer_minmax_sym
else:
activation = act_observer_minmax_asym
else:
if act_sym == "sym":
activation = act_observer_kl_sym
else:
activation = act_observer_kl_asym

qconfig[""] = QConfig(activation=activation, weight=weight_observer_minmax_sym)

for (op_name, op_type), cfg in user_cfg.items():
if cfg["weight"]["dtype"] == "fp32":
qconfig[op_name] = None
return qconfig
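
# Hedged usage sketch: mirrors how the qconfig returned above is consumed on XPU in
# static_quant.py via torch's JIT quantization path. The helper name is made up; the caller
# is assumed to provide a model, example inputs already on "xpu", and a `tune_cfg` dict with
# an "op" mapping such as StaticQuantConfig produces.
def prepare_on_xpu(model, example_inputs, tune_cfg):
    from torch.quantization.quantize_jit import prepare_jit

    model = model.to("xpu").eval()
    with torch.no_grad():
        model_jit = torch.jit.trace(model, example_inputs)
    return prepare_jit(model_jit, generate_xpu_qconfig(tune_cfg), inplace=True)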


def generate_activation_observer(
scheme, algorithm, smooth_quant=False, smooth_quant_enable=False, alpha=0.5
): # pragma: no cover
26 changes: 23 additions & 3 deletions neural_compressor/torch/quantization/config.py
@@ -1097,6 +1097,7 @@ def __init__(
act_algo: str = "minmax",
excluded_precisions: list = [],
white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
model_info: Optional[List[Tuple[str, Callable]]] = None,
):
"""Init Static Quant Configs."""
super().__init__(white_list=white_list)
@@ -1109,6 +1110,7 @@ def __init__(
self.act_granularity = act_granularity
self.act_algo = act_algo
self.excluded_precisions = excluded_precisions
self.model_info = model_info
self._post_init()

@classmethod
@@ -1126,10 +1128,28 @@ def get_model_info_for_ipex(model: torch.nn.Module, example_inputs) -> List[Tupl
_, _, _, _, model_info = get_quantizable_ops_recursively(model, example_inputs=example_inputs)
return model_info

@staticmethod
def get_model_info(model: torch.nn.Module, example_inputs=None) -> List[Tuple[str, Callable]]:
def get_model_info_for_ipex_xpu(self, model: torch.nn.Module) -> List[Tuple[str, Callable]]: # pragma: no cover
if self.model_info:
return self.model_info
else:
white_list = torch.quantization.quantization_mappings.get_default_qconfig_propagation_list()
filter_result = []
for op_name, module in model.named_modules():
if type(module) in white_list:
pair = (op_name, type(module).__name__)
filter_result.append(pair)
logger.debug(f"Get model info: {filter_result}")
self.model_info = filter_result
return filter_result

def get_model_info(self, model: torch.nn.Module, example_inputs=None) -> List[Tuple[str, Callable]]:
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

if is_ipex_imported():
return StaticQuantConfig.get_model_info_for_ipex(model, example_inputs)
if auto_detect_accelerator().current_device() == "cpu":
return StaticQuantConfig.get_model_info_for_ipex(model, example_inputs)
else:
return StaticQuantConfig.get_model_info_for_ipex_xpu(self, model)

def to_config_mapping(
self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None
1 change: 0 additions & 1 deletion neural_compressor/torch/utils/__init__.py
@@ -15,4 +15,3 @@
from .environ import *
from .constants import *
from .utility import *
from neural_compressor.torch.algorithms.layer_wise import load_empty_model
62 changes: 62 additions & 0 deletions neural_compressor/torch/utils/utility.py
@@ -278,3 +278,65 @@ def get_processor_type_from_user_config(user_processor_type: Optional[Union[str,
else:
raise NotImplementedError(f"Unsupported processor type: {user_processor_type}")
return processor_type


def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):
"""Download hugging face model from hf hub."""
import os

from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE
from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name
from huggingface_hub.utils import EntryNotFoundError

if cache_dir is None:
cache_dir = HUGGINGFACE_HUB_CACHE
if revision is None:
revision = DEFAULT_REVISION
if repo_type is None:
repo_type = "model"
storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
commit_hash = None
if REGEX_COMMIT_HASH.match(revision):
commit_hash = revision
else:
ref_path = os.path.join(storage_folder, "refs", revision)
if os.path.exists(ref_path):
with open(ref_path) as f:
commit_hash = f.read()
if storage_folder and commit_hash:
pointer_path = os.path.join(storage_folder, "snapshots", commit_hash)
if os.path.isdir(pointer_path):
return pointer_path
else: # pragma: no cover
from huggingface_hub import snapshot_download

file_path = snapshot_download(repo_id)
return file_path


def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs):
"""Load an empty model."""
import os

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass

cls = AutoModelForCausalLM if cls is None else cls
is_local = os.path.isdir(pretrained_model_name_or_path)
if is_local: # pragma: no cover
path = pretrained_model_name_or_path
else:
path = dowload_hf_model(pretrained_model_name_or_path)
if cls.__base__ == _BaseAutoModelClass:
config = AutoConfig.from_pretrained(path, **kwargs)
with init_empty_weights():
model = cls.from_config(config)
else: # pragma: no cover
config = cls.config_class.from_pretrained(path, **kwargs)
with init_empty_weights():
model = cls(config)
model.tie_weights()
model.eval()
model.path = pretrained_model_name_or_path
return model
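
# Hedged usage sketch; the repo id below is only an example. load_empty_model instantiates
# the architecture with its parameters on the meta device, so no memory is allocated for
# weights; the layer-wise and client-side quantization flows build on this behavior.
if __name__ == "__main__":  # illustration only
    example_model = load_empty_model("facebook/opt-125m")
    assert next(example_model.parameters()).is_meta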
9 changes: 0 additions & 9 deletions requirements_ort.txt

This file was deleted.

2 changes: 1 addition & 1 deletion requirements_pt.txt
@@ -1,6 +1,6 @@
numba
numpy < 2.0
peft==0.10.0
peft
prettytable
psutil
py-cpuinfo
52 changes: 5 additions & 47 deletions setup.py
@@ -42,7 +42,7 @@ def get_build_version():
assert False, "Error: Could not open '%s' due %s\n" % (filepath, error)

PKG_INSTALL_CFG = {
# overall install config for build from source, python setup.py install
# overall installation config, pip install neural-compressor
"neural_compressor": {
"project_name": "neural_compressor",
"include_packages": find_packages(
@@ -53,33 +53,12 @@ def get_build_version():
),
"package_data": {"": ["*.yaml"]},
"install_requires": fetch_requirements("requirements.txt"),
},
# 2.x binary build config, pip install neural-compressor
"neural_compressor_2x": {
"project_name": "neural_compressor",
"include_packages": find_packages(
include=["neural_compressor", "neural_compressor.*"],
exclude=[
"neural_compressor.template",
"neural_compressor.common",
"neural_compressor.common.*",
"neural_compressor.torch",
"neural_compressor.torch.*",
"neural_compressor.tensorflow",
"neural_compressor.tensorflow.*",
"neural_compressor.onnxrt",
"neural_compressor.onnxrt.*",
],
),
"package_data": {"": ["*.yaml"]},
"install_requires": fetch_requirements("requirements.txt"),
"extras_require": {
"pt": [f"neural_compressor_3x_pt=={__version__}"],
"tf": [f"neural_compressor_3x_tf=={__version__}"],
"ort": [f"neural_compressor_3x_ort=={__version__}"],
"pt": fetch_requirements("requirements_pt.txt"),
"tf": fetch_requirements("requirements_tf.txt"),
},
},
# 3.x pt binary build config, pip install neural-compressor[pt], install 2.x API + 3.x PyTorch API.
# 3.x pt binary build config, pip install neural-compressor-pt, install 3.x PyTorch API.
"neural_compressor_3x_pt": {
"project_name": "neural_compressor_3x_pt",
"include_packages": find_packages(
@@ -92,7 +71,7 @@ def get_build_version():
),
"install_requires": fetch_requirements("requirements_pt.txt"),
},
# 3.x tf binary build config, pip install neural-compressor[tf], install 2.x API + 3.x TensorFlow API.
# 3.x tf binary build config, pip install neural-compressor-tf, install 3.x TensorFlow API.
"neural_compressor_3x_tf": {
"project_name": "neural_compressor_3x_tf",
"include_packages": find_packages(
@@ -106,19 +85,6 @@ def get_build_version():
"package_data": {"": ["*.yaml"]},
"install_requires": fetch_requirements("requirements_tf.txt"),
},
# 3.x ort binary build config, pip install neural-compressor[ort], install 2.x API + 3.x ONNXRT API.
"neural_compressor_3x_ort": {
"project_name": "neural_compressor_3x_ort",
"include_packages": find_packages(
include=[
"neural_compressor.common",
"neural_compressor.common.*",
"neural_compressor.onnxrt",
"neural_compressor.onnxrt.*",
],
),
"install_requires": fetch_requirements("requirements_ort.txt"),
},
}


@@ -131,10 +97,6 @@ def get_build_version():
ext_modules = []
cmdclass = {}

if "2x" in sys.argv:
sys.argv.remove("2x")
cfg_key = "neural_compressor_2x"

if "pt" in sys.argv:
sys.argv.remove("pt")
cfg_key = "neural_compressor_3x_pt"
@@ -143,10 +105,6 @@ def get_build_version():
sys.argv.remove("tf")
cfg_key = "neural_compressor_3x_tf"

if "ort" in sys.argv:
sys.argv.remove("ort")
cfg_key = "neural_compressor_3x_ort"

if bool(os.getenv("USE_FP8_CONVERT", False)):
from torch.utils.cpp_extension import BuildExtension, CppExtension

155 changes: 0 additions & 155 deletions test/3x/onnxrt/quantization/layer_wise/test_layer_wise.py

This file was deleted.

219 changes: 0 additions & 219 deletions test/3x/onnxrt/quantization/weight_only/test_awq.py

This file was deleted.

222 changes: 0 additions & 222 deletions test/3x/onnxrt/quantization/weight_only/test_gptq.py

This file was deleted.

193 changes: 0 additions & 193 deletions test/3x/onnxrt/quantization/weight_only/test_rtn.py

This file was deleted.

2 changes: 0 additions & 2 deletions test/3x/onnxrt/requirements.txt

This file was deleted.

304 changes: 0 additions & 304 deletions test/3x/onnxrt/test_autotune.py

This file was deleted.

251 changes: 0 additions & 251 deletions test/3x/onnxrt/test_config.py

This file was deleted.

127 changes: 0 additions & 127 deletions test/3x/onnxrt/test_smooth_quant.py

This file was deleted.

73 changes: 63 additions & 10 deletions test/3x/torch/quantization/test_static_quant.py
@@ -4,17 +4,24 @@
import pytest
import torch

try:
import intel_extension_for_pytorch as ipex

is_ipex_available = True
except: # pragma: no cover
is_ipex_available = False
assert False, "Please install IPEX for static quantization."

from neural_compressor.torch.quantization import (
StaticQuantConfig,
convert,
get_default_static_config,
prepare,
quantize,
)
from neural_compressor.torch.utils import is_ipex_available
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

if is_ipex_available():
import intel_extension_for_pytorch as ipex
device = auto_detect_accelerator().current_device()


def build_simple_torch_model():
@@ -53,7 +60,7 @@ def setup_class(self):
def teardown_class(self):
shutil.rmtree("saved_results", ignore_errors=True)

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_default(self):
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
@@ -70,7 +77,7 @@ def test_static_quant_default(self):
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_fallback(self):
fp32_model = copy.deepcopy(self.fp32_model)
quant_config = get_default_static_config()
@@ -100,7 +107,7 @@ def test_static_quant_fallback(self):
dtype = q_model.tune_cfg[" "]["q_op_infos"][op]["input_tensor_infos"][0]["force_dtype"]
assert dtype == "torch.float32", "Failed to fallback fc2 layer, please check!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
@pytest.mark.parametrize(
"act_sym, act_algo",
[
@@ -119,7 +126,7 @@ def test_static_quant_params(self, act_sym, act_algo):
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_accuracy(self):
class M(torch.nn.Module):
def __init__(self):
@@ -148,7 +155,7 @@ def run_fn(model):
# set a big atol to avoid random issue
assert torch.allclose(output1, output2, atol=2e-2), "Accuracy gap atol > 0.02 is unexpected. Please check."

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_save_load(self):
from intel_extension_for_pytorch.quantization import convert as ipex_convert
from intel_extension_for_pytorch.quantization import prepare as ipex_prepare
@@ -196,7 +203,7 @@ def run_fn(model):
loaded_model = load("saved_results")
assert isinstance(loaded_model, torch.jit.ScriptModule)

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_with_quantize_API(self):
# quantize API
fp32_model = copy.deepcopy(self.fp32_model)
@@ -205,7 +212,7 @@ def test_static_quant_with_quantize_API(self):
q_model = quantize(fp32_model, quant_config=quant_config, run_fn=run_fn, example_inputs=example_inputs)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available(), reason="Requires IPEX")
@pytest.mark.skipif(not is_ipex_available or device != "cpu", reason="Requires IPEX on CPU device")
def test_static_quant_mixed_precision(self):
fp32_model = copy.deepcopy(self.fp32_model)
example_inputs = self.input
@@ -227,3 +234,49 @@ def test_static_quant_mixed_precision(self):
run_fn(prepared_model)
q_model = convert(prepared_model)
assert q_model is not None, "Quantization failed!"

@pytest.mark.skipif(not is_ipex_available or device == "cpu", reason="Requires IPEX on XPU device")
@pytest.mark.parametrize(
"act_sym, act_algo",
[
(True, "kl"),
(True, "minmax"),
(False, "kl"),
(False, "minmax"),
],
)
def test_static_quant_xpu(self, act_sym, act_algo):
import torchvision.models as models

model = models.resnet50(pretrained=True)
fp32_model = copy.deepcopy(model)
data = torch.rand(1, 3, 224, 224)
example_inputs = data.to("xpu")

def run_fn(model):
model(example_inputs)

quant_config = StaticQuantConfig(act_sym=act_sym, act_algo=act_algo, excluded_precisions=["bf16"])
# fallback by op_name
quant_config.set_local("conv1", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
run_fn(q_model)
assert q_model is not None, "Quantization failed!"

quant_config = StaticQuantConfig(act_sym=act_sym, act_algo=act_algo, excluded_precisions=["bf16"])
# fallback by op_type
quant_config.set_local("Conv2d", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32"))
prepared_model = prepare(fp32_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(prepared_model)
q_model = convert(prepared_model)
run_fn(q_model)
assert q_model is not None, "Quantization failed!"

q_model.save("saved_results")
from neural_compressor.torch.quantization import load

# load
loaded_model = load("saved_results")
assert isinstance(loaded_model, torch.jit.ScriptModule), "Loading failed!"
1 change: 0 additions & 1 deletion test/3x/torch/requirements.txt
@@ -2,7 +2,6 @@ auto_round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff06
expecttest
intel_extension_for_pytorch
numpy
peft==0.10.0
prettytable
psutil
pytest